# Numpy & Pandas Introduction

<font color='steelblue'>

<span style="font-family:Arial; font-size:1.6em;">
    <b>Pandas and Numpy Examples</b><br><br>
    A number of examples of using the NumPy and pandas libraries<br><br>
</span>
<span style="font-family:Arial; font-size:1.4em;">
    <b>The following examples are included in this notebook:</b>
    <ol>
        <li>Using Numpy Arrays</li>
        <li>Pandas Series Object</li>
        <li>Pandas Dataframe</li>
        <li>Writing Dataframe to .csv file</li>
        <li>Reading .csv file into dataframe</li>
        <li>Exploring data in dataframe</li>
        <li>Basic Statistics on dataframe</li>
        <li>Applying a function to a column in DataFrame</li>
    </ol>    
</span>

</font>

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Report the version of the interpreter that is running this notebook.
# `!python --version` is an IPython shell escape: it is not valid Python
# outside a notebook, and it reports whichever `python` comes first on PATH,
# which can differ from the kernel's own interpreter.
import platform

print(f"Python {platform.python_version()}")

# Numpy Examples

### Array of Rank 1

In [None]:
# Build a rank-1 (one-dimensional) array from a Python list
arr1 = np.array([1, 2, 3])

print(f"type: {type(arr1)}")
print("shape: {}".format(arr1.shape))

# Unpack the three elements straight into the format placeholders
print("Elements in array: {} {} {}".format(*arr1))

# Arrays are mutable: overwrite the element at index 0 in place
arr1[0] = 4
print(arr1)

### Array Rank 2

In [None]:
# Build a rank-2 array: each inner list becomes one row
arr2 = np.array([[1, 2, 3], [4, 5, 6]])

# The shape is (rows, columns)
print("shape: {}".format(arr2.shape))

# Individual elements are addressed as [row, column]
print("specific elements: {} {} {}".format(*arr2[0, :2], arr2[1, 0]))

# Iterating over a rank-2 array yields its rows
print("Row 0: {} Row 1: {}".format(*arr2))

# Leave the array as the last expression so the notebook displays it
arr2

### Create Numpy Arrays

In [None]:
# Create a 2x2 array filled with zeros
a = np.zeros((2, 2))
print(f"Array with zeros: \n{a}\n")

# Create a 1x2 array filled with ones
b = np.ones((1, 2))
print(f"Array with ones: \n{b}\n")

# Create a 2x2 array in which every element is the constant 7
c = np.full((2, 2), 7)
print(f"Array with constant values: \n{c}\n")

# Create a 2x2 array filled with random values
d = np.random.random((2, 2))
print(f"Array with random number: \n{d}\n")

# randint(high, size=n) draws n integers from 0 up to, but not including, high
e = np.random.randint(2, size=5)
print(f"Random ints 0 and 1: {e}")

# With high=5 the values range over 0..4, so the label says so
f = np.random.randint(5, size=10)
print(f"Random ints 0 to 4: {f}")

### Numpy Array Indexing

In [None]:
# Build a 3x4 array, one inner list per row
a = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
])
print("shape of a: {}".format(a.shape))

# f-strings embed expressions directly inside the string
print(f"a: \n{a}\n")

# Slice out rows 0-1 and columns 1-2, giving the (2, 2) sub-array
# [[2 3]
#  [6 7]]
b = a[0:2, 1:3]
print(f"b: \n{b}\n")

# A slice is a view onto the same data rather than a copy,
# so a write through b is visible in a.
print(f"row 0 col 1 value: {a[0, 1]}")

# b[0, 0] and a[0, 1] refer to the same element
b[0, 0] = 77
print(f"for a row 0 col 1 value: {a[0, 1]}")

# Pandas Examples

## Series object

In [None]:
series_obj = pd.Series([10,20,30,40,50])
series_obj

In [None]:
# index access
series_obj[0]

### Element-wise operations

In [None]:
series_ages = pd.Series([31,22,43,44,55])
series_ages

In [None]:
series_ages + series_ages

In [None]:
series_ages * 2

In [None]:
series_ages + 100

### Boolean selection

In [None]:
series_ages > 40

In [None]:
#boolean access
series_ages[series_ages > 40]

## DataFrame object

In [None]:
# Build a DataFrame from a dictionary: each key becomes a column name
# and each list holds that column's values, one entry per row.
data = {
    "Name": [
        "Tim Miller", "Ann Carter", "Ellen Lee", "Sam Carr",
        "Al Ball", "Carl Zee", "Sara Martin",
    ],
    "Gender": [
        "Male", "Female", "Female", "Male",
        "Male", "Male", "Female",
    ],
    "Age": [32, 44, 21, 19, 45, 27, 39],
}
df = pd.DataFrame(data)

# Leaving df as the last expression lets the notebook render it as an
# HTML table; print(df) would show plain text instead.
df

In [None]:
print(df)

### Dataframe operations

In [None]:
# show first 5 rows
df.head()

In [None]:
# show last 5 rows
df.tail()

In [None]:
# returns a column/Series object
df['Name']     # dictionary notation

In [None]:
df.Name     # attribute notation; Tab completion

In [None]:
# assignment by column (or add a column)
df["Birth Year"] = 1999
df

In [None]:
# assignment by column (or add a column)
# must match the length of the DataFrame
df["Married"] = ['Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No']     
df

## Selection and Filtering
### Column selection

In [None]:
x = np.arange(100).reshape(10,10)
print(type(x))

In [None]:
x

In [None]:
# Build a 10x10 DataFrame holding 0..99, with columns labelled 'a' through 'j'
data = pd.DataFrame(
    np.arange(100).reshape(10, 10),
    columns=list("abcdefghij"),
)
data

In [None]:
data['a']

In [None]:
# providing a list selects multiple columns
data[["a", "e", "j"]]

In [None]:
# define a particular order
data[["j", "e", "a"]]

### Row selection

In [None]:
# use slice syntax to select rows
data[:1]

In [None]:
data[5:9]

In [None]:
# boolean operation on column
data["j"] > 40

In [None]:
# boolean selection for all rows where column j > 40
data[data["j"] > 40]

### Row and Column selection with loc
<b>Allows you to select a subset of the rows and columns using the label/name of the row/column</b>

In [None]:
data

In [None]:
# loc selects by the label (name) of the row and column
# note: unlike ordinary Python slicing, the end label of a loc slice is
# included, so :5 covers the rows labelled 0 through 5
data.loc[:5, "b"]

In [None]:
# consecutive labels: both ends of a loc slice are included, so this gives
# the rows labelled 6 onward and the columns 'a' through 'e'
data.loc[6:, 'a':'e']

In [None]:
# not consecutive
data.loc[:, ['c', 'f', 'i']] 

### Row and Column selection with iloc
<b>Allows you to select a subset of the rows and columns using the integer/index position of the row/column</b>

In [None]:
# iloc selects by integer position; the end of an iloc slice is excluded,
# so this gives the rows at positions 0-4 and the columns at positions 2-4
data.iloc[:5, 2:5]

In [None]:
# gives you a row, assumes all of the columns
data.iloc[4]

In [None]:
# returns selections in the order listed
# rows, 5, 0 and 3 and columns 9, 5, 0
data.iloc[[5, 0, 3], [9, 5, 0]]  

## Write out the dataframe to .csv file

In [None]:
# write the people DataFrame (df) out to a CSV file; the relative path
# is resolved against the current working directory
df.to_csv("new_filename.csv")

## Dataframe from a file

In [None]:
# Read the iris CSV file into a pandas DataFrame.
# The file has no header row, so the column labels are supplied
# explicitly through `names`.
iris_data = pd.read_csv("../datasets/iris.csv", 
                        names = ["sepal_l", "sepal_w", "petal_l", \
                                 "petal_w", "class"])

In [None]:
# default is show first 5 rows
iris_data.head()

In [None]:
iris_data.shape

In [None]:
iris_data.tail(10)

In [None]:
iris_data['class'].unique()

In [None]:
# Get rows 0, 50, 100
iris_data.iloc[[0,50,100]]

In [None]:
# Keep the rows whose sepal length is above 7 or below 5,
# and show only the sepal_l, petal_l and class columns
iris_data.loc[
    (iris_data["sepal_l"] > 7) | (iris_data["sepal_l"] < 5),
    ["sepal_l", "petal_l", "class"],
]

In [None]:
# Get count of unique values in column
iris_data['class'].value_counts()

In [None]:
# Get count for each column
iris_data.count()

# Basic Statistics

In [None]:
# get dataframe statistics
iris_data.describe()

In [None]:
iris_data.describe().transpose()

## Correlation Coefficients
<span style="font-family:times, serif; font-size:14pt; font-weight:bold">
<ul>
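<li>0:  the two variables have no linear correlation</li>
<li>-1: the two variables have a perfect negative correlation (one rises as the other falls)</li>
<li>1:  the two variables have a perfect positive correlation (they rise and fall together)</li>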
</ul>
</span>

In [None]:
# Pairwise correlation between the numeric measurement columns.
# The text column 'class' is excluded first: recent pandas versions no longer
# drop non-numeric columns silently in corr() and raise on strings instead.
iris_data.select_dtypes(include="number").corr()

## Convert the flower types to numbers

In [None]:
# Give each flower class name an integer code, numbered in listing order
mapping = {
    species: code
    for code, species in enumerate(
        ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
    )
}
mapping

In [None]:
iris_data['TypesNum'] = iris_data['class'].map(mapping)

In [None]:
iris_data.head()

In [None]:
iris_data['TypesNum'].unique()

In [None]:
iris_data['TypesNum'].value_counts()