In [1]:
!pip install pandas
!pip install numpy

import pandas as pd
import numpy as np

[33mYou are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
# reading the Iris Dataset from UCI Machine Learning Repository
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
                names = ['sepal_width', 
                         'sepal_length', 
                         'petal_width', 
                         'petal_length', 
                         'flower_class'])

'''
data dictionary for replacing String flower_class names 
with numerical identifiers  - 0, 1, 2
'''
classes = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica' : 2
}

# replacing and updating the dataframe
df = df.replace({'flower_class' : classes})
df.head()

Unnamed: 0,sepal_width,sepal_length,petal_width,petal_length,flower_class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## pandas.dataframe.pipe() : Using functions to operate upon datasets

In [3]:
# say we wish to operate upon the first four columns of the dataframe
# slicing
new = df.iloc[:, 0:4]
new.head()

# example 1
# defining a function to take two arguments
# argument 1: column value
# argument 2: feed value
def add(element_1, element_2):
   return element_1 + element_2
  
# piping the arguments to call add function on each column value
# displaying top 5 results
# 2 has been added to every element in the dataframe 'new'
# original dataframe left unaffected
new.pipe(add, 2).head()

Unnamed: 0,sepal_width,sepal_length,petal_width,petal_length
0,7.1,5.5,3.4,2.2
1,6.9,5.0,3.4,2.2
2,6.7,5.2,3.3,2.2
3,6.6,5.1,3.5,2.2
4,7.0,5.6,3.4,2.2


In [4]:
# example 2
# multiplying every element by 10
def mult(element_1, element_2):
  return element_1 * element_2

new.pipe(mult, 10).head()

Unnamed: 0,sepal_width,sepal_length,petal_width,petal_length
0,51.0,35.0,14.0,2.0
1,49.0,30.0,14.0,2.0
2,47.0,32.0,13.0,2.0
3,46.0,31.0,15.0,2.0
4,50.0,36.0,14.0,2.0


## pandas.dataframe.apply()

In [5]:
# axis = 0 implies column wise
print("Minimum of values in each column:")
print(new.apply(np.min, axis = 0))
print("\nMaximum of values in each column:")
print(new.apply(np.max, axis = 0))
print("\nSum of values in each column:")
print(new.apply(np.sum, axis = 0))
print("\nMean of values in each column:")
print(new.apply(np.mean, axis = 0))

Minimum of values in each column:
sepal_width     4.3
sepal_length    2.0
petal_width     1.0
petal_length    0.1
dtype: float64

Maximum of values in each column:
sepal_width     7.9
sepal_length    4.4
petal_width     6.9
petal_length    2.5
dtype: float64

Sum of values in each column:
sepal_width     876.5
sepal_length    458.1
petal_width     563.8
petal_length    179.8
dtype: float64

Mean of values in each column:
sepal_width     5.843333
sepal_length    3.054000
petal_width     3.758667
petal_length    1.198667
dtype: float64


In [6]:
# axis = 0 implies row wise
# displaying only top 5 dataframe values
print("Minimum of values in each row:")
print(new.apply(np.min, axis = 1).head())
print("\nMaximum of values in each row:")
print(new.apply(np.max, axis = 1).head())
print("\nSum of values in each row:")
print(new.apply(np.sum, axis = 1).head())
print("\nMean of values in each row:")
print(new.apply(np.mean, axis = 1).head())

Minimum of values in each row:
0    0.2
1    0.2
2    0.2
3    0.2
4    0.2
dtype: float64

Maximum of values in each row:
0    5.1
1    4.9
2    4.7
3    4.6
4    5.0
dtype: float64

Sum of values in each row:
0    10.2
1     9.5
2     9.4
3     9.4
4    10.2
dtype: float64

Mean of values in each row:
0    2.550
1    2.375
2    2.350
3    2.350
4    2.550
dtype: float64


In [7]:
# finding minimum of 'sepal_width' and 'sepal_length' for all rows
# displaying top 5 rows
new[['sepal_width', 'sepal_length']].apply(np.min, axis = 1).head()

0    3.5
1    3.0
2    3.2
3    3.1
4    3.6
dtype: float64

In [8]:
# finding minimum of 'sepal_width' and 'sepal_length' column wise
new[['sepal_width', 'sepal_length']].apply(np.min, axis = 0)

sepal_width     4.3
sepal_length    2.0
dtype: float64

## pandas.dataframe[<column_name>].map()

In [9]:
# lambda function fed to operate on every value in the column
# each value multiplied with 100
new['sepal_width'].map(lambda x: x*100).head()

0    510.0
1    490.0
2    470.0
3    460.0
4    500.0
Name: sepal_width, dtype: float64

## pandas.dataframe.applymap() : Apply any function to all values in a dataframe

In [10]:
# defining lambda function 
half = lambda x: x/2

# applymap(): a combination of 'map' and 'apply'
# map takes every element and apply operates on entire dataframe
new.applymap(half).head()

Unnamed: 0,sepal_width,sepal_length,petal_width,petal_length
0,2.55,1.75,0.7,0.1
1,2.45,1.5,0.7,0.1
2,2.35,1.6,0.65,0.1
3,2.3,1.55,0.75,0.1
4,2.5,1.8,0.7,0.1


## pandas.dataframe.groupby() : for aggregation

In [11]:
# we will use our original dataframe, df
# grouped the entire dataframe by column, 'flower_class'
# flower_class == 0 data put into group 0
# flower_class == 1 data put into group 1
# flower_class == 2 data put into group 2
# min() for every column belonging to each group calculated
df.groupby('flower_class').min()

Unnamed: 0_level_0,sepal_width,sepal_length,petal_width,petal_length
flower_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,4.3,2.3,1.0,0.1
1,4.9,2.0,3.0,1.0
2,4.9,2.2,4.5,1.4


In [12]:
# we will use our original dataframe, df
# grouped the entire dataframe by column, 'flower_class'
# flower_class == 0 data put into group 0
# flower_class == 1 data put into group 1
# flower_class == 2 data put into group 2
# max() for every column belonging to each group calculated
df.groupby('flower_class').max()

Unnamed: 0_level_0,sepal_width,sepal_length,petal_width,petal_length
flower_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5.8,4.4,1.9,0.6
1,7.0,3.4,5.1,1.8
2,7.9,3.8,6.9,2.5


In [13]:
# we will use our original dataframe, df
# grouped the entire dataframe by column, 'flower_class'
# flower_class == 0 data put into group 0
# flower_class == 1 data put into group 1
# flower_class == 2 data put into group 2
# sum() for every column belonging to each group calculated
df.groupby('flower_class').sum()

Unnamed: 0_level_0,sepal_width,sepal_length,petal_width,petal_length
flower_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,250.3,170.9,73.2,12.2
1,296.8,138.5,213.0,66.3
2,329.4,148.7,277.6,101.3


In [14]:
# we will use our original dataframe, df
# grouped the entire dataframe by column, 'flower_class'
# flower_class == 0 data put into group 0
# flower_class == 1 data put into group 1
# flower_class == 2 data put into group 2
# mean() for every column belonging to each group calculated
df.groupby('flower_class').mean()

Unnamed: 0_level_0,sepal_width,sepal_length,petal_width,petal_length
flower_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5.006,3.418,1.464,0.244
1,5.936,2.77,4.26,1.326
2,6.588,2.974,5.552,2.026


## pandas.dataframe.rename()

In [15]:
df = df.rename(index = str, columns = {
                                        'sepal_width': 's_w', 
                                        'sepal_length': 's_l', 
                                        'petal_length': 'p_l', 
                                        'petal_width': 'p_w', 
                                        'flower_class': 'f_c'
                                      })

df.head()

Unnamed: 0,s_w,s_l,p_w,p_l,f_c
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## pandas.dataframe.reindex()

In [16]:
# change index of dataframe 
# there might be loss of information
new.reindex(new.index*10, fill_value= 'missing')

Unnamed: 0,sepal_width,sepal_length,petal_width,petal_length
0,5.1,3.5,1.4,0.2
10,5.4,3.7,1.5,0.2
20,5.4,3.4,1.7,0.2
30,4.8,3.1,1.6,0.2
40,5,3.5,1.3,0.3
50,7,3.2,4.7,1.4
60,5,2,3.5,1
70,5.9,3.2,4.8,1.8
80,5.5,2.4,3.8,1.1
90,5.5,2.6,4.4,1.2
