# Data Wrangling - Pandas

## DataFrame Creation

In [None]:
pd.__version__

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display  #To display multiple outputs from a cell
pd.set_option('display.max_rows', 20) #Limit max number of rows to display
pd.set_option('expand_frame_repr', False) #To display all columns in a single horizontal view without line wraping

In [2]:
# From dict of lists
d = {'a': [1,2,3,4], 'b': [10,11,12,13]}
df = pd.DataFrame(d)
display(df)
display(df.index)
display(df.columns)
# Since index is automatically created by pandas, we can specify our own indexes
df = pd.DataFrame(d, index = ['zero', 'one', 'two', 'three'])
display(df)
# However, column names can only be rearranged, as it was specified during dataframe creation time
df = pd.DataFrame(d, index = ['zero', 'one', 'two', 'three'], columns = ['b', 'a'])
display(df)

Unnamed: 0,a,b
0,1,10
1,2,11
2,3,12
3,4,13


RangeIndex(start=0, stop=4, step=1)

Index(['a', 'b'], dtype='object')

Unnamed: 0,a,b
zero,1,10
one,2,11
two,3,12
three,4,13


Unnamed: 0,b,a
zero,10,1
one,11,2
two,12,3
three,13,4


In [3]:
# From list of tuples
d = [(1,2,3,4), (10,11,12,13)]
df = pd.DataFrame(d)
display(df)
display(df.index)
display(df.columns)
# Since index and column names are automatically created by pandas, we can specify our own indexes and column names
df = pd.DataFrame(d, index = ['zero', 'one'], columns = ['a', 'b','c','d'])
display(df)

Unnamed: 0,0,1,2,3
0,1,2,3,4
1,10,11,12,13


RangeIndex(start=0, stop=2, step=1)

RangeIndex(start=0, stop=4, step=1)

Unnamed: 0,a,b,c,d
zero,1,2,3,4
one,10,11,12,13


In [4]:
# From list of dicts - values are filled row-wise
d = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
df = pd.DataFrame(d)
display(df)
display(df.index)
display(df.columns)
# Since index is automatically created by pandas, we can specify our own indexes
df = pd.DataFrame(d, index = ['zero', 'one'])
display(df)
# However, column names can only be rearranged, as it was specified during dataframe creation time
df = pd.DataFrame(d, index = ['zero', 'one'], columns = ['c', 'b','a'])
display(df)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


RangeIndex(start=0, stop=2, step=1)

Index(['a', 'b', 'c'], dtype='object')

Unnamed: 0,a,b,c
zero,1,2,
one,5,10,20.0


Unnamed: 0,c,b,a
zero,,2,1
one,20.0,10,5


## Merging DataFrames

In [7]:
# By default merge does an inner join
# One-to-One

d1 = {'key': ['a','b', 'c','d'], 'data': [1,2,3,4]}
d2 = {'key': ['a','b', 'c','d', 'e'], 'data': [11,12,13,14,15]}
df1 = pd.DataFrame(d1)
df2 = pd.DataFrame(d2)
print('df1')
display(df1)
print('df2')
display(df2)

print("Inner Join")
display(pd.merge(df1,df2, on='key'))

print("Left Join") 
display(pd.merge(df1,df2, on='key', how='left'))

print("Right Join") 
display(pd.merge(df1,df2, on='key', how='right'))

print("Outer Join") 
display(pd.merge(df1,df2, on='key', how='outer'))



df1


Unnamed: 0,key,data
0,a,1
1,b,2
2,c,3
3,d,4


df2


Unnamed: 0,key,data
0,a,11
1,b,12
2,c,13
3,d,14
4,e,15


Inner Join


Unnamed: 0,key,data_x,data_y
0,a,1,11
1,b,2,12
2,c,3,13
3,d,4,14


Left Join


Unnamed: 0,key,data_x,data_y
0,a,1,11
1,b,2,12
2,c,3,13
3,d,4,14


Right Join


Unnamed: 0,key,data_x,data_y
0,a,1.0,11
1,b,2.0,12
2,c,3.0,13
3,d,4.0,14
4,e,,15


Outer Join


Unnamed: 0,key,data_x,data_y
0,a,1.0,11
1,b,2.0,12
2,c,3.0,13
3,d,4.0,14
4,e,,15


In [8]:
# Many-to-One

d1 = {'key': ['a','b', 'c','d', 'a'], 'data': [1,2,3,4,7]}
d2 = {'key': ['a','b', 'c','d', 'e'], 'data': [11,12,13,14,15]}
df1 = pd.DataFrame(d1)
df2 = pd.DataFrame(d2)
display(df1)
display(df2)

print("Inner Join")
display(pd.merge(df1,df2, on='key'))

print("Left Join")
display(pd.merge(df1,df2, on='key', how='left'))

print("Right Join")
display(pd.merge(df1,df2, on='key', how='right'))

print("Outer Join")
display(pd.merge(df1,df2, on='key', how='outer'))


Unnamed: 0,key,data
0,a,1
1,b,2
2,c,3
3,d,4
4,a,7


Unnamed: 0,key,data
0,a,11
1,b,12
2,c,13
3,d,14
4,e,15


Inner Join


Unnamed: 0,key,data_x,data_y
0,a,1,11
1,a,7,11
2,b,2,12
3,c,3,13
4,d,4,14


Left Join


Unnamed: 0,key,data_x,data_y
0,a,1,11
1,b,2,12
2,c,3,13
3,d,4,14
4,a,7,11


Right Join


Unnamed: 0,key,data_x,data_y
0,a,1.0,11
1,a,7.0,11
2,b,2.0,12
3,c,3.0,13
4,d,4.0,14
5,e,,15


Outer Join


Unnamed: 0,key,data_x,data_y
0,a,1.0,11
1,a,7.0,11
2,b,2.0,12
3,c,3.0,13
4,d,4.0,14
5,e,,15


In [9]:
# Many-to-Many

# Many-to-many joins form the Cartesian product of the rows

d1 = {'key': ['a','b', 'c','d', 'a', 'b'], 'data': [1,2,3,4,7,8]}
d2 = {'key': ['a','b', 'c','d', 'e', 'a', 'b'], 'data': [11,12,13,14,15,21,22]}
df1 = pd.DataFrame(d1)
df2 = pd.DataFrame(d2)
display(df1)
display(df2)

print("Inner Join")
display(pd.merge(df1,df2, on='key'))

print("Left Join")
display(pd.merge(df1,df2, on='key', how='left'))

print("Right Join")
display(pd.merge(df1,df2, on='key', how='right'))

print("Outer Join")
display(pd.merge(df1,df2, on='key', how='outer'))

Unnamed: 0,key,data
0,a,1
1,b,2
2,c,3
3,d,4
4,a,7
5,b,8


Unnamed: 0,key,data
0,a,11
1,b,12
2,c,13
3,d,14
4,e,15
5,a,21
6,b,22


Inner Join


Unnamed: 0,key,data_x,data_y
0,a,1,11
1,a,1,21
2,a,7,11
3,a,7,21
4,b,2,12
5,b,2,22
6,b,8,12
7,b,8,22
8,c,3,13
9,d,4,14


Left Join


Unnamed: 0,key,data_x,data_y
0,a,1,11
1,a,1,21
2,b,2,12
3,b,2,22
4,c,3,13
5,d,4,14
6,a,7,11
7,a,7,21
8,b,8,12
9,b,8,22


Right Join


Unnamed: 0,key,data_x,data_y
0,a,1.0,11
1,a,7.0,11
2,b,2.0,12
3,b,8.0,12
4,c,3.0,13
5,d,4.0,14
6,e,,15
7,a,1.0,21
8,a,7.0,21
9,b,2.0,22


Outer Join


Unnamed: 0,key,data_x,data_y
0,a,1.0,11
1,a,1.0,21
2,a,7.0,11
3,a,7.0,21
4,b,2.0,12
5,b,2.0,22
6,b,8.0,12
7,b,8.0,22
8,c,3.0,13
9,d,4.0,14


## Concatenating and Appending DataFrames

In [10]:
# Concat
d1 = {'a': [1,2,3,4], 'b': [11,12,13,14]}
d2 = {'a': [6,7,8,9], 'b': [16,17,18,19]}

df1 = pd.DataFrame(d1)
df2 = pd.DataFrame(d2)

display(df1)
display(df2)

print("Concat without ignoring index")
display(pd.concat([df1,df2]))

print("Concat with ignoring index")
display(pd.concat([df1,df2], ignore_index=True))

Unnamed: 0,a,b
0,1,11
1,2,12
2,3,13
3,4,14


Unnamed: 0,a,b
0,6,16
1,7,17
2,8,18
3,9,19


Concat without ignoring index


Unnamed: 0,a,b
0,1,11
1,2,12
2,3,13
3,4,14
0,6,16
1,7,17
2,8,18
3,9,19


Concat with ignoring index


Unnamed: 0,a,b
0,1,11
1,2,12
2,3,13
3,4,14
4,6,16
5,7,17
6,8,18
7,9,19


In [11]:
# Append

d1 = {'a': [1,2,3,4], 'b': [11,12,13,14]}
d2 = {'a': [6,7,8,9], 'b': [16,17,18,19]}

df1 = pd.DataFrame(d1)
df2 = pd.DataFrame(d2)

display(df1)
display(df2)

print("Append without ignoring index")
display(df1.append([df2]))

print("Append with ignoring index")
display(df1.append([df2], ignore_index=True))

Unnamed: 0,a,b
0,1,11
1,2,12
2,3,13
3,4,14


Unnamed: 0,a,b
0,6,16
1,7,17
2,8,18
3,9,19


Append without ignoring index


Unnamed: 0,a,b
0,1,11
1,2,12
2,3,13
3,4,14
0,6,16
1,7,17
2,8,18
3,9,19


Append with ignoring index


Unnamed: 0,a,b
0,1,11
1,2,12
2,3,13
3,4,14
4,6,16
5,7,17
6,8,18
7,9,19


## Patching - combine_first
“Patching” missing data in the calling object with data from the object you pass

In [13]:
df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
                 'b': [np.nan, 2., np.nan, 6.],
                 'c': range(2, 18, 4)})
display(df1)

df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],
                 'b': [np.nan, 3., 4., 6., 8.]})

display(df2)

print("Patched df1")
display(df1.combine_first(df2))

#if df2 has more rows than df1, they are appended 

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


Unnamed: 0,a,b
0,5.0,
1,4.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


Patched df1


Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


## Reading from a Text File

* read_csv     default delimiter = comma (,)
* read_table   default delimiter = tab ('\t')

In [14]:
#Read full/complete CSV file
df = pd.read_csv('./dataset/TITANIC_FORMATTED.csv')
df.head(10)

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
0,"Allen, Miss. Elisabeth Walton",29.0,Female,1st,211.34,Survived
1,"Allison, Master. Hudson Trevor",1.0,Male,1st,151.55,Survived
2,"Allison, Miss. Helen Loraine",2.0,Female,1st,151.55,Died
3,"Allison, Mr. Hudson Joshua Creighton",30.0,Male,1st,151.55,Died
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0,Female,1st,151.55,Died
5,"Anderson, Mr. Harry",48.0,Male,1st,26.55,Survived
6,"Andrews, Miss. Kornelia Theodosia",63.0,Female,1st,77.96,Survived
7,"Andrews, Mr. Thomas Jr",39.0,Male,1st,0.0,Died
8,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",53.0,Female,1st,51.48,Survived
9,"Artagaveytia, Mr. Ramon",71.0,Male,1st,49.5,Died


In [15]:
#Read specified number of rows from CSV file
df_temp = pd.read_csv('./dataset/TITANIC_FORMATTED.csv', nrows=2)
df_temp

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
0,"Allen, Miss. Elisabeth Walton",29,Female,1st,211.34,Survived
1,"Allison, Master. Hudson Trevor",1,Male,1st,151.55,Survived


In [16]:
#Read csv file using comma as separator
df = pd.read_csv('./dataset/TITANIC_FORMATTED.csv', sep=',')
df.head(10)

#sep='\t' -->tab
#sep='\n' --> newline
#sep='\s+' --> fields separated by a variable amount of whitespace

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
0,"Allen, Miss. Elisabeth Walton",29.0,Female,1st,211.34,Survived
1,"Allison, Master. Hudson Trevor",1.0,Male,1st,151.55,Survived
2,"Allison, Miss. Helen Loraine",2.0,Female,1st,151.55,Died
3,"Allison, Mr. Hudson Joshua Creighton",30.0,Male,1st,151.55,Died
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0,Female,1st,151.55,Died
5,"Anderson, Mr. Harry",48.0,Male,1st,26.55,Survived
6,"Andrews, Miss. Kornelia Theodosia",63.0,Female,1st,77.96,Survived
7,"Andrews, Mr. Thomas Jr",39.0,Male,1st,0.0,Died
8,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",53.0,Female,1st,51.48,Survived
9,"Artagaveytia, Mr. Ramon",71.0,Male,1st,49.5,Died


In [17]:
#Do not treat first row as column names - treat first row as a record
df = pd.read_csv('./dataset/TITANIC_FORMATTED.csv', header=None)
df.head(4)

Unnamed: 0,0,1,2,3,4,5
0,Name,Age,Gender,Class,Fare,Survival
1,"Allen, Miss. Elisabeth Walton",29,Female,1st,211.34,Survived
2,"Allison, Master. Hudson Trevor",1,Male,1st,151.55,Survived
3,"Allison, Miss. Helen Loraine",2,Female,1st,151.55,Died


In [18]:
#Do not treat first row as column names and skip the first row
df = pd.read_csv('./dataset/TITANIC_FORMATTED.csv', header=None, skiprows=[0])
df.head(4)

Unnamed: 0,0,1,2,3,4,5
0,"Allen, Miss. Elisabeth Walton",29.0,Female,1st,211.34,Survived
1,"Allison, Master. Hudson Trevor",1.0,Male,1st,151.55,Survived
2,"Allison, Miss. Helen Loraine",2.0,Female,1st,151.55,Died
3,"Allison, Mr. Hudson Joshua Creighton",30.0,Male,1st,151.55,Died


In [19]:
#Do not treat first row as column names, skip the first row and put my custom field names
colnames =['Identity', 'Length of Life', 'Sex', 'Class', 'Price of Ticket', 'Outcome']
df = pd.read_csv('./dataset/TITANIC_FORMATTED.csv', names = colnames, header=None, skiprows=[0])
df.head(4)

Unnamed: 0,Identity,Length of Life,Sex,Class,Price of Ticket,Outcome
0,"Allen, Miss. Elisabeth Walton",29.0,Female,1st,211.34,Survived
1,"Allison, Master. Hudson Trevor",1.0,Male,1st,151.55,Survived
2,"Allison, Miss. Helen Loraine",2.0,Female,1st,151.55,Died
3,"Allison, Mr. Hudson Joshua Creighton",30.0,Male,1st,151.55,Died


In [20]:
# Rename columns
df = pd.read_csv('./dataset/TITANIC_FORMATTED.csv')
df.rename(columns={'Gender': 'Sex', 'Fare': 'Price'})


Unnamed: 0,Name,Age,Sex,Class,Price,Survival
0,"Allen, Miss. Elisabeth Walton",29.0,Female,1st,211.34,Survived
1,"Allison, Master. Hudson Trevor",1.0,Male,1st,151.55,Survived
2,"Allison, Miss. Helen Loraine",2.0,Female,1st,151.55,Died
3,"Allison, Mr. Hudson Joshua Creighton",30.0,Male,1st,151.55,Died
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0,Female,1st,151.55,Died
...,...,...,...,...,...,...
1304,"Zabour, Miss. Hileni",15.0,Female,3rd,14.45,Died
1305,"Zabour, Miss. Thamine",,Female,3rd,14.45,Died
1306,"Zakarian, Mr. Mapriededer",27.0,Male,3rd,7.23,Died
1307,"Zakarian, Mr. Ortin",27.0,Male,3rd,7.23,Died


In [21]:
#Treat one of the columns as index
df = pd.read_csv('./dataset/TITANIC_FORMATTED.csv', index_col='Name')
df.head(4)

Unnamed: 0_level_0,Age,Gender,Class,Fare,Survival
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Allen, Miss. Elisabeth Walton",29.0,Female,1st,211.34,Survived
"Allison, Master. Hudson Trevor",1.0,Male,1st,151.55,Survived
"Allison, Miss. Helen Loraine",2.0,Female,1st,151.55,Died
"Allison, Mr. Hudson Joshua Creighton",30.0,Male,1st,151.55,Died


In [22]:
#Return data contained in the dataframe - indexes are not part of dataframe
df.values 

#Dataframe after treating one of the columns as index


array([[29.0, 'Female', '1st', 211.34, 'Survived'],
       [1.0, 'Male', '1st', 151.55, 'Survived'],
       [2.0, 'Female', '1st', 151.55, 'Died'],
       ...,
       [27.0, 'Male', '3rd', 7.23, 'Died'],
       [27.0, 'Male', '3rd', 7.23, 'Died'],
       [29.0, 'Male', '3rd', 7.88, 'Died']], dtype=object)

## Convert columns to desired datatype (overriding default pandas datatype identification)


Pandas Datatypes
* int64
* float64
* Object (aka string)
* datetime64, timedelta[ns] (used in time series)

In [23]:
df = pd.read_csv('./dataset/TITANIC_FORMATTED.csv')
display(df.dtypes)

Name         object
Age         float64
Gender       object
Class        object
Fare        float64
Survival     object
dtype: object

In [24]:
# Converting datatype of a column containing missing values to int or float will yield 'Value Error!'
# df['Age'] = df['Age'].astype('int64')   # Uncomment to check Value Error!

In [27]:
df['Class'] = df['Class'].astype('category')
df['Class'].dtype

# In the examples above where we passed dtype='category', we used the default behavior:
# Categories are inferred from the data
# Categories are unordered

CategoricalDtype(categories=['1st', '2nd', '3rd'], ordered=False)

In [30]:
# To control those behaviors, instead of passing 'category', use an instance of CategoricalDtype

from pandas.api.types import CategoricalDtype
cat_type = CategoricalDtype(categories=['1st', '2nd', '3rd'], ordered=True)
df['Class'] = df['Class'].astype(cat_type)
df['Class'].dtype

CategoricalDtype(categories=['1st', '2nd', '3rd'], ordered=True)

In [31]:
df['Class'].cat.categories

Index(['1st', '2nd', '3rd'], dtype='object')

In [32]:
df['Class'].cat.ordered

True

In [33]:
df['Class'].cat.codes

0       0
1       0
2       0
3       0
4       0
       ..
1304    2
1305    2
1306    2
1307    2
1308    2
Length: 1309, dtype: int8

In [34]:
dict(enumerate(df['Class'].cat.categories))

{0: '1st', 1: '2nd', 2: '3rd'}

In [35]:
df['Class'] = df['Class'].cat.codes  #Note that after assigning codes to categorical variable it has converted to int datatype!
df.dtypes

Name         object
Age         float64
Gender       object
Class          int8
Fare        float64
Survival     object
dtype: object

In [36]:
# Since it has converted to int datatype, int operations are possible now
display(df['Class'].min())
display(df['Class'].max())
display(df['Class'].sort_values())


0

2

0       0
220     0
219     0
218     0
217     0
       ..
838     2
839     2
840     2
864     2
1308    2
Name: Class, Length: 1309, dtype: int8

## Random Sampling

In [37]:
# Generate random samples of dataframe - Without replacement
df = pd.read_csv('./dataset/TITANIC_FORMATTED.csv')
display(df.sample(10))          # no. of samples to return 
display(df.sample(frac=0.1))    # % of dataframe to return

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
325,"Aldworth, Mr. Charles Augustus",30.0,Male,2nd,13.0,Died
250,"Ryerson, Miss. Emily Borie",18.0,Female,1st,262.38,Survived
322,"Young, Miss. Marie Grice",36.0,Female,1st,135.63,Survived
243,"Rosenshine, Mr. George (""Mr George Thorne"")",46.0,Male,1st,79.2,Died
894,"Johnson, Master. Harold Theodor",4.0,Male,3rd,11.13,Survived
86,"Daly, Mr. Peter Denis",51.0,Male,1st,26.55,Survived
687,"Braf, Miss. Elin Ester Maria",20.0,Female,3rd,7.85,Died
1007,"McGowan, Miss. Anna ""Annie""",15.0,Female,3rd,8.03,Survived
386,"Davies, Mr. Charles Henry",18.0,Male,2nd,73.5,Died
20,"Beckwith, Mr. Richard Leonard",37.0,Male,1st,52.55,Survived


Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
322,"Young, Miss. Marie Grice",36.0,Female,1st,135.63,Survived
919,"Katavelas, Mr. Vassilios (""Catavelas Vassilios"")",19.0,Male,3rd,7.23,Died
230,"Perreault, Miss. Anne",30.0,Female,1st,93.50,Survived
506,"Mitchell, Mr. Henry Michael",70.0,Male,2nd,10.50,Died
1176,"Sage, Mr. Douglas Bullen",,Male,3rd,69.55,Died
...,...,...,...,...,...,...
898,"Johnson, Mr. William Cahoone Jr",19.0,Male,3rd,0.00,Died
995,"Markoff, Mr. Marin",35.0,Male,3rd,7.90,Died
503,"Mellors, Mr. William John",19.0,Male,2nd,10.50,Survived
437,"Herman, Miss. Alice",24.0,Female,2nd,65.00,Survived


In [38]:
# Generate random samples of dataframe - With replacement

df.sample(10, replace=True)          # no. of samples to return
df.sample(frac=0.1, replace=True)    # % of dataframe to return

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
338,"Beauchamp, Mr. Henry James",28.0,Male,2nd,26.00,Died
557,"Shelley, Mrs. William (Imanita Parrish Hall)",25.0,Female,2nd,26.00,Survived
421,"Giles, Mr. Ralph",24.0,Male,2nd,13.50,Died
752,"Davies, Mr. Alfred J",24.0,Male,3rd,24.15,Died
819,"Glynn, Miss. Mary Agatha",,Female,3rd,7.75,Survived
...,...,...,...,...,...,...
59,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genev...",,Female,1st,27.72,Survived
1209,"Skoog, Miss. Margit Elizabeth",2.0,Female,3rd,27.90,Died
172,"Julian, Mr. Henry Forbes",50.0,Male,1st,26.00,Died
811,"Ford, Mrs. Edward (Margaret Ann Watson)",48.0,Female,3rd,34.38,Died


## Exploring Dataset

In [39]:
#List all column names
df = pd.read_csv('./dataset/TITANIC_FORMATTED.csv')
list(df)

['Name', 'Age', 'Gender', 'Class', 'Fare', 'Survival']

In [40]:
#Row count
df.index

RangeIndex(start=0, stop=1309, step=1)

In [41]:
#Datatype of columns
df.dtypes

Name         object
Age         float64
Gender       object
Class        object
Fare        float64
Survival     object
dtype: object

In [42]:
#Number of elements in dataframe
df.size    

#1309 * 6 = 7854

7854

In [43]:
#Number of rows and columns in the dataframe
df.shape

(1309, 6)

## Common operations on Dataframe

In [44]:
# Retrieve a column
df = pd.read_csv('./dataset/TITANIC_FORMATTED.csv')
df['Name'] # or df.Name

0                         Allen, Miss. Elisabeth Walton
1                        Allison, Master. Hudson Trevor
2                          Allison, Miss. Helen Loraine
3                  Allison, Mr. Hudson Joshua Creighton
4       Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
                             ...                       
1304                               Zabour, Miss. Hileni
1305                              Zabour, Miss. Thamine
1306                          Zakarian, Mr. Mapriededer
1307                                Zakarian, Mr. Ortin
1308                                 Zimmerman, Mr. Leo
Name: Name, Length: 1309, dtype: object

In [46]:
# Retrieve a row
df.iloc[0]

Name        Allen, Miss. Elisabeth Walton
Age                                  29.0
Gender                             Female
Class                                 1st
Fare                               211.34
Survival                         Survived
Name: 0, dtype: object

In [51]:
# Retrieve multiple rows
df.iloc[10:20]

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival,Insurance_Amt
10,"Astor, Col. John Jacob",47.0,Male,1st,227.53,Died,10000
11,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",18.0,Female,1st,227.53,Survived,10000
12,"Aubart, Mme. Leontine Pauline",24.0,Female,1st,69.3,Survived,10000
13,"Barber, Miss. Ellen ""Nellie""",26.0,Female,1st,78.85,Survived,10000
14,"Barkworth, Mr. Algernon Henry Wilson",80.0,Male,1st,30.0,Survived,10000
15,"Baumann, Mr. John D",,Male,1st,25.93,Died,10000
16,"Baxter, Mr. Quigg Edmond",24.0,Male,1st,247.52,Died,10000
17,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",50.0,Female,1st,247.52,Survived,10000
18,"Bazzani, Miss. Albina",32.0,Female,1st,76.29,Survived,10000
19,"Beattie, Mr. Thomson",36.0,Male,1st,75.24,Died,10000


In [53]:
# Retrive first 3 columns along with multiple rows
df.iloc[10:20, :3]

Unnamed: 0,Name,Age,Gender
10,"Astor, Col. John Jacob",47.0,Male
11,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",18.0,Female
12,"Aubart, Mme. Leontine Pauline",24.0,Female
13,"Barber, Miss. Ellen ""Nellie""",26.0,Female
14,"Barkworth, Mr. Algernon Henry Wilson",80.0,Male
15,"Baumann, Mr. John D",,Male
16,"Baxter, Mr. Quigg Edmond",24.0,Male
17,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",50.0,Female
18,"Bazzani, Miss. Albina",32.0,Female
19,"Beattie, Mr. Thomson",36.0,Male


In [47]:
# Retrieve a value corresponding to a particular row and column
df.iloc[0]['Age'] 

29.0

In [48]:
#Update/Modify/Append a column
#Columns can be modified by assignment
#Assigning a column that doesn’t exist will create a new column
#When assigning lists or arrays to a column, the value’s length must match the length of the DataFrame's column.
df['Insurance_Amt'] = 10000
df.head()

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival,Insurance_Amt
0,"Allen, Miss. Elisabeth Walton",29.0,Female,1st,211.34,Survived,10000
1,"Allison, Master. Hudson Trevor",1.0,Male,1st,151.55,Survived,10000
2,"Allison, Miss. Helen Loraine",2.0,Female,1st,151.55,Died,10000
3,"Allison, Mr. Hudson Joshua Creighton",30.0,Male,1st,151.55,Died,10000
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0,Female,1st,151.55,Died,10000


In [49]:
df.dtypes

Name              object
Age              float64
Gender            object
Class             object
Fare             float64
Survival          object
Insurance_Amt      int64
dtype: object

In [41]:
#Update/Modify a row
#Pandas updates using indexes and indexes are read-only hence to avoid getting weird results append the desired row 
#and delete the unwanted row
# df.ix[0] = ['xxx', 31.00, 'Male', '1st', 'Female', 150.00, 'Died', 10000]  -> can give weird results


In [54]:
#Append a row
df2 = {'Name': 'Zzzz Guy, Mr. Last', 'Age': 52, 'Gender': 'Male', 'Class': '1st', 'Fare': '511.34', 'Survival': 'Survived', 'Insurance_Amt': 7000}
df = df.append(df2, ignore_index=True)
df.tail()

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival,Insurance_Amt
1306,"Zakarian, Mr. Mapriededer",27.0,Male,3rd,7.23,Died,10000
1307,"Zakarian, Mr. Ortin",27.0,Male,3rd,7.23,Died,10000
1308,"Zimmerman, Mr. Leo",29.0,Male,3rd,7.88,Died,10000
1309,"Zzzz Guy, Mr. Last",52.0,Male,1st,511.34,Survived,7000
1310,"Zzzz Guy, Mr. Last",52.0,Male,1st,511.34,Survived,7000


In [55]:
#Drop a column
df = df.drop('Insurance_Amt', axis=1)
df.head()

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
0,"Allen, Miss. Elisabeth Walton",29.0,Female,1st,211.34,Survived
1,"Allison, Master. Hudson Trevor",1.0,Male,1st,151.55,Survived
2,"Allison, Miss. Helen Loraine",2.0,Female,1st,151.55,Died
3,"Allison, Mr. Hudson Joshua Creighton",30.0,Male,1st,151.55,Died
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0,Female,1st,151.55,Died


In [56]:
#Drop a row
df = df.drop([1309])
df.tail()

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
1305,"Zabour, Miss. Thamine",,Female,3rd,14.45,Died
1306,"Zakarian, Mr. Mapriededer",27.0,Male,3rd,7.23,Died
1307,"Zakarian, Mr. Ortin",27.0,Male,3rd,7.23,Died
1308,"Zimmerman, Mr. Leo",29.0,Male,3rd,7.88,Died
1310,"Zzzz Guy, Mr. Last",52.0,Male,1st,511.34,Survived


In [57]:
#Index objects are immutable and cannot be modified by user hence error!
#Index objects hold axis labels and other metadata
#index[1307] = 'xxxx'                        #uncomment this line to see the error

In [58]:
#Drop rows containing a particular value
df = df[df.Name != 'Zimmerman, Mr. Leo'] # Create a new dataframe by dropping the particular value and assigning to the original dataframe 
df.tail()

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
1304,"Zabour, Miss. Hileni",15.0,Female,3rd,14.45,Died
1305,"Zabour, Miss. Thamine",,Female,3rd,14.45,Died
1306,"Zakarian, Mr. Mapriededer",27.0,Male,3rd,7.23,Died
1307,"Zakarian, Mr. Ortin",27.0,Male,3rd,7.23,Died
1310,"Zzzz Guy, Mr. Last",52.0,Male,1st,511.34,Survived


In [59]:
#Select rows by index
df[0:14]

#Slicing with index labels includes the endpoints
#Slicing with index values excludes the last endpoint

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
0,"Allen, Miss. Elisabeth Walton",29.0,Female,1st,211.34,Survived
1,"Allison, Master. Hudson Trevor",1.0,Male,1st,151.55,Survived
2,"Allison, Miss. Helen Loraine",2.0,Female,1st,151.55,Died
3,"Allison, Mr. Hudson Joshua Creighton",30.0,Male,1st,151.55,Died
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0,Female,1st,151.55,Died
5,"Anderson, Mr. Harry",48.0,Male,1st,26.55,Survived
6,"Andrews, Miss. Kornelia Theodosia",63.0,Female,1st,77.96,Survived
7,"Andrews, Mr. Thomas Jr",39.0,Male,1st,0.0,Died
8,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",53.0,Female,1st,51.48,Survived
9,"Artagaveytia, Mr. Ramon",71.0,Male,1st,49.5,Died


In [60]:
#Select columns by index
df.Age[0:14]

0     29.0
1      1.0
2      2.0
3     30.0
4     25.0
5     48.0
6     63.0
7     39.0
8     53.0
9     71.0
10    47.0
11    18.0
12    24.0
13    26.0
Name: Age, dtype: float64

In [61]:
#Select rows meeting a particular cell condition 
df[df.Age > 70]

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
9,"Artagaveytia, Mr. Ramon",71.0,Male,1st,49.5,Died
14,"Barkworth, Mr. Algernon Henry Wilson",80.0,Male,1st,30.0,Survived
61,"Cavendish, Mrs. Tyrell William (Julia Florence...",76.0,Female,1st,78.85,Survived
135,"Goldschmidt, Mr. George B",71.0,Male,1st,34.65,Died
727,"Connors, Mr. Patrick",71.0,Male,3rd,7.75,Died
1235,"Svensson, Mr. Johan",74.0,Male,3rd,7.78,Died


In [62]:
#Sort the dataframe by one of the column values (ascending)
df.sort_values(by='Age')

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
1240,"Thomas, Master. Assad Alexander",0.0,Male,3rd,8.52,Survived
747,"Danbom, Master. Gilbert Sigvard Emanuel",0.0,Male,3rd,14.4,Died
763,"Dean, Miss. Elizabeth Gladys ""Millvina""",0.0,Female,3rd,20.58,Survived
492,"Mallet, Master. Andre",1.0,Male,2nd,37.0,Survived
1187,"Sandstrom, Miss. Beatrice Irene",1.0,Female,3rd,16.7,Survived
...,...,...,...,...,...,...
1293,"Williams, Mr. Howard Hugh ""Harry""",,Male,3rd,8.05,Died
1297,"Wiseman, Mr. Phillippe",,Male,3rd,7.25,Died
1302,"Yousif, Mr. Wazli",,Male,3rd,7.23,Died
1303,"Yousseff, Mr. Gerious",,Male,3rd,14.46,Died


In [63]:
# Groupby - For Categorical (can be many) and Continuous (usually one) variables
pd.set_option('display.max_rows', 50)
df['Age'].groupby(df['Gender'])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7ff6bd2b1340>

In [64]:
df['Age'].groupby(df['Gender']).describe()   # Runtime warning due to NaN values

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Female,388.0,28.693299,14.573391,0.0,19.0,27.0,38.0,76.0
Male,658.0,30.642857,14.308467,0.0,21.0,28.0,39.0,80.0


In [65]:
df['Age'].groupby([df['Gender'], df['Class']])

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7ff6bd2b1700>

In [66]:
df['Age'].groupby([df['Gender'], df['Class']]).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
Gender,Class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Female,1st,133.0,37.037594,14.27246,2.0,24.0,36.0,48.0,76.0
Female,2nd,103.0,27.504854,12.912127,1.0,20.0,28.0,34.0,60.0
Female,3rd,152.0,22.197368,12.200879,0.0,16.0,22.0,30.0,63.0
Male,1st,152.0,41.111842,14.55228,1.0,30.0,42.0,50.25,80.0
Male,2nd,158.0,30.829114,13.967293,1.0,23.0,29.5,38.75,70.0
Male,3rd,348.0,25.985632,11.723388,0.0,20.0,25.0,32.0,74.0


In [67]:
# Calculate group size
df['Age'].groupby([df['Gender'], df['Class']]).size()  

Gender  Class
Female  1st      144
        2nd      106
        3rd      216
Male    1st      180
        2nd      171
        3rd      492
Name: Age, dtype: int64

In [68]:
#Impute missing values using mean of grouped category
display(df['Age'].groupby([df['Gender'], df['Class']]).mean())

display(df['Age'].isnull().sum())

fill_mean = lambda g: g.fillna(g.mean())
df['Age'].groupby([df['Gender'], df['Class']]).apply(fill_mean)

display(df['Age'].isnull().sum())

Gender  Class
Female  1st      37.037594
        2nd      27.504854
        3rd      22.197368
Male    1st      41.111842
        2nd      30.829114
        3rd      25.985632
Name: Age, dtype: float64

263

263

In [69]:
# Crosstab - computing group frequencies between categorical variables
display(pd.crosstab(df['Gender'], df['Class']))

display(pd.crosstab(df['Gender'], df['Class'], margins=True))

#Note: There are no NaN values for Gender and Class columns

Class,1st,2nd,3rd
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,144,106,216
Male,180,171,492


Class,1st,2nd,3rd,All
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,144,106,216,466
Male,180,171,492,843
All,324,277,708,1309


## Missing Values
* Missing data is usually either not present (empty cell) or marked by some sentinel value.
* By default, pandas consider NA, -1.#IND, NULL and empty cell as missing values while reading files. 
* Missing values are denoted as NaN in pandas dataframe

In [70]:
#NA, NULL, empty cell and -1.#IND is considered as missing values while reading files 
df_miss = pd.read_csv('./dataset/temp.csv')
df_miss

Unnamed: 0,a,b,c,d
0,1,2,3.0,4
1,,,,
2,$$,xxx,0.0,no


In [71]:
#Check if there are missing values in the dataframe
pd.isnull(df_miss)

Unnamed: 0,a,b,c,d
0,False,False,False,False
1,True,True,True,True
2,False,False,False,False


In [72]:
#Pass a list of values that should be considered as missing (apart from the default ones) while reading the file
missing_values = ['xxx', 'no', 0, '$$']
df_miss = pd.read_csv('./dataset/temp.csv', na_values=missing_values)
df_miss

Unnamed: 0,a,b,c,d
0,1.0,2.0,3.0,4.0
1,,,,
2,,,,


In [73]:
# Count of Missing values per column
df.isnull().sum()

Name          0
Age         263
Gender        0
Class         0
Fare          1
Survival      0
dtype: int64

In [74]:
# Display all records containing missing data
df[df.isnull().any(axis=1)]

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
15,"Baumann, Mr. John D",,Male,1st,25.93,Died
37,"Bradley, Mr. George (""George Arthur Brayton"")",,Male,1st,26.55,Survived
40,"Brewe, Dr. Arthur Jackson",,Male,1st,39.6,Died
46,"Cairns, Mr. Alexander",,Male,1st,31.0,Died
59,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genev...",,Female,1st,27.72,Survived
...,...,...,...,...,...,...
1293,"Williams, Mr. Howard Hugh ""Harry""",,Male,3rd,8.05,Died
1297,"Wiseman, Mr. Phillippe",,Male,3rd,7.25,Died
1302,"Yousif, Mr. Wazli",,Male,3rd,7.23,Died
1303,"Yousseff, Mr. Gerious",,Male,3rd,14.46,Died


In [75]:
#Drop rows containing any missing values
df = df.dropna()
df

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
0,"Allen, Miss. Elisabeth Walton",29.0,Female,1st,211.34,Survived
1,"Allison, Master. Hudson Trevor",1.0,Male,1st,151.55,Survived
2,"Allison, Miss. Helen Loraine",2.0,Female,1st,151.55,Died
3,"Allison, Mr. Hudson Joshua Creighton",30.0,Male,1st,151.55,Died
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0,Female,1st,151.55,Died
...,...,...,...,...,...,...
1301,"Youseff, Mr. Gerious",46.0,Male,3rd,7.23,Died
1304,"Zabour, Miss. Hileni",15.0,Female,3rd,14.45,Died
1306,"Zakarian, Mr. Mapriededer",27.0,Male,3rd,7.23,Died
1307,"Zakarian, Mr. Ortin",27.0,Male,3rd,7.23,Died


In [76]:
#Drop rows containing all missing values
df_miss = df_miss.dropna(how='all')
df_miss

Unnamed: 0,a,b,c,d
0,1.0,2.0,3.0,4.0


In [77]:
#Drop rows that have atleast 2 NaN values
df_miss = pd.read_csv('./dataset/temp.csv')
df_miss.dropna(thresh=2)

Unnamed: 0,a,b,c,d
0,1,2,3.0,4
2,$$,xxx,0.0,no


In [78]:
#Only drop rows containing NaN in a particular column
df_miss = pd.read_csv('./dataset/temp.csv')
df_miss.dropna(subset=['c'])

Unnamed: 0,a,b,c,d
0,1,2,3.0,4
2,$$,xxx,0.0,no


In [79]:
#Fill missing values with a particular value
df_miss = pd.read_csv('./dataset/temp.csv')
df_miss = df_miss.fillna(0)
df_miss

Unnamed: 0,a,b,c,d
0,1,2,3.0,4
1,0,0,0.0,0
2,$$,xxx,0.0,no


In [133]:
#Fill missing value by taking mean of that column
df_missing = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
                 'b': [np.nan, 2., np.nan, 6.],
                 'c': range(2, 18, 4)})
print("Before Imputation")
print(df_missing)


# Get the mean of the column
mean_value = df_missing['a'].mean()
# Fill missing values using .fillna()
df_missing['a'] = df_missing['a'].fillna(mean_value)

print("After Imputation")
print(df_missing)

Before Imputation
     a    b   c
0  1.0  NaN   2
1  NaN  2.0   6
2  5.0  NaN  10
3  NaN  6.0  14
After Imputation
     a    b   c
0  1.0  NaN   2
1  3.0  2.0   6
2  5.0  NaN  10
3  3.0  6.0  14


## Duplicates

In [80]:
# Find whether duplicate records exist
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1301    False
1304    False
1306    False
1307    False
1310    False
Length: 1045, dtype: bool

In [81]:
# Display duplicate records
df[df.duplicated() == True]

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival


In [82]:
# Drop duplicate records
df.drop_duplicates()

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
0,"Allen, Miss. Elisabeth Walton",29.0,Female,1st,211.34,Survived
1,"Allison, Master. Hudson Trevor",1.0,Male,1st,151.55,Survived
2,"Allison, Miss. Helen Loraine",2.0,Female,1st,151.55,Died
3,"Allison, Mr. Hudson Joshua Creighton",30.0,Male,1st,151.55,Died
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0,Female,1st,151.55,Died
...,...,...,...,...,...,...
1301,"Youseff, Mr. Gerious",46.0,Male,3rd,7.23,Died
1304,"Zabour, Miss. Hileni",15.0,Female,3rd,14.45,Died
1306,"Zakarian, Mr. Mapriededer",27.0,Male,3rd,7.23,Died
1307,"Zakarian, Mr. Ortin",27.0,Male,3rd,7.23,Died


In [83]:
# Drop records (except first one) if values in a particular column matches
df.drop_duplicates(['Name'])

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
0,"Allen, Miss. Elisabeth Walton",29.0,Female,1st,211.34,Survived
1,"Allison, Master. Hudson Trevor",1.0,Male,1st,151.55,Survived
2,"Allison, Miss. Helen Loraine",2.0,Female,1st,151.55,Died
3,"Allison, Mr. Hudson Joshua Creighton",30.0,Male,1st,151.55,Died
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0,Female,1st,151.55,Died
...,...,...,...,...,...,...
1301,"Youseff, Mr. Gerious",46.0,Male,3rd,7.23,Died
1304,"Zabour, Miss. Hileni",15.0,Female,3rd,14.45,Died
1306,"Zakarian, Mr. Mapriededer",27.0,Male,3rd,7.23,Died
1307,"Zakarian, Mr. Ortin",27.0,Male,3rd,7.23,Died


## Replacing Values

In [84]:
# Replace all 'from' values in dataframe to 'to' values
# One-to-One replacement
# First argument is 'from', second argument is 'to'
df.replace(14.45, 14)

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
0,"Allen, Miss. Elisabeth Walton",29.0,Female,1st,211.34,Survived
1,"Allison, Master. Hudson Trevor",1.0,Male,1st,151.55,Survived
2,"Allison, Miss. Helen Loraine",2.0,Female,1st,151.55,Died
3,"Allison, Mr. Hudson Joshua Creighton",30.0,Male,1st,151.55,Died
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0,Female,1st,151.55,Died
...,...,...,...,...,...,...
1301,"Youseff, Mr. Gerious",46.0,Male,3rd,7.23,Died
1304,"Zabour, Miss. Hileni",15.0,Female,3rd,14,Died
1306,"Zakarian, Mr. Mapriededer",27.0,Male,3rd,7.23,Died
1307,"Zakarian, Mr. Ortin",27.0,Male,3rd,7.23,Died


In [85]:
# Replace all multiple 'from' values in dataframe to 'to' value
# Many-to-One replacement
# First argument is 'from', second argument is 'to'
df.replace([14.00, 7.23], 10.00)

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
0,"Allen, Miss. Elisabeth Walton",29.0,Female,1st,211.34,Survived
1,"Allison, Master. Hudson Trevor",1.0,Male,1st,151.55,Survived
2,"Allison, Miss. Helen Loraine",2.0,Female,1st,151.55,Died
3,"Allison, Mr. Hudson Joshua Creighton",30.0,Male,1st,151.55,Died
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0,Female,1st,151.55,Died
...,...,...,...,...,...,...
1301,"Youseff, Mr. Gerious",46.0,Male,3rd,10.0,Died
1304,"Zabour, Miss. Hileni",15.0,Female,3rd,14.45,Died
1306,"Zakarian, Mr. Mapriededer",27.0,Male,3rd,10.0,Died
1307,"Zakarian, Mr. Ortin",27.0,Male,3rd,10.0,Died


In [86]:
# Replace multiple 'from' values in dataframe to multiple 'to' values with one to one correspondence
# Many-to-Many replacement preserving correspondence
# First argument is 'from', second argument is 'to'
df.replace([14.45, 7.88], [11.00, 9.00])

# 14.45 -> 11.00 & 7.88 -> 9.00

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
0,"Allen, Miss. Elisabeth Walton",29.0,Female,1st,211.34,Survived
1,"Allison, Master. Hudson Trevor",1.0,Male,1st,151.55,Survived
2,"Allison, Miss. Helen Loraine",2.0,Female,1st,151.55,Died
3,"Allison, Mr. Hudson Joshua Creighton",30.0,Male,1st,151.55,Died
4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0,Female,1st,151.55,Died
...,...,...,...,...,...,...
1301,"Youseff, Mr. Gerious",46.0,Male,3rd,7.23,Died
1304,"Zabour, Miss. Hileni",15.0,Female,3rd,11.0,Died
1306,"Zakarian, Mr. Mapriededer",27.0,Male,3rd,7.23,Died
1307,"Zakarian, Mr. Ortin",27.0,Male,3rd,7.23,Died


## Descriptive Statistics on Dataframes

All of the descriptive statistics on pandas objects exclude missing data

In [87]:
#By default, summary statistics are generated for quantitative variables on a dataframe

df = pd.read_csv('./dataset/TITANIC_FORMATTED.csv')
df = df.dropna()
df.describe()

#count - total count of values excluding NA values, if present

Unnamed: 0,Age,Fare
count,1045.0,1045.0
mean,29.867943,36.686852
std,14.389684,55.73258
min,0.0,0.0
25%,21.0,8.05
50%,28.0,15.75
75%,39.0,35.5
max,80.0,512.33


In [88]:
#Passing the parameter (include='all') to describe() will generate summary statistics for all variables
df.describe(include='all')

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
count,1045,1045.0,1045,1045,1045.0,1045
unique,1043,,2,3,,2
top,"Kelly, Mr. James",,Male,3rd,,Died
freq,2,,657,500,,618
mean,,29.867943,,,36.686852,
std,,14.389684,,,55.73258,
min,,0.0,,,0.0,
25%,,21.0,,,8.05,
50%,,28.0,,,15.75,
75%,,39.0,,,35.5,


In [89]:
#Exclude float variables
df.describe(exclude=[np.float])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  df.describe(exclude=[np.float])


Unnamed: 0,Name,Gender,Class,Survival
count,1045,1045,1045,1045
unique,1043,2,3,2
top,"Kelly, Mr. James",Male,3rd,Died
freq,2,657,500,618


In [90]:
#Summary statistics for a quantitative variable
df['Age'].describe()

count    1045.000000
mean       29.867943
std        14.389684
min         0.000000
25%        21.000000
50%        28.000000
75%        39.000000
max        80.000000
Name: Age, dtype: float64

In [91]:
#Summary statistics for qualitative variable
df['Name'].describe()

#unique - total count of unique values
#top - value with the highest frequency
#freq - frequency of top value 

count                 1045
unique                1043
top       Kelly, Mr. James
freq                     2
Name: Name, dtype: object

In [92]:
#Skewness of a quantitative variable
df['Age'].skew()

0.4065722240203419

In [93]:
#Kurtosis of a quantitative variable
df['Age'].kurt()

0.15463344200588924

In [94]:
#By default, Correlation matrix is generated for all quantitative variables of dataframe
df.corr()

Unnamed: 0,Age,Fare
Age,1.0,0.178415
Fare,0.178415,1.0


In [95]:
#Correlation between two quantitative variables/columns
#X.corrwith(Y)  where X = dataframe and Y = dataframe/Series
# df['Age'].corrwith(df.Fare)             # uncomment this line to see that it does not work as df['Age'] is a series and not a dataframe

In [96]:
df_Age_Frame = df['Age'].to_frame() # Convert series to dataframe
df_Age_Frame.corrwith(df.Fare)

Age    0.178415
dtype: float64

In [97]:
#Categorical variables - all unique values/classes in a column
df['Name'].unique()

array(['Allen, Miss. Elisabeth Walton', 'Allison, Master. Hudson Trevor',
       'Allison, Miss. Helen Loraine', ..., 'Zakarian, Mr. Mapriededer',
       'Zakarian, Mr. Ortin', 'Zimmerman, Mr. Leo'], dtype=object)

In [98]:
#Categorical variables - count of all unique values/classes in a column
df['Class'].value_counts()

3rd    500
1st    284
2nd    261
Name: Class, dtype: int64

In [99]:
#Categorical variables - Membership  
mask = df['Survival'].isin(['Survived'])
mask

0        True
1        True
2       False
3       False
4       False
        ...  
1301    False
1304    False
1306    False
1307    False
1308    False
Name: Survival, Length: 1045, dtype: bool

In [100]:
df[mask]

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
0,"Allen, Miss. Elisabeth Walton",29.0,Female,1st,211.34,Survived
1,"Allison, Master. Hudson Trevor",1.0,Male,1st,151.55,Survived
5,"Anderson, Mr. Harry",48.0,Male,1st,26.55,Survived
6,"Andrews, Miss. Kornelia Theodosia",63.0,Female,1st,77.96,Survived
8,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",53.0,Female,1st,51.48,Survived
...,...,...,...,...,...,...
1261,"Turkula, Mrs. (Hedwig)",63.0,Female,3rd,9.59,Survived
1277,"Vartanian, Mr. David",22.0,Male,3rd,7.23,Survived
1286,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",38.0,Female,3rd,7.23,Survived
1290,"Wilkes, Mrs. James (Ellen Needs)",47.0,Female,3rd,7.00,Survived


## Simple Transformations

In [101]:
df['Age'] * 2

0       58.0
1        2.0
2        4.0
3       60.0
4       50.0
        ... 
1301    92.0
1304    30.0
1306    54.0
1307    54.0
1308    58.0
Name: Age, Length: 1045, dtype: float64

In [102]:
df['Fare'] + df['Age'] 

0       240.34
1       152.55
2       153.55
3       181.55
4       176.55
         ...  
1301     53.23
1304     29.45
1306     34.23
1307     34.23
1308     36.88
Length: 1045, dtype: float64

In [103]:
df['Fare'] - df['Age'] 

0       182.34
1       150.55
2       149.55
3       121.55
4       126.55
         ...  
1301    -38.77
1304     -0.55
1306    -19.77
1307    -19.77
1308    -21.12
Length: 1045, dtype: float64

In [104]:
df['Fare'] * df['Age'] 

0       6128.86
1        151.55
2        303.10
3       4546.50
4       3788.75
         ...   
1301     332.58
1304     216.75
1306     195.21
1307     195.21
1308     228.52
Length: 1045, dtype: float64

In [105]:
df['Fare'] / df['Age'] 

0         7.287586
1       151.550000
2        75.775000
3         5.051667
4         6.062000
           ...    
1301      0.157174
1304      0.963333
1306      0.267778
1307      0.267778
1308      0.271724
Length: 1045, dtype: float64

In [106]:
#Element-wise transformation of series
f = lambda x: abs(x) #can be user defined function as well
df['Age'].map(f)

0       29.0
1        1.0
2        2.0
3       30.0
4       25.0
        ... 
1301    46.0
1304    15.0
1306    27.0
1307    27.0
1308    29.0
Name: Age, Length: 1045, dtype: float64

In [116]:
#Element-wise transformation of entire dataframe
f = lambda x: True if x != 0 else False
df.applymap(f)

# Deprecated since version 2.1.0, use map instead 

Unnamed: 0,Name,Age,Gender,Class,Fare,Survival
0,True,True,True,True,True,True
1,True,True,True,True,True,True
2,True,True,True,True,True,True
3,True,True,True,True,True,True
4,True,True,True,True,True,True
...,...,...,...,...,...,...
1301,True,True,True,True,True,True
1304,True,True,True,True,True,True
1306,True,True,True,True,True,True
1307,True,True,True,True,True,True


In [117]:
#Row or column wise transformation on dataframe
#Objects passed to functions are Series objects having index either the DataFrame’s index or the columns 
#axis = 0  - apply function to each column (default)
#axis = 1 - apply function to each row
f = lambda x: x.count()
df.apply(f)

Name        1045
Age         1045
Gender      1045
Class       1045
Fare        1045
Survival    1045
dtype: int64

In [118]:
# Creating Dummy variables for categorial variables

dummies = pd.get_dummies(df['Gender'], prefix='key')
dummies

Unnamed: 0,key_Female,key_Male
0,1,0
1,0,1
2,1,0
3,0,1
4,1,0
...,...,...
1301,0,1
1304,1,0
1306,0,1
1307,0,1


In [119]:
# Join Dummy variables to the original dataframe
df_with_dummy = df[['Gender']].join(dummies)
df_with_dummy

Unnamed: 0,Gender,key_Female,key_Male
0,Female,1,0
1,Male,0,1
2,Female,1,0
3,Male,0,1
4,Female,1,0
...,...,...,...
1301,Male,0,1
1304,Female,1,0
1306,Male,0,1
1307,Male,0,1


## Write to file

In [99]:
#Write to csv file specifying separator and whether index and headers need to be written as well
#Missing values are written as empty strings
df.to_csv('./dataset/Write_file.csv', sep = ',', index=False, header=False)