# Overview of Data Preprocessing

In [36]:
#In general, raw data stored in a database may not be in an understandable format.
#Often, raw data can be inconsistent, incomplete, and may contain many errors.
#Thus, we need to improve the quality of this data and simplify it by a method called data preprocessing.

In [3]:
#Data cleaning : Data cleaning is the process of removing noise and correcting inconsistencies in the data.

#Data integration : It involves merging data from different sources.

#Data transformation : Also known as normalisation, it is the process of organising a database to remove redundancy.

#Data reduction : It is the process of reducing the size of the data by eliminating redundant features.

# Data Cleaning :

Filling in missing values :

In [10]:
# Missing entries are a common problem in real-world datasets, and they
# degrade prediction accuracy because they lower the quality of the data.
#
# Build a small frame containing missing values (NaN) with pandas:
# only five labels receive random data, so reindexing onto eight labels
# leaves rows 'b', 'd' and 'g' filled entirely with NaN.
import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.random.randn(5, 3),
    index=list("acefh"),
    columns=["one", "two", "three"],
).reindex(list("abcdefgh"))
print(df)

        one       two     three
a  0.343040  0.466260  0.101841
b       NaN       NaN       NaN
c -0.331879  0.695332 -0.053839
d       NaN       NaN       NaN
e -0.904229  0.559250 -0.430447
f  0.928770 -0.742984 -1.178588
g       NaN       NaN       NaN
h -0.055059  0.494720 -0.077603


In [13]:
# pandas provides two complementary functions for detecting missing values
# in a DataFrame (or one of its columns): isnull() and notnull().
#
# isnull(): True wherever the value is missing.
import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.random.randn(5, 3),
    index=list("acefh"),
    columns=["one", "two", "three"],
).reindex(list("abcdefgh"))

# Rows 'b', 'd' and 'g' were introduced by the reindex, so they are the
# ones flagged as missing.
one_is_missing = df["one"].isnull()
print(one_is_missing)


a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool


In [14]:
# notnull(): the inverse of isnull(), True wherever a value is present.
import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.random.randn(5, 3),
    index=list("acefh"),
    columns=["one", "two", "three"],
).reindex(list("abcdefgh"))

# Only the five labels that originally received data are reported as present.
one_is_present = df["one"].notnull()
print(one_is_present)

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool


Calculation with Missing Data:

In [36]:
# When a sum is calculated, NaN (not a number) entries are skipped, which
# is the same as treating them as zero.
# If every entry is NaN, the sum is 0 by default in current pandas;
# pass min_count=1 to sum() to get NaN in that case instead.
import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.random.randn(5, 3),
    index=list("acefh"),
    columns=["one", "two", "three"],
).reindex(list("abcdefgh"))

# Three of the eight entries in 'one' are NaN and do not contribute.
total_one = df["one"].sum()
print(total_one)


3.1884055137723077


In [33]:
# A frame built from labels alone holds nothing but NaN.
# Summing such a column yields 0, because NaN entries are skipped and the
# sum of "nothing" is 0 by default (use sum(min_count=1) to obtain NaN).
import numpy as np
import pandas as pd

df = pd.DataFrame(index=list(range(6)), columns=["one", "two"])
empty_total = df["one"].sum()
print(empty_total)

0


Cleaning and filling missing data :

In [42]:
# The fillna() function "fills in" NaN values with non-null data, and it
# supports several strategies for choosing the replacement.
#
# Replace NaN with a scalar value:
import pandas as pd
import numpy as np

# The first column is labelled 'one' with no trailing space, so that
# df['one'] works and the label matches the other examples.
df = pd.DataFrame(
    np.random.randn(3, 3),
    index=['a', 'c', 'e'],
    columns=['one', 'two', 'three'],
)
# Reindexing keeps 'a' and 'c', drops 'e', and introduces an all-NaN row 'b'.
df = df.reindex(['a', 'b', 'c'])
print(df)
print("NaN replaced with '0'")
# fillna() returns a new frame; df itself still contains the NaN row.
print(df.fillna(0))

       one        two     three
a -0.617463 -0.804198  1.141566
b       NaN       NaN       NaN
c  0.771097 -0.701499 -0.668091
NaN replaced with '0'
       one        two     three
a -0.617463 -0.804198  1.141566
b  0.000000  0.000000  0.000000
c  0.771097 -0.701499 -0.668091


Fill NA forward and backward :

In [44]:
# Forward and backward filling replace each NaN with a neighbouring value
# from the same column.
# Forward fill: the last valid value BEFORE the gap is copied into it.
# Backward fill: the next valid value AFTER the gap is copied into it.
# pad/ffill : fill forward.
# bfill/backfill : fill backward.

# pad/ffill :
import pandas as pd
import numpy as np

df = pd.DataFrame(
    np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

# DataFrame.ffill() is the supported spelling of forward fill;
# fillna(method='pad') is deprecated in current pandas releases.
# It returns a new frame, so df keeps its NaN rows.
df_ffilled = df.ffill()
print(df_ffilled)

        one       two     three
a  1.055712 -0.735200 -1.775587
b  1.055712 -0.735200 -1.775587
c -0.239658 -0.023311 -0.038379
d -0.239658 -0.023311 -0.038379
e  0.882769  0.069599  0.244264
f -0.066037  0.494994  1.120512
g -0.066037  0.494994  1.120512
h  0.977151  0.762650 -0.247565


In [46]:
# bfill/backfill : each NaN takes the next valid value below it in the column.
import pandas as pd
import numpy as np

df = pd.DataFrame(
    np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

# DataFrame.bfill() is the supported spelling of backward fill;
# fillna(method='bfill') is deprecated in current pandas releases.
# It returns a new frame, so df keeps its NaN rows.
df_bfilled = df.bfill()
print(df_bfilled)

        one       two     three
a  1.965099 -0.107059  0.900731
b  0.834035 -0.574718 -0.078340
c  0.834035 -0.574718 -0.078340
d -0.497421  0.854410  0.636999
e -0.497421  0.854410  0.636999
f  0.228755 -0.577907 -1.102702
g  0.847834  0.679947 -0.218142
h  0.847834  0.679947 -0.218142


Drop missing Values :

In [50]:
#If the missing values have to be removed from the analysis, then we use the dropna() function along with the axis argument.
# dropna() works along axis=0 by default, i.e. row by row: a row is
# excluded from the result as soon as any value in it is NaN.
# Dropping rows:
import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.random.randn(5, 3),
    index=list("acefh"),
    columns=["one", "two", "three"],
).reindex(list("abcdefgh"))

# Only the five fully populated rows survive; df itself is left unchanged.
complete_rows = df.dropna()
print(complete_rows)

        one       two     three
a  0.100796  0.558939  0.488761
c -0.070478  2.232041  0.193037
e -1.271047  0.468052 -0.623290
f  1.739567  0.099446  1.116792
h -2.293399  0.064282  0.025077


In [53]:
# Dropping columns: with axis=1 a column is excluded as soon as any value
# in it is NaN. Every column here has NaN in rows 'b', 'd' and 'g', so the
# result keeps the eight row labels but has no columns left.
import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.random.randn(5, 3),
    index=list("acefh"),
    columns=["one", "two", "three"],
).reindex(list("abcdefgh"))

complete_columns = df.dropna(axis=1)
print(complete_columns)

Empty DataFrame
Columns: []
Index: [a, b, c, d, e, f, g, h]


Replace missing (or) Generic Values :

In [58]:
# The replace() method swaps chosen values for other values.
# Given a mapping, every occurrence of a key anywhere in the frame is
# replaced by the corresponding value.
# Replacing NaN with a scalar this way is equivalent to calling fillna().
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "one": [10, 20, 30, 40, 50, 2000],
        "two": [1000, 0, 30, 40, 50, 60],
    }
)

# old value -> new value
substitutions = {1000: 10, 2000: 60}
print(df.replace(substitutions))

   one  two
0   10   10
1   20    0
2   30   30
3   40   40
4   50   50
5   60   60
