# Pre-processing Data
This notebook gives a basic overview of what you can achieve by pre-processing data: handling missing values, normalizing columns, binning, and one-hot encoding.

In [38]:
import pandas as pd
import numpy as np

def build_nan_df():
    """Return a 4-row demo frame whose last 'Heading2' entry is missing.

    Each list below is one column. The frame is built with one row per
    column and then transposed, so each list ends up as a column.
    """
    column_values = {
        'Heading1': [1, 2, 3, 10],
        'Heading2': [3, 4, 8, None],
    }
    wide = pd.DataFrame(list(column_values.values()), index=list(column_values))
    return wide.T

df = build_nan_df()
df


Unnamed: 0,Heading1,Heading2
0,1.0,3.0
1,2.0,4.0
2,3.0,8.0
3,10.0,


#### Missing Values

In [27]:
# dropna() returns a new DataFrame; the result is discarded here, so the
# `df` displayed below still contains the row with the missing value.
df.dropna()
df

Unnamed: 0,Heading1,Heading2
0,1.0,3.0
1,2.0,4.0
2,3.0,8.0
3,10.0,


In [29]:
# Keep only complete rows: axis=0 drops rows containing NaN, axis=1 would
# drop columns instead. The result is bound back to `df` rather than using
# inplace=True, so the frame shown by earlier cells is not mutated.
df = df.dropna(axis=0)
df

Unnamed: 0,Heading1,Heading2
0,1.0,3.0
1,2.0,4.0
2,3.0,8.0


In [35]:
df = build_nan_df()
# mean() skips the missing entry, so this is the mean of the observed values.
mean = df['Heading2'].mean()
df['Heading2'] = df['Heading2'].replace(to_replace=np.nan, value=mean)
df


Unnamed: 0,Heading1,Heading2
0,1.0,3.0
1,2.0,4.0
2,3.0,8.0
3,10.0,5.0


In [37]:
df = build_nan_df()
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['Heading2']: the in-place call acts on an intermediate Series, which
# pandas 2.x warns about and which leaves `df` unchanged under Copy-on-Write.
df['Heading2'] = df['Heading2'].fillna(df['Heading2'].mean())
df

Unnamed: 0,Heading1,Heading2
0,1.0,3.0
1,2.0,4.0
2,3.0,8.0
3,10.0,5.0


#### Data Normalization

In [43]:
def build_df():
    """Return a 4-row demo frame with a small-valued 'x1' and a large-valued 'x2'."""
    column_values = {
        'x1': [1, 2, 3, 10],
        'x2': [300, 488, 887, 370],
    }
    # One row per column first, then transpose so each list becomes a column.
    wide = pd.DataFrame(list(column_values.values()), index=list(column_values))
    return wide.T
df = build_df()
df

Unnamed: 0,x1,x2
0,1,300
1,2,488
2,3,887
3,10,370


In [45]:
## Divide by Max value
# Scale each column by its own maximum so the largest entry becomes 1.0.
df['x1'] = df['x1'].div(df['x1'].max())
df['x2'] = df['x2'].div(df['x2'].max())

df

Unnamed: 0,x1,x2
0,0.1,0.338219
1,0.2,0.550169
2,0.3,1.0
3,1.0,0.417136


In [47]:
## Min Max Method
df = build_df()
def min_max(df, column_name):
    """Rescale ``df[column_name]`` in place to the [0, 1] range (min-max scaling).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the named column is overwritten.
    column_name : str
        Column to rescale.

    A constant column has max == min, so the usual formula would divide
    0 by 0 and turn every value into NaN. Such a column is mapped to 0.0
    instead; entries that were already missing stay missing.
    """
    column = df[column_name]
    lowest = column.min()
    value_range = column.max() - lowest
    if value_range == 0:
        # Zero range: shift only, and cast so the result is float like the
        # normal branch.
        df[column_name] = (column - lowest).astype(float)
    else:
        df[column_name] = (column - lowest) / value_range
min_max(df, 'x1')
min_max(df, 'x2')
df

Unnamed: 0,x1,x2
0,0.0,0.0
1,0.111111,0.320273
2,0.222222,1.0
3,1.0,0.11925


In [49]:
## Z-score method
df = build_df()
# Standardise each column: subtract its mean, then divide by its standard
# deviation. Series.std() is the sample standard deviation (ddof=1) by default.
df['x1'] = df['x1'].sub(df['x1'].mean()).div(df['x1'].std())
df['x2'] = df['x2'].sub(df['x2'].mean()).div(df['x2'].std())

df

Unnamed: 0,x1,x2
0,-0.734847,-0.805565
1,-0.489898,-0.08866
2,-0.244949,1.432857
3,1.469694,-0.538632


#### Binning

In [52]:
df = build_df()
group_names = ['Low', 'Medium', 'High']
# Equal-width bins over the observed x2 range: one more edge than labels.
bins = np.linspace(df['x2'].min(), df['x2'].max(), len(group_names) + 1)
# include_lowest=True closes the first interval on the left, so the minimum
# value is labelled instead of falling outside the bins.
df['x2-binned'] = pd.cut(df['x2'], bins=bins, labels=group_names, include_lowest=True)
df

Unnamed: 0,x1,x2,x2-binned
0,1,300,Low
1,2,488,Low
2,3,887,High
3,10,370,Low


In [53]:
bins

array([300.        , 495.66666667, 691.33333333, 887.        ])

In [55]:
def build_df():
    """Return the demo frame extended with a categorical 'color' column.

    This reuses the name of the earlier two-column ``build_df`` and replaces
    it for every cell run afterwards.

    NOTE(review): the row-wise table mixes numbers and strings, so after the
    transpose 'x1' and 'x2' are probably object dtype rather than int.
    Confirm before doing arithmetic on them.
    """
    column_values = {
        'x1': [1, 2, 3, 10],
        'x2': [300, 488, 887, 370],
        'color': ['blue', 'red', 'orange', 'blue'],
    }
    wide = pd.DataFrame(list(column_values.values()), index=list(column_values))
    return wide.T
df = build_df()
df

Unnamed: 0,x1,x2,color
0,1,300,blue
1,2,488,red
2,3,887,orange
3,10,370,blue


In [56]:
# One-hot encode 'color': one indicator column per distinct value.
# dtype=int keeps the 0/1 indicators of the recorded output; pandas >= 2.0
# returns True/False columns by default.
pd.get_dummies(df['color'], dtype=int)

Unnamed: 0,blue,orange,red
0,1,0,0
1,0,0,1
2,0,1,0
3,1,0,0
