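# Pre-processing Data
This notebook gives a basic overview of what you can achieve by pre-processing data with pandas: handling missing values, normalizing numeric columns, binning, and one-hot encoding categorical values.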

In [None]:
import pandas as pd
import numpy as np

def build_nan_df():
    """Build a small two-column demo frame that contains one missing value.

    Returns:
        pandas.DataFrame: four rows with columns 'Heading1' and 'Heading2';
        the last entry of 'Heading2' is missing (NaN).
    """
    # Each inner list holds the values of one column. Labelling the rows and
    # then transposing turns those labels into the column headings.
    columns_as_rows = [[1, 2, 3, 10], [3, 4, 8, None]]
    return pd.DataFrame(columns_as_rows, index=['Heading1', 'Heading2']).T


df = build_nan_df()
df


#### Missing Values

In [None]:
# dropna() returns a NEW DataFrame and leaves `df` untouched. The result is
# discarded here, so the `df` displayed below still contains the NaN row.
df.dropna()
df

In [None]:
# Remove every row that contains a NaN, modifying `df` itself this time.
# (axis="columns", i.e. axis=1, would drop the affected columns instead.)
df.dropna(axis="index", inplace=True)
df

In [None]:
# Impute the missing entry with the column mean, using Series.replace.
df = build_nan_df()
mean = df['Heading2'].mean()  # mean() skips NaN by default
df['Heading2'] = df['Heading2'].replace(to_replace=np.nan, value=mean)
df


In [None]:
# Same mean imputation, this time with Series.fillna.
# The filled column is assigned back instead of calling
# fillna(..., inplace=True) on df['Heading2']: that selection is an intermediate
# object, so the in-place form raises a FutureWarning on recent pandas 2.x and
# leaves `df` unchanged once Copy-on-Write is enabled.
df = build_nan_df()
df['Heading2'] = df['Heading2'].fillna(df['Heading2'].mean())
df

#### Data Normalization

In [None]:
def build_df():
    """Build the two-column numeric demo frame used by the cells that follow.

    Returns:
        pandas.DataFrame: four rows with integer columns 'x1' and 'x2'.
    """
    return pd.DataFrame({
        'x1': [1, 2, 3, 10],
        'x2': [300, 488, 887, 370],
    })


df = build_df()
df

In [None]:
## Divide by max value: scale each column so that its largest entry becomes 1

df['x1'] = df['x1'].div(df['x1'].max())
df['x2'] = df['x2'].div(df['x2'].max())

df

In [None]:
## Min-max method: rescale each column to [0, 1] via (x - min) / (max - min)
# Rebuild the sample frame so this cell starts from the unscaled values.
df = build_df()
def min_max(df, column_name):
    """Rescale one column of ``df`` to the range [0, 1] in place.

    Each value becomes ``(x - min) / (max - min)``, so the column minimum maps
    to 0 and the maximum to 1.

    Args:
        df: DataFrame to modify; the named column is overwritten.
        column_name: label of the numeric column to rescale.

    Returns:
        None. The result is written back into ``df[column_name]``.
    """
    column = df[column_name]
    lowest = column.min()
    value_range = column.max() - lowest
    if value_range == 0:
        # Constant column: dividing would give 0/0 = NaN for every row. The
        # shifted column is already all zeros (missing values stay NaN), so
        # use it directly as the scaled result.
        df[column_name] = (column - lowest).astype(float)
        return
    df[column_name] = (column - lowest) / value_range
# Rescale both feature columns in place, then display the result.
for column_name in ('x1', 'x2'):
    min_max(df, column_name)
df

In [None]:
## Z-score method: centre each column on its mean and scale by its standard deviation
# (pandas' std() is the sample standard deviation, ddof=1)
df = build_df()
df['x1'] = df['x1'].sub(df['x1'].mean()).div(df['x1'].std())
df['x2'] = df['x2'].sub(df['x2'].mean()).div(df['x2'].std())

df

#### Binning

In [None]:
# Split x2 into equal-width bins spanning its observed range, one per label.
df = build_df()
group_names = ['Low', 'Medium', 'High']
# n labels need n + 1 edges, so the edge count is derived from the labels.
# Series.min()/max() skip NaN, whereas the builtin min()/max() give
# order-dependent results when a NaN is present.
bins = np.linspace(df['x2'].min(), df['x2'].max(), len(group_names) + 1)
# include_lowest=True keeps the minimum value inside the first bin, whose left
# edge would otherwise be open.
df['x2-binned'] = pd.cut(df['x2'], bins, labels=group_names, include_lowest=True)
df

In [None]:
# Display the bin edges that were passed to pd.cut in the previous cell.
bins

In [None]:
def build_df():
    """Build the demo frame with two numeric columns and one categorical column.

    NOTE: this rebinds the name ``build_df``, replacing the two-column version
    defined earlier in the notebook. Any cell re-run after this one gets this
    three-column version.

    Returns:
        pandas.DataFrame: four rows with columns 'x1' and 'x2' (integers) and
        'color' (strings).
    """
    # Build column-wise. Transposing a row-wise table that mixes ints and
    # strings would leave 'x1' and 'x2' as object dtype instead of numeric.
    return pd.DataFrame({
        'x1': [1, 2, 3, 10],
        'x2': [300, 488, 887, 370],
        'color': ['blue', 'red', 'orange', 'blue'],
    })


df = build_df()
df

In [None]:
# One-hot encode the categorical column: get_dummies returns one indicator
# column per distinct value of 'color'.
pd.get_dummies(df['color'])