In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Types Of Transformation
#### 1. Normalization And Standardization
#### 2. Scaling to Minimum And Maximum values
#### 3. Scaling To Median And Quantiles
#### 4. Gaussian Transformation

    a. Logarithmic Transformation 
    
    b. Reciprocal Transformation 
    
    c. Square Root Transformation 
    
    d. Exponential Transformation
    
    e. Box Cox Transformation

## 1. Standardization
It is used more in Machine Learning algos

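It is typically used when a feature is (approximately) normally distributed; after scaling, the feature has the mean and variance of a standard normal distribution (0 and 1)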

It is most commonly used as it works with most of the algorithms

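We try to bring all the variables or features to a similar scale. Standardization means centering the variable at zero and scaling it to unit variance: $z = \dfrac{x - \bar{x}}{\sigma}$, where $\bar{x}$ is the mean and $\sigma$ is the standard deviation of the feature.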

In [None]:
import pandas as pd

# Read only the four columns used in the scaling examples.
df = pd.read_csv(
    '../input/titanic/train.csv',
    usecols=['Pclass', 'Age', 'Fare', 'Survived'],
)
df.head()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Replace the missing ages with the median age.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['Age']: the in-place form acts on an
# intermediate object, which pandas warns about and which leaves df
# unchanged when Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].median())

In [None]:
# Standardization: we use StandardScaler from the sklearn library
from sklearn.preprocessing import StandardScaler

# Create the StandardScaler object
scalar = StandardScaler()

In [None]:
#Fit vs Fit_transform
#fit only learns the scaling parameters from the data, whereas fit_transform learns them and transforms the data in one step

In [None]:
# Fit the StandardScaler object to the dataset and transform the dataset
# in a single step with fit_transform, then display the result.
df_scaled = scalar.fit_transform(df)
df_scaled

In [None]:
# Preview the standardized values. The original column names are passed in
# so the preview is labelled like the MinMax and Robust examples instead of
# showing anonymous 0..3 column headers.
pd.DataFrame(df_scaled, columns=df.columns).head()

In [None]:
# Histogram of the standardized Age values. The column position is looked up
# by name instead of hard-coding index 2, so the plot keeps showing Age even
# if the set or order of loaded columns changes.
plt.hist(df_scaled[:, df.columns.get_loc('Age')], bins=20)

## 2. MinMax Scaling
It works well with CNNs (deep learning)

It transforms the values to the range between 0 and 1: $X_{scaled} = \dfrac{X - X_{min}}{X_{max} - X_{min}}$

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale every column with MinMaxScaler and keep the original column names.
minmax = MinMaxScaler()
df_minmax = pd.DataFrame(
    minmax.fit_transform(df),
    columns=df.columns,
)
df_minmax

In [None]:
# Distribution of Age after min-max scaling
plt.hist(df_minmax['Age'], bins=20)

## 3. Robust Scaler

It is used to scale the feature to median and quantiles. Scaling using median and quantiles consists of subtracting the median from all the observations, and then dividing by the interquantile range (IQR). The IQR is the difference between the 75th and 25th quantile:

IQR = 75th quantile - 25th quantile

X_scaled = (X - X.median) / IQR

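Example: for the values 0,1,2,3,4,5,6,7,8,9,10

- 9 is the 90th percentile: about 90% of all values in this group are less than 9
- 1 is the 10th percentile: about 10% of all values in this group are less than 1
- 4 is the 40th percentile: about 40% of all values in this group are less than 4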

In [None]:
from sklearn.preprocessing import RobustScaler

# Scale every column with RobustScaler and keep the original column names.
robust = RobustScaler()
df_robust = pd.DataFrame(
    robust.fit_transform(df),
    columns=df.columns,
)
df_robust

In [None]:
# Distribution of Age after robust scaling
plt.hist(df_robust['Age'], bins=20)

## 4. Gaussian Transformation
Used in KNN etc

Some machine learning algorithms, like linear regression and logistic regression, assume that the features are normally distributed

-Accuracy 

-Performance

    a. Logarithmic Transformation 
    
    b. Reciprocal Transformation 
    
    c. Square Root Transformation 
    
    d. Exponential Transformation
    
    e. Box Cox Transformation

In [None]:

# Reload the data with only the columns needed for the transformation examples.
df = pd.read_csv(
    '../input/titanic/train.csv',
    usecols=['Age', 'Fare', 'Survived'],
)
df.head()

In [None]:
# Fill the missing ages with the median age
df['Age'] = df['Age'].fillna(df['Age'].median())

In [None]:
# Count the missing values remaining in each column
df.isna().sum()

In [None]:
import scipy.stats as stat
import pylab

In [None]:
# Check visually whether a feature is Gaussian (normally distributed):
# a histogram next to a Q-Q plot.
def plot_data(df, feature):
    """Show a histogram and a normal Q-Q plot of one column side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    feature : str
        Name of the column to plot.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Explicit Axes replace plt.subplot + pylab, so each plot is drawn on a
    # known panel rather than on whatever the "current axes" happens to be.
    fig, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(10, 6))

    df[feature].hist(ax=ax_hist)
    ax_hist.set_title(f'Histogram of {feature}')

    # probplot draws the ordered values against normal quantiles, together
    # with a fitted reference line, on the Axes passed through `plot`.
    stat.probplot(df[feature], dist='norm', plot=ax_qq)

    plt.show()

In [None]:
# Check whether the points follow the same line (red line)
plot_data(df, 'Age')

### Logarithmic Transformation
It works best for right-skewed data

In [None]:
# Natural logarithm of Age, followed by the histogram / Q-Q check
df['Age_log'] = np.log(df['Age'])
plot_data(df, 'Age_log')

### Reciprocal Transformation

In [None]:
# Reciprocal of Age, followed by the histogram / Q-Q check
df['Age_reciprocal'] = 1 / df['Age']
plot_data(df, 'Age_reciprocal')

### Square Root Transformation

In [None]:
# Square root of Age (Age raised to the power 1/2), followed by the
# histogram / Q-Q check. The column name keeps its original spelling.
df['Age_sqaure'] = df['Age'] ** (1 / 2)
plot_data(df, 'Age_sqaure')

### Exponential Transformation

In [None]:
# Age raised to the power 1/1.2, followed by the histogram / Q-Q check
df['Age_exponential'] = df['Age'] ** (1 / 1.2)
plot_data(df, 'Age_exponential')

### Box-Cox Transformation
The Box-Cox transformation is defined as:

$$T(Y) = \frac{Y^{\lambda} - 1}{\lambda} \quad (\lambda \neq 0), \qquad T(Y) = \ln Y \quad (\lambda = 0)$$

where Y is the response variable and λ is the transformation parameter. λ varies from -5 to 5. In the transformation, all values of λ are considered and the optimal value for a given variable is selected.

In [None]:
# stat.boxcox returns two values here: the transformed Age values and the
# transformation parameter (lambda) it selected.
df['Age_Boxcox'], parameters = stat.boxcox(df['Age'])

In [None]:
# Show the second value returned by stat.boxcox for Age (the transformation parameter)
print(parameters)

In [None]:
# Histogram / Q-Q check of the Box-Cox transformed Age
plot_data(df, 'Age_Boxcox')