# Table of Contents
1. [Setup](#section1)<br>
    1.1 [Installing Commands](#section11)<br>
    1.2 [Importing Libraries](#section12)<br>
    1.3 [OS package commands](#section13)<br>
    1.4 [Data Acquisition](#section14)<br>
2. [Pre-Processing](#section2)<br>
3. [Modeling](#section3)<br>
4. [Model Evaluation](#section4)<br>
5. [ML Pipeline](#section5)<br>

<a id=section1></a> 
# 1. Setup

<a id=section11></a> 
## Install a package command

In [None]:
# !pip install -q <package_name>
# !pip install -q datascience
# !pip install -q pandas-profiling
# !pip install -q yellowbrick
# -q means quiet install
# !pip install google-colab

<a id=section12></a> 
## Import Packages commands

In [3]:
# Data Analysis packages
import pandas as pd
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', 500) # OR pd.options.display.max_rows = 500
pd.set_option('display.max_columns', 500) # OR pd.options.display.max_columns = 500
pd.options.display.float_format = "{:,.2f}".format # to avoid seeing exponential data in describe output
import pandas_profiling 
import numpy as np


# Data Visualization packages
import matplotlib.pyplot as plt
plt.rc("font", size=14)
%matplotlib inline
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

# Other useful packages
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
import os
from tqdm import tqdm, tqdm_notebook
from subprocess import check_output
from pydotplus.graphviz import graph_from_dot_data
from graphviz import Source

# Sklearn API
from sklearn import datasets
from sklearn.tree import export_graphviz
from sklearn.pipeline import make_pipeline
# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# joblib is installed alongside scikit-learn and should be imported directly.
# The fallback keeps very old environments working.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Sklearn Pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Sklearn Preprocesing
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

#Common Model Helpers
from sklearn import feature_selection
from sklearn import model_selection
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold, cross_validate
from sklearn import metrics


#Importing Regression Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn import svm


#Importing Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
#XGBOOST
from xgboost import XGBClassifier
# import lightgbm as lgb
from xgboost import plot_importance, plot_tree

#Importing Unsupervised Models
from sklearn.cluster import KMeans

# Classification Algo Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
from yellowbrick.classifier import PrecisionRecallCurve

# Regression Algo Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



#Pipeline with Pandas
import pdpipe as pdp

# Stats API
import scipy.stats as stats
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
import statsmodels.formula.api as smf
from scipy.stats import randint as sp_randint


from IPython import get_ipython
ipython = get_ipython()

# If in ipython, load autoreload extension.
# get_ipython() returns None outside an IPython session, so test the object
# itself: the name 'ipython' is always present in globals() after the
# assignment above, which made the previous membership check always true.
if ipython is not None:
    print('\nWelcome to IPython!')
    # run_line_magic(name, args) replaces the deprecated .magic('name args') helper
    ipython.run_line_magic('load_ext', 'autoreload')
    ipython.run_line_magic('autoreload', '2')

# Display all cell outputs in notebook (not only the last expression)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'


print('Your favorite libraries have been loaded.')


Welcome to IPython!
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Your favorite libraries have been loaded.


<a id=section13></a> 
## OS package commands

In [10]:
print(os.listdir("/Users/rajkgupta/DATASCIENCE"))

['Statistics', 'ExpediaGroupDataScienceAcademy', 'gitrepos', 'UdemyCourses', '.DS_Store', 'PythonandS3', 'Tableau', 'NaturalLanguageProcessing', 'Computer Vision', 'Python', 'Data Visualization', 'R', 'Fee Receipt', 'MachineLearning', 'Numpy', 'EDA Project', 'PythonandStatistics', 'PyCharmProjects', 'INSAIDGCDProgramSyllabus', 'YoutubeVideos', 'AboutDataScience', 'Useful EDA commands.ipynb', 'WebScrappingScript', 'PythonForFinance', 'PythonDSPresentations', '.ipynb_checkpoints', 'PythonPractice', 'Deep Learning', 'Assignments - Numpy and Pandas', 'ODSCMeetup', 'Data', 'PythonandSparkforBigData', 'Pandas', 'AnalyticsLab', 'Career Guide']


In [None]:
DATA_PATH = os.path.join(os.getcwd(), 'data', 'titanic')

<a id=section14></a> 
## Read the dataset 

In [None]:
data = pd.read_csv('https://raw.githubusercontent.com/insaid2018/Term-2/master/CaseStudy/Advertising.csv', index_col=0)


In [None]:
data = pd.read_csv(filepath_or_buffer='https://storage.googleapis.com/industryanalytics/trans_fraud_data.csv')


## Reading from Google Colab 

In [None]:
from google.colab import drive
drive.mount('/content/drive')
diabetes = pd.read_csv('/content/drive/My Drive/TensorFlow/diabetes.csv')

<a id=section2></a> 
# 2. Pre-Processing

In [None]:
df.head()
df.describe()
df.info()
df.shape
display(df.head())
df['column_name'].mode()
df = df.drop(['column_name'], axis = 1)
df.drop(['column_name'], axis = 1, inplace=True)
median_column = df.column_name.median()
df.column_name = df.column_name.fillna(df['column_name'].mode()[0])
df['Age'].fillna(df['Age'].dropna().median(), inplace=True)
df.column_name = df.apply(lambda x: 'child' if x['Age'] < 15 else x['Sex'],axis=1)
df = pd.get_dummies(df, columns=['col1','col2'], drop_first=True)
df = pd.concat([df1, df2],axis=1)
df['Title'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5} 
df['Title'] = df['Title'].map(title_mapping)
data['Title'] = data['Title'].replace(['Lady', 'Countess','Capt', 'Col'], 'Rare')
# Categorizing Numerical Value 
data['FareBand'] = pd.qcut(data['Fare'], 4).astype(str)
dataDump  = df.copy()
df.skew()

# Combining the categories inside single column
def combine(x):
    """Collapse related zoning categories into the single label 'FRC'.

    Returns 'FRC' when ``x`` is 'FV' or 'RH', or when ``x`` contains the
    character 'C'; any other value is returned unchanged.
    """
    merged = x in ('FV', 'RH') or 'C' in x
    return 'FRC' if merged else x
prices['MSZoning'] = prices['MSZoning'].apply(lambda x : combine(x))

In [11]:
# Create Data audit Report for continuous variables
def continuous_var_summary(x):
    """Build a data-audit summary for one continuous variable.

    Parameters
    ----------
    x : pandas.Series
        Numeric column to summarise.

    Returns
    -------
    pandas.Series
        Count, missing count, sum, mean, median, standard deviation, variance,
        minimum, the 1/5/10/25/50/75/90/95/99th percentiles and the maximum,
        labelled 'N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN',
        'P1' ... 'P99', 'MAX' in that order.
    """
    percentile_levels = (0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99)
    percentile_labels = ['P1', 'P5', 'P10', 'P25', 'P50', 'P75', 'P90', 'P95', 'P99']

    labels = ['N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN']
    values = [x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(),
              x.std(), x.var(), x.min()]

    # One scalar quantile call per level, appended in ascending order
    for level in percentile_levels:
        values.append(x.quantile(level))
    labels.extend(percentile_labels)

    values.append(x.max())
    labels.append('MAX')

    return pd.Series(values, index = labels)

In [16]:
# Create Data audit Report for categorical variables
def categorical_var_summary(x):
    """Build a data-audit summary for one categorical variable.

    Parameters
    ----------
    x : pandas.Series
        Categorical column to summarise.

    Returns
    -------
    pandas.Series
        Non-null count ('N'), missing count ('NMISS'), the most frequent value
        ('MODE'), its frequency ('FREQ') and that frequency as a percentage of
        the non-null count rounded to two decimals ('PERCENT').
    """
    non_null = x.count()
    # Single-row frame: column 0 is the most frequent value, column 1 its count
    top = x.value_counts().sort_values(ascending = False).iloc[0:1].reset_index()
    top_value = top.iloc[0, 0]
    top_freq = top.iloc[0, 1]
    share = round(top_freq * 100/non_null, 2)
    return pd.Series([non_null, x.isnull().sum(), top_value, top_freq, share],
                     index = ['N', 'NMISS', 'MODE', 'FREQ', 'PERCENT'])

In [15]:
# Missing value imputation for categorical and continuous variables
def missing_imputation(x, stats = 'mean'):
    """Fill missing values in a single column.

    Parameters
    ----------
    x : pandas.Series
        Column to impute.
    stats : str, default 'mean'
        Statistic used for float64/int64 columns: 'mean' fills with the mean,
        any other value fills with the median.

    Returns
    -------
    pandas.Series
        A copy of ``x`` with missing values replaced. float64/int64 columns
        use the mean or median; every other dtype uses the most frequent
        value. A non-numeric column with no observed values is returned
        unchanged because it has no mode.
    """
    if (x.dtypes == 'float64') or (x.dtypes == 'int64'):
        fill_value = x.mean() if stats == 'mean' else x.median()
        return x.fillna(fill_value)

    modes = x.mode()
    if modes.empty:
        # every value is missing, so there is nothing to impute with
        return x
    # Series.mode() returns a Series; passing it to fillna directly aligns on
    # index labels and leaves most gaps unfilled, so use the first mode as a scalar.
    return x.fillna(modes.iloc[0])

In [14]:
# A utility function to create dummy variables
def create_dummies(df, colname):
    """Replace a categorical column with its dummy (one-hot) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    colname : str
        Column to encode. Dummy columns are prefixed with this name and the
        first level is dropped.

    Returns
    -------
    pandas.DataFrame
        New frame with the dummy columns appended and ``colname`` removed.
    """
    encoded = pd.get_dummies(df[colname], prefix = colname, drop_first = True)
    widened = pd.concat([df, encoded], axis = 1)
    return widened.drop(colname, axis = 1)

### Splitting Pandas column into multiple columns using Get Dummies 

In [None]:
df[col_name].str.get_dummies(sep=';')

In [None]:
df[col_name].str.split(";",expand=True)

### Kaggle Combine and Separate Train and Test

In [None]:
# Name of the label column (present in df_train only). Defined once so the
# extraction and the drop below always refer to the same column.
target_col = 'Target'

# Keep the identifiers aside; they are needed again for the submission file
test_Id = df_test.Id
train_Id = df_train.Id
df_train.drop(['Id'], axis=1, inplace=True)
df_test.drop(['Id'], axis=1, inplace=True)

train_index = df_train.shape[0]
test_index = df_test.shape[0]
train_Target = df_train[target_col]

# Stack train and test so that preprocessing is applied to both consistently
all_data = pd.concat((df_train, df_test)).reset_index(drop=True)
all_data.drop([target_col], axis=1, inplace=True)
print("all_data size is : {}".format(all_data.shape))


# Split back by position; .copy() gives independent frames so later column
# assignments do not raise SettingWithCopyWarning.
df_train = all_data[:train_index].copy()
df_test = all_data[train_index:].copy()



In [None]:
# submissions
sub = pd.DataFrame()
sub['Id'] = test_Id
sub['SalePrice'] = test_SalePrice
sub.to_csv('submission.csv',index=False)


## Separate categorical and numerical features

In [None]:
# Separate categorical and continuous variables
df_cont = df.loc[:, (df.dtypes == 'float64') | (df.dtypes == 'int64')]
df_cat = df.loc[:, (df.dtypes == 'object')]

# Simpler way of doing the same thing:
df_cont = df.select_dtypes(include = ['float64', 'int64'])
df_cat = df.select_dtypes(include = ['object'])

In [None]:
cars_conti_vars.apply(continuous_var_summary).T.round(1)
cars_cat_vars.apply(categorical_var_summary).T

## Change the data types 

In [None]:
df[col] = df[col].astype('category')
df[col] = df[col].astype('int64')
df.dtypes.unique()

1. category
2. object
3. int64
4. float64

## Missing Values Imputation

In [None]:
null_frame = pd.DataFrame(data.isnull().sum(), columns = ['Frequency'])
null_frame.transpose()

In [None]:
df.isnull().sum()

In [None]:
sns.heatmap(df.isnull(),cbar=False,yticklabels=False,cmap = 'viridis')

In [None]:
rownum = 59381.0
desc = df.describe().T
desc[desc['count']!=rownum]

In [None]:
cars_conti_vars = cars_conti_vars.apply(missing_imputation)
cars_cat_vars = cars_cat_vars.apply(missing_imputation)

In [None]:
dataset_na = (insurance_df.isnull().sum() / len(insurance_df)) * 100
dataset_na = dataset_na.drop(dataset_na[dataset_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :dataset_na})
missing_data.head(20)

In [None]:
# Drop every column whose percentage of missing values exceeds the threshold
missing_threshold = 90
prices_missing_threshold = (prices.isnull().sum() / len(prices)) * 100
# Compare against missing_threshold (the previously referenced name
# `thrshold_na` was never defined) and drop all offending columns in one call.
cols_to_drop = prices_missing_threshold[prices_missing_threshold > missing_threshold].index
prices.drop(columns=cols_to_drop, inplace=True)

In [None]:
null_frame = pd.DataFrame(index = data.columns.values)
null_frame['Null Frequency'] = data.isnull().sum().values
percent = data.isnull().sum().values/data.shape[0]
null_frame['Missing %age'] = np.round(percent, decimals = 4) * 100
null_frame.transpose()

https://scikit-learn.org/stable/modules/classes.html#module-sklearn.impute
https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

https://github.com/awslabs/datawig

Pandas.fillna

Sklearn.impute

AWS Datawig

# Encoding

https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/#backward

https://contrib.scikit-learn.org/categorical-encoding/

* Backward Difference Coding
* hashing
* leave one out
* one hot
* ordinal
* polynomial coding
* target encoder

## Identification & Handling of Duplicate Rows

In [None]:
print('Contains Duplicate Rows?', data.duplicated().any())

## Feature Engineering 

### Feature Selection 

#### Correlation based 

In [None]:
plt.figure(figsize=(6,4))
sns.heatmap(df.corr(),cmap='Blues',annot=False) 

In [None]:
def get_high_corr_features(df, threshold):
    """Find pairs of columns whose absolute correlation exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated with ``DataFrame.corr()`` (Pearson
        by default).
    threshold : float
        Pairs with ``abs(correlation) > threshold`` are reported.

    Returns
    -------
    set of tuple
        ``(later_column, earlier_column)`` name pairs taken from the lower
        triangle of the correlation matrix, so each pair appears once.
    """
    corr_matrix = df.corr()
    names = corr_matrix.columns
    # Only walk the strictly-lower triangle: col < row
    return {
        (names[row], names[col])
        for row in range(len(names))
        for col in range(row)
        if abs(corr_matrix.iloc[row, col]) > threshold
    }

### Feature Reduction/Decomposition

### Feature Addition 

## Outlier 

In [None]:
# Draw one vertical box plot per column of df, laid out on a grid
l = df.columns.values
number_of_columns=12
# Number of grid rows needed for len(l) plots (ceiling division, at least 1).
# The previous expression `len(l)-1/number_of_columns` divided before
# subtracting and produced a float, which plt.subplot rejects.
number_of_rows = max(1, -(-len(l) // number_of_columns))
sns.set_style('whitegrid')
plt.figure(figsize=(number_of_columns,5*number_of_rows))
for i in range(0,len(l)):
    ax = plt.subplot(number_of_rows,number_of_columns,i+1)
    # y= draws the box vertically
    sns.boxplot(y=df[l[i]],color='green',ax=ax)
# Adjust spacing once, after every subplot exists
plt.tight_layout()

In [None]:
outliers=[]
def detect_outlier(data_1, threshold=3):
    """Return the values of ``data_1`` whose absolute z-score exceeds ``threshold``.

    Parameters
    ----------
    data_1 : iterable of numbers
        Sample to scan (list, NumPy array or pandas Series).
    threshold : float, default 3
        Z-score cut-off; a value is reported when ``abs(z) > threshold``.

    Returns
    -------
    list
        The outlying values in their original order. The list is created
        fresh on every call, so results from earlier calls no longer
        accumulate in the module-level ``outliers`` list.
    """
    mean_1 = np.mean(data_1)
    std_1 = np.std(data_1)

    # With zero spread no value can deviate from the mean; this also avoids a
    # division by zero below.
    if std_1 == 0:
        return []

    return [y for y in data_1 if np.abs((y - mean_1) / std_1) > threshold]

In [None]:
cars_conti_vars = cars_conti_vars.apply(lambda x: x.clip(lower = x.dropna().quantile(0.01), 
                                                         upper = x.quantile(0.99)))

## Check individual Column distribution (Plotly Express) 

### Heatmap 

In [None]:
sns.heatmap( data.corr(), annot=True )

In [None]:
#Quality correlation matrix
k = 12 #number of variables for heatmap
cols = df.corr().nlargest(k, 'quality')['quality'].index
cm = df[cols].corr()
plt.figure(figsize=(10,6))
sns.heatmap(cm, annot=True, cmap = 'viridis')

In [None]:
corr = titanic_data.corr()
plt.figure(figsize=(10,10))
sns.heatmap(corr,vmax=.8,linewidth=.01, square = True, annot = True,cmap='YlGnBu',linecolor ='black')
plt.title('Correlation between features')

In [None]:
figure = plt.figure(figsize = [12, 8])
sns.heatmap(data_transfer.corr(), annot = True, cmap = 'YlGnBu')
plt.title('Correlation between Features', size = 16)
plt.show()

### Pairplot 

In [None]:
# `height` is the current name of the per-facet size argument (formerly `size`)
sns.pairplot(data, height = 2, aspect = 1.5)
sns.pairplot(glass, hue="Type")

In [None]:
# `height` is the current name of the per-facet size argument (formerly `size`)
sns.pairplot(data, x_vars=['TV', 'radio', 'newspaper'], y_vars='sales', height=5, aspect=1, kind='reg')

In [None]:
sns.pairplot(titanic_data[["Fare","Age","Pclass","Survived"]],vars = ["Fare","Age","Pclass"],hue="Survived", dropna=True,markers=["o", "s"])
plt.title('Pair Plot')

In [None]:
import seaborn as sns; sns.set(style="ticks", color_codes=True)
iris = sns.load_dataset("iris")
g = sns.pairplot(iris,hue='species')

### Box plot 

In [None]:
age_labels = review_data_k_means[['labels','Age']]
age_labels.boxplot(by='labels',figsize=(20,10))
plt.xticks(rotation=90)
plt.show()

### Scatter Plot 

In [None]:
# `height` is the current name of the per-facet size argument (formerly `size`)
sns.FacetGrid(glass, hue="Type", height=5) \
   .map(plt.scatter, "Fe", "Ba") \
   .add_legend()

### Distribution Plot 

In [None]:
f, axes = plt.subplots(2, 2, figsize=(7, 7), sharex=True)                                      # Set up the matplotlib figure
sns.despine(left=True)
sns.distplot(data.sales, color="b", ax=axes[0, 0])
sns.distplot(data.TV, color="r", ax=axes[0, 1])
sns.distplot(data.radio, color="g", ax=axes[1, 0])
sns.distplot(data.newspaper, color="m", ax=axes[1, 1])

In [None]:
sns.set_style('darkgrid')
plt.title('Distribution of Each Column in the Data')
for i,col in enumerate(review_data_k_means.columns):
    plt.figure(i)
    sns.distplot(review_data_k_means[col])

### Jointplot 

In [None]:
JG1 = sns.jointplot("newspaper", "sales", data=data, kind='reg')
JG2 = sns.jointplot("radio", "sales", data=data, kind='reg')
JG3 = sns.jointplot("TV", "sales", data=data, kind='reg')
#subplots migration
f = plt.figure()
for J in [JG1, JG2,JG3]:
    for A in J.fig.axes:
        f._axstack.add(f._make_key(A), A)

### Skewness distribution

In [None]:
plt.figure(figsize=(2*number_of_columns,5*number_of_rows))
for i in range(0,len(l)):
    plt.subplot(number_of_rows + 1,number_of_columns,i+1)
    sns.distplot(df[l[i]],kde=True) 

### Histogram 

In [None]:
df['col_name'].hist()

### Plotting Andrews Curve

In [None]:
# andrews_curves ships with pandas itself (pandas.plotting), so no extra
# package has to be installed.
from pandas.plotting import andrews_curves
andrews_curves(glass[glass['Type'].isin([2,3,4,5,6])], "Type")

## Transformation of Columns 

In [None]:
df['col_name']=np.log(df['col_name'])

## Scaling of Columns 

In [None]:
# Standardise every column to zero mean and unit variance
scaler = StandardScaler().fit(data)
data1 = scaler.transform(data)
# transform() returns a bare NumPy array; re-attach the original column names
# and index so the scaled frame stays readable and aligned with other frames.
data = pd.DataFrame(data1, columns=data.columns, index=data.index)

In [None]:
min_max_scaler = preprocessing.MinMaxScaler()
sent_scaled = min_max_scaler.fit_transform(review_data_k_means[['Sentiment']].values)
#df_normalized = pd.DataFrame(x_scaled)
sent_scaled
review_data_k_means['Sentiment_Norm'] = sent_scaled

## Handle the Imbalance in Classification Problem

In [None]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 33)
# fit_resample is the current name of the resampling method (formerly
# fit_sample); np.ravel flattens y whether it is a Series or an array.
X_train_new, y_train_new = sm.fit_resample(X_train, np.ravel(y_train))
# Class counts after oversampling; the bars should now be of equal height
pd.Series(y_train_new).value_counts().plot.bar()

## Progress Apply for showing progress bar

In [None]:
tqdm_notebook().pandas()
movies.progress_apply(lambda x: calculcateNewRating(x['Genre'],x['Rating']),axis=1)

## Pandas Profiling

In [2]:
db = pd.read_csv('/Users/rajkgupta/Downloads/diabetes.csv')
report = pandas_profiling.ProfileReport(db)
report.to_file(output_file = '/Users/rajkgupta/Downloads/pre_profilereport_diabetes.html')

HBox(children=(FloatProgress(value=0.0, description='variables', max=9.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='correlations', max=6.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='interactions [continuous]', max=64.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=0.0, description='table', max=1.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='missing', max=2.0, style=ProgressStyle(description_width=…









HBox(children=(FloatProgress(value=0.0, description='package', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='build report structure', max=1.0, style=ProgressStyle(des…




<a id=section3></a> 
# 3. Modeling

### Imbalanced Classification Handling 

One approach is to oversample the minority class using SMOTE (Synthetic Minority Oversampling Technique), which is available in the Python library imbalanced-learn.

### Train Test Split 

In [None]:
X = df.loc[:,df.columns != 'target_name']
y = df['target_name']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

### Model Initialization

In [None]:
linreg = LinearRegression()
logreg = LogisticRegression()
dt = DecisionTreeClassifier(random_state = 0,criterion="entropy")
rf = RandomForestClassifier(random_state = 0)
knn = KNeighborsClassifier(n_neighbors=13, p=2, metric='minkowski',n_jobs=-1) # Euclidean  # 3, 7, 13, 20 
regressor = KNeighborsRegressor(n_neighbors=15)


### Plotting model of decision tree

In [None]:
graph = Source(export_graphviz(dt, out_file=None,filled=True, 
                                rounded=True,  
                                special_characters=True, feature_names=X.columns))
graph.format = 'png'
graph.render('dtree_render',view=True)

### Hyper-parameter tuning 

In [None]:
# Define the parameter grid; it must be a dictionary (or a list of dictionaries)
parameters = {
                'copy_X':[True, False],
                'fit_intercept':[True,False]
             }
linreg = LinearRegression() # this is the model to experiment on
# `normalize` only exists on older scikit-learn releases; add it to the grid
# only when the installed LinearRegression still accepts it.
if 'normalize' in linreg.get_params():
    parameters['normalize'] = [True, False]
# Call the GridSearch class, passing the model and the parameter grid
linreg = GridSearchCV(linreg,parameters)
# The search must be fitted before best_estimator_ exists
linreg.fit(X_train, y_train)
linreg.best_estimator_

In [None]:
grid={
        "C":np.logspace(-3,3,7),
        "penalty":["l1","l2"], # l1 lasso l2 ridge
        # The default lbfgs solver does not support the l1 penalty, so those
        # candidates would fail to fit; liblinear supports both l1 and l2.
        "solver":["liblinear"],
        "tol":[0.01,0.001,0.0001]
      }
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(X_train,y_train)
print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

In [None]:
tree_para = [
            {
                'criterion':['gini','entropy'],
                'max_depth': range(2,60),
                'max_features': ['sqrt', 'log2', None]
            }
            ]
# refit='AUC' only selects by AUC when a scorer with that name is supplied;
# without `scoring` the search ranks candidates by the estimator's default score.
# NOTE(review): 'roc_auc' assumes a binary target; use e.g. 'roc_auc_ovr' for
# multiclass problems.
grid_search = GridSearchCV(dt,tree_para, cv=10, scoring={'AUC': 'roc_auc'}, refit='AUC')
grid_search.fit(X_train, y_train)
grid_search.best_estimator_

In [None]:
param_dist = {"max_depth": range(2,20),
              "min_samples_split": sp_randint(5, 25),
              "min_samples_leaf": sp_randint(5, 20),
              "bootstrap": [True, False],
              "n_estimators": [20,50,100, 400, 700, 1000, 1500],
              "criterion" : ["gini", "entropy"],
              'max_features': ['sqrt', 'log2', None]
             }
# run randomized search
n_iter_search = 100
random_search = RandomizedSearchCV(rf, param_distributions = param_dist,
                                   n_iter = n_iter_search,
                                   n_jobs = -1)

random_search.fit(X_train, y_train)
random_search.best_estimator_

### Model Fitting and Predictions 

In [None]:
linreg.fit(X_train, y_train)
# Predictions on both splits; the evaluation cells compare y_pred_train with
# y_train and y_pred_test with y_test.
y_pred_train = linreg.predict(X_train)
y_pred_test = linreg.predict(X_test)

In [None]:
y_pred_proba = logreg.predict_proba(X_train)[0]

### K means Elbow Method 

In [None]:
import pylab as pl
number_of_clusters = range(5,40)
kmeans = [KMeans(n_clusters=i,max_iter=1000,random_state=42) for i in number_of_clusters]
score = [-1*kmeans[i].fit(review_data_std).score(review_data_std) for i in range(len(kmeans))]
pl.plot((number_of_clusters),score)
pl.xlabel('Number of Clusters')
pl.ylabel('Score')
pl.title('Elbow Curve')
pl.show()

In [None]:
k_means_test = KMeans(n_clusters=6,max_iter=5000,random_state=42)
k_means_test.fit(review_data_std).score(review_data_std)
k_means_test.labels_

### Get Sentiment 

In [None]:
def getSentiment(eachRow):
  """Sum the lexicon scores of the whitespace-separated words in ``eachRow``.

  Every word is lower-cased and looked up in the module-level mapping ``d``:
  a word found there contributes ``int(d[word])``, any other word contributes 0.
  The total is computed with ``np.sum``.
  """
  lowered_words = (word.lower() for word in eachRow.split())
  word_scores = [int(d[token]) if token in d else 0 for token in lowered_words]
  return np.sum(word_scores)


review_data_k_means['Sentiment'] = review_data_k_means['Review Text'].apply(getSentiment)

In [None]:
from textblob import TextBlob

def getSentimentTextBlob(eachRow):
  """Return the TextBlob sentiment polarity computed for the text ``eachRow``."""
  blob = TextBlob(eachRow)
  return blob.sentiment.polarity

review_data_k_means['Sentiment_textblob'] = review_data_k_means['Review Text'].apply(getSentimentTextBlob)

<a id=section4></a> 
# 4. Model Evaluation

## MAE 

In [None]:
MAE_train = metrics.mean_absolute_error(y_train, y_pred_train)
MAE_test = metrics.mean_absolute_error(y_test, y_pred_test)
print('MAE for training set is {}'.format(MAE_train))
print('MAE for test set is {}'.format(MAE_test))

## MSE 

In [None]:
MSE_train = metrics.mean_squared_error(y_train, y_pred_train)
MSE_test = metrics.mean_squared_error(y_test, y_pred_test)
print('MSE for training set is {}'.format(MSE_train))
print('MSE for test set is {}'.format(MSE_test))

## RMSE 

In [None]:
RMSE_train = np.sqrt( metrics.mean_squared_error(y_train, y_pred_train))
RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))
print('RMSE for training set is {}'.format(RMSE_train))
print('RMSE for test set is {}'.format(RMSE_test))

In [None]:
print("Mean cross-validated score of the best_estimator : ", linreg.best_score_)  # In case of Grid Search


## Model Coefficients 

In [None]:
print('Intercept:',linreg.intercept_)                                           # print the intercept 
print('Coefficients:',linreg.coef_)

In [None]:
feature_cols.insert(0,'Intercept')
coef = linreg.coef_.tolist()
coef.insert(0, linreg.intercept_)
eq1 = zip(feature_cols, coef)

for c1,c2 in eq1:
    print(c1,c2)

## R-Square 

In [None]:
yhat = linreg.predict(X_train)
SS_Residual = sum((y_train-yhat)**2)
SS_Total = sum((y_train-np.mean(y_train))**2)
r_squared = 1 - (float(SS_Residual))/SS_Total
adjusted_r_squared = 1 - (1-r_squared)*(len(y_train)-1)/(len(y_train)-X_train.shape[1]-1)
print(r_squared, adjusted_r_squared)

## Accuracy Score 

In [None]:
print('Accuracy score for test data is:', accuracy_score(y_test,y_pred_test))

# Using predict_proba
preds1 = np.where(logreg.predict_proba(X_test)[:,1]> 0.75,1,0)
print('Accuracy score for test data is:', accuracy_score(y_test,preds1))
preds2 = np.where(logreg.predict_proba(X_test)[:,1]> 0.25,1,0)
print('Accuracy score for test data is:', accuracy_score(y_test,preds2))

## Confusion Matrix 

In [None]:
# Store the labelled table under its own name: assigning it to
# `confusion_matrix` would shadow the imported sklearn function and make this
# cell fail when it is run a second time.
confusion_df = pd.DataFrame(confusion_matrix(y_test, y_pred_test))
confusion_df.index = ['Actual Died','Actual Survived']
confusion_df.columns = ['Predicted Died','Predicted Survived']
print(confusion_df)

## Precision/Recall/F1 Score 

In [None]:
print('Precision score for test data using model1 is:', precision_score(y_test,prediction1))
print('Recall score for test data using model1 is:',recall_score(y_test,prediction1)) 
print('F1_score for test data using model1 is:',f1_score(y_test, prediction1))

## Classification Report 

In [None]:
random_forest_report = classification_report(y_test, y_pred)
print(random_forest_report)

## ROC-AUC Curve 

In [None]:

probs = model1.predict_proba(X_test)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

## Precision-Recall Curve 

In [None]:
def PRCurve(model):
  '''
  Plot a precision-recall curve for ``model`` using yellowbrick.

  The model is wrapped in a PrecisionRecallCurve visualizer, fitted on the
  notebook-level X_train / y_train and scored on X_test / y_test. The value
  returned by ``score`` is shown in the legend as "AP=...". Axis labels and
  the title are set on the current pyplot axes. Nothing is returned.

  model: estimator passed straight to PrecisionRecallCurve.
  '''
  viz = PrecisionRecallCurve(model)
  # Fit on the training split (globals defined elsewhere in the notebook)
  viz.fit(X_train, y_train)
  # Score on the held-out split; the result is used in the legend label below
  avg_prec = viz.score(X_test, y_test)
  plt.legend(labels = ['Binary PR Curve',"AP=%.3f"%avg_prec], loc = 'lower right', prop={'size': 14})
  plt.xlabel(xlabel = 'Recall', size = 14)
  plt.ylabel(ylabel = 'Precision', size = 14)
  plt.title(label = 'Precision Recall Curve', size = 16)

<a id=section5></a> 
# 5. ML Pipeline Code

In [None]:
# The estimators below are referenced through their sklearn submodules, which
# the notebook's import cell does not bring into scope (only `svm` is imported
# there), so import them here.
from sklearn import (discriminant_analysis, ensemble, gaussian_process,
                     linear_model, naive_bayes, neighbors, tree)

Target = ['Survived']
data1_x_bin = pd.get_dummies(df2)
# NOTE(review): features come from `df2` while the target comes from `data`;
# this assumes both frames hold the same rows in the same order - confirm.
# 1-D target vector: estimators expect shape (n_samples,), not a column frame.
y_target = data[Target].values.ravel()

# X = data1_x_bin
# y = Target
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),

    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    #Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),

    #SVM
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),

    #Trees
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),

    #Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),


    #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    XGBClassifier()
    ]

#split dataset in cross-validation with this splitter class: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html#sklearn.model_selection.ShuffleSplit
#note: this is an alternative to train_test_split
cv_split = model_selection.ShuffleSplit(n_splits = 5, test_size = .3, train_size = .7, random_state = 0 ) # run each model 5x with a 70/30 train/test split

#create table to compare MLA metrics
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time', 'TrainTestDifference']
MLA_compare = pd.DataFrame(columns = MLA_columns)

#create table to compare MLA predictions
# .copy() so the prediction columns added in the loop are written to an
# independent frame rather than to a slice of `data`
MLA_predict = data[Target].copy()  # Y

#index through MLA and save performance to table
row_index = 0
Feature_Importance = {}

for alg in MLA:

    #set name and parameters
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())


    #score model with cross validation: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
    # note: scoring='precision', so the "Accuracy" columns below hold precision scores
    cv_results = model_selection.cross_validate(alg, data1_x_bin, y_target, cv  = cv_split,return_train_score=True,scoring='precision')

    # cv_results is a dictionary -> the results of each model's folds are stored in it
    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()
    #if this is a non-bias random sample, then +/-3 standard deviations (std) from the mean, should statistically capture 99.7% of the subsets
    MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std()*3   #let's know the worst that can happen!
    #MLA_compare.loc[row_index, 'TrainTestDifference'] = cv_results['train_score'].mean() - cv_results['test_score'].mean()

    #save MLA predictions - see section 6 for usage
    alg.fit(data1_x_bin, y_target)

    # Only some estimators expose feature_importances_; skip the rest
    try:
        Feature_Importance[MLA_name] = alg.feature_importances_
    except AttributeError:
        pass

    MLA_predict[MLA_name] = alg.predict(data1_x_bin)

    row_index+=1


#print and sort table: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False, inplace = True)
MLA_compare
# Test minus train score in percentage points (negative values indicate over-fitting)
MLA_compare['Difference'] = (MLA_compare['MLA Test Accuracy Mean']-MLA_compare['MLA Train Accuracy Mean'])*100
MLA_compare

#MLA_predict

In [None]:
MLA_compare.sort_values(by="Difference",ascending=False)