In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt     # core data-science libraries: numpy, pandas, matplotlib and seaborn
import seaborn as sns

from sklearn.model_selection import train_test_split               # split the data into training and test sets
from sklearn.compose import ColumnTransformer                       # apply transformers to selected columns
from sklearn.impute import SimpleImputer                             # fill in missing values
from sklearn.preprocessing import OneHotEncoder                      # one-hot encoding of categorical columns
from sklearn.preprocessing import MinMaxScaler                        # min-max scaling (this is not standard / z-score scaling)
from sklearn.pipeline import Pipeline,make_pipeline                    # Pipeline chains the steps below; make_pipeline is the variant without step names
from sklearn.feature_selection import SelectKBest,chi2                 # feature selection with the chi-squared score

from sklearn.tree import DecisionTreeClassifier

In [None]:
# Upload the dataset from the local machine when running on Google Colab.
# The google.colab package only exists inside Colab, so the import is guarded:
# elsewhere the upload is skipped and the CSV is expected to already be in the
# working directory.
try:
    from google.colab import files
except ImportError:
    files = None

# `uploaded` stays a dict in both cases (empty when nothing was uploaded).
uploaded = files.upload() if files is not None else {}

In [None]:
# Load the CSV into the `df` DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Titanic-Dataset.csv')
df.head(n=5)

In [None]:
# Remove the columns that are not used as model features.
# Re-assigning instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: running it a second time (when the columns are already gone)
# no longer raises a KeyError.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

In [None]:
# Separate the 'Survived' target from the feature columns, then hold out 20% of
# the rows as a test set (fixed random_state so the split is repeatable).
features = df.drop(columns='Survived')
target = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [None]:
# Preview the first five rows of the training features.
X_train.head(n=5)

In [None]:
# Preview the first five training labels.
y_train.head(n=5)

IMPUTATION

In [None]:
# Imputation step. Columns are addressed by position instead of by name.
# Output column order: ColumnTransformer emits the listed transformers' results
# first, in list order (age, then embarked), and appends the passthrough columns
# after them - so column positions are different for the steps that follow.
trf1 = ColumnTransformer(
    transformers=[
        # position 2 is age; SimpleImputer() uses its default strategy
        ('impute_age', SimpleImputer(), [2]),
        # position 6 is embarked; fill with the most frequent value
        ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6]),
    ],
    remainder='passthrough',
)

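strategy='most_frequent' means the mode: it replaces the missing values with the value that occurs most often in the column (not the median). The age imputer uses SimpleImputer's default strategy, which is the mean.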

ONE HOT ENCODING

In [None]:
# One-hot encoding step for sex and embarked.
#
# The positions refer to the OUTPUT of trf1, not to the original columns. trf1
# emits its imputed columns first (age -> 0, embarked -> 1) and then the
# passthrough columns in their original order, so sex (originally position 1)
# is now at position 3 and embarked (originally position 6) is now at position 1.
# Using the original positions [1, 6] here would encode embarked and the last
# passthrough column and leave sex as an unencoded string.
#
# sparse_output=False returns a dense array (the parameter was called `sparse`
# before scikit-learn 1.2 and the old name was removed in 1.4).
# drop='first' turns one-hot encoding into dummy encoding; for a decision tree
# this is optional because the model is not affected by the redundant column.
trf2 = ColumnTransformer([
    ('ohe_sex_embarked',
     OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first'),
     [3, 1]),
], remainder='passthrough')

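sparse=False: This parameter specifies that the output should be a dense array instead of a sparse matrix. (From scikit-learn 1.2 onward the parameter is named sparse_output; the old name sparse was removed in version 1.4.)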


drop='first': This parameter specifies that for each feature, the first category should be dropped to avoid multicollinearity in the features.

remainder='passthrough': This parameter specifies what to do with columns that are not explicitly transformed. Here, 'passthrough' means that those columns will be passed through without any transformation.

FEATURE SCALING

In [None]:
# Feature scaling step: min-max scale the columns at positions 0 to 9 of the
# incoming array (slice(0, 10)), which includes the one-hot encoded columns.
# No `remainder` is given, so ColumnTransformer's default ('drop') applies:
# any column at position 10 or later would be discarded by this step.
trf3 = ColumnTransformer(
    transformers=[('scale', MinMaxScaler(), slice(0, 10))],
)

Min-max scaling (this is not z-score normalization, which is what StandardScaler does): each feature is rescaled so that its values lie between 0 and 1.

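We are using MinMaxScaler instead of StandardScaler because the next step does feature selection with the chi-squared test, which requires non-negative feature values. Min-max scaling guarantees that, whereas standard scaling produces negative values.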

FEATURE SELECTION

In [None]:
# Feature selection step: keep the 7 columns with the highest chi-squared score
# with respect to the target. The details of the test are covered later.
trf4 = SelectKBest(chi2, k=7)

Basically we are keeping the 7 highest-scoring columns of the data (k=7).

# TRAINING THE MODEL

In [None]:
# Final step: the decision tree model.
# random_state is fixed because the tree permutes features randomly at each split,
# so ties between equally good splits can be broken differently from run to run;
# without a seed the accuracy numbers below are not reproducible.
trf5 = DecisionTreeClassifier(random_state=42)

# Creating the Pipelines

We did all the parts separately; now we are going to join them together:

1.   Imputation
2.   One-Hot Encoding
3.   Feature Scaling
4.   Feature Selection
5.   Model Training (Decision Tree)



In [None]:
from sklearn.pipeline import Pipeline,make_pipeline

# Join all the steps into one pipeline. Each entry is a (step name, estimator)
# tuple; the data flows through the steps in this order and the last step is
# the model. The step names are what named_steps and the grid-search parameter
# prefix (e.g. 'trf5__max_depth') refer to.
pipe = Pipeline([
    ('trf1', trf1),   # imputation
    ('trf2', trf2),   # one-hot encoding
    ('trf3', trf3),   # feature scaling
    ('trf4', trf4),   # feature selection
    ('trf5', trf5),   # decision tree classifier
])

In the above code we are passing a list of tuples.
In each tuple the first element is the step's name and the second is the transformer (or, for the last step, the model); together they create the pipeline object.

In [None]:
# ALTERNATIVE WAY

#pipe=make_pipeline(trf1,trf2,trf3,trf4,trf5)       #we passed just the names of the objects thats it


Pipeline requires naming the steps; make_pipeline does not (it generates the step names automatically).

(same applies to Column Transformer vs make_column_transformer)

In [None]:
# Train the pipeline: every step is fitted in order on the training data and
# the final step (the model) is trained on the transformed features.
pipe.fit(X=X_train, y=y_train)

VISUAL DIAGRAM

In [None]:
# Display the pipeline as an interactive diagram.
from sklearn import set_config
set_config(display='diagram')

# set_config only changes the display setting and returns None, so the pipeline
# has to be the last expression of the cell for the diagram to be rendered.
pipe

# A Few Learning

If the pipeline had only the 3 preprocessing steps (imputation, OHE and scaling) and no algorithm at the end, we would not call fit followed by predict.

We would call the fit_transform function instead.
It does two things in one go: it learns the preprocessing parameters and returns the transformed data, because there is only preprocessing and no model training.
The predict function is only available when the last step of the pipeline is a model.

TYPES OF PIPELINES

WITH ALGORITHM WE USE FIT AND PREDICT

WITHOUT ALGORITHM WE USE FIT AND TRANSFORM

# Pipeline EXPLORATION  

In [None]:
pipe.named_steps       # shows every step of the pipeline, keyed by the step name given when the pipeline was built

In [None]:
# Fitted transformers of the imputation step, as (name, transformer, columns) tuples.
pipe['trf1'].transformers_

This tells us that we imputed age at position 2 and embarked at position 6 and passed the other columns through; it basically describes the fitted transformers.

In [None]:
# First entry of the imputation step: the age part of the imputer.
imputation_step = pipe.named_steps['trf1']
imputation_step.transformers_[0]

In [None]:
# Element 1 of that entry is the fitted SimpleImputer object itself.
age_entry = pipe.named_steps['trf1'].transformers_[0]
age_entry[1]

In [None]:
# statistics_ holds the fill value the age imputer learned during fit.
age_imputer = pipe.named_steps['trf1'].named_transformers_['impute_age']
age_imputer.statistics_

we did it for age we can also do it for the embarked column

In [None]:
# Same inspection for the embarked imputer: the most frequent value it learned.
embarked_imputer = pipe.named_steps['trf1'].named_transformers_['impute_embarked']
embarked_imputer.statistics_

S comes most frequently

# The above helps a lot in Backtracking and Debugging

# We can do experiments with  tranformers and do Post-mortem of the code

we can do better job than kolkata police at it

# Prediction  and Accuracy time!!!

In [None]:
# Run the test features through every preprocessing step and the model.
y_pred = pipe.predict(X=X_test)


In [None]:
# Display the predicted labels for the test set.
y_pred

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test-set predictions that match the true labels.
accuracy_score(y_true=y_test, y_pred=y_pred)

# Cross Validation with pipeline

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the whole pipeline on the training data; the
# displayed value is the mean accuracy over the five folds.
fold_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy')
fold_scores.mean()

In [None]:
# About 60 percent accuracy after cross-validation

# Horrendous result after 2 days of work

# GridSearch using Pipeline

HYPERPARAMETER TUNING - WE WILL LEARN ABOUT THIS IN THE FUTURE

ML TUNING HAS 2 PARTS: TUNING THE FEATURES AND TUNING THE MODEL

For example, the maximum depth of the decision tree is a model hyperparameter.

In [None]:
# Grid-search parameter grid.
# The key is '<step name>__<parameter>': 'trf5' is the name of the model step
# in the pipeline, so this tunes the decision tree's max_depth.
# GridSearchCV tries each of these 6 values and picks the best one.
params = dict(trf5__max_depth=[1, 2, 3, 4, 5, None])


In [None]:
from sklearn.model_selection import GridSearchCV

# Search over `params` with 5-fold cross-validation, scoring each candidate by
# accuracy; the whole pipeline is re-fitted for every candidate and fold.
grid = GridSearchCV(estimator=pipe, param_grid=params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

In [None]:
grid.best_score_          # mean cross-validated accuracy of the best parameter setting found by the grid search

We are getting roughly the same accuracy (about 60%), so the grid search did not help much here.

Learning done .

In [None]:
# Parameter setting that gave the best cross-validated score.
grid.best_params_

#  Exporting Pipelines

In [None]:
import pickle

# Save the whole fitted pipeline (all preprocessing steps plus the model) to one
# file so it can be reused in production code.
# The `with` block guarantees the file is flushed and closed after writing.
with open('pipe.pkl', 'wb') as pkl_file:
    pickle.dump(pipe, pkl_file)

We don't have to bring all the transformers separately; they are all stored inside the pipeline, so we only have to save and load one object.

Makes our work soo much easier

In [None]:
# Load the pipeline back from disk; the `with` block closes the file afterwards.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files that
# you created yourself or that come from a trusted source.
with open('pipe.pkl', 'rb') as pkl_file:
    pipe = pickle.load(pkl_file)

In [None]:
#User input and prediction

# One hand-written passenger, with the values in the same column order as the
# training features; built as a 1 x 7 object array (one row, seven columns).
passenger = [3, 'female', 32.0, 0, 0, 11.5, 'S']
test_input2 = np.array([passenger], dtype=object)

# The loaded pipeline applies all preprocessing steps before predicting.
pipe.predict(test_input2)

Prediction is THE PERSON WILL Not survive


If we change some code in the transformation steps, we don't have to make any changes in the pickle part, because it saves the ENTIRE PIPELINE.

After the pipeline is re-fitted and saved again, the changes are automatically included in the pickled file.