In [1]:
import pandas as pd
# Read the raw passenger table from the file named 'data' and preview its first rows.
df = pd.read_csv('data')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
df.shape  # (rows, columns) of the loaded table

(891, 12)

In [3]:
df.columns  # list the column labels

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
df.isna().sum()  # number of missing values in each column

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# Keep only the target and the three features used below, and drop the rows
# whose Embarked value is missing.
keep_cols = ['Survived', 'Pclass', 'Sex', 'Embarked']
has_port = df['Embarked'].notna()
df = df.loc[has_port, keep_cols]
df.shape


(889, 4)

In [8]:
#Two rows were lost by dropping the missing Embarked values (891 -> 889).
#Check for null values in the remaining features.
df.isna().sum()


Survived    0
Pclass      0
Sex         0
Embarked    0
dtype: int64

In [9]:
df.head()  # preview the reduced table

Unnamed: 0,Survived,Pclass,Sex,Embarked
0,0,3,male,S
1,1,1,female,C
2,1,3,female,S
3,1,1,female,S
4,0,3,male,S


In [10]:
#Pclass, Sex and Embarked are all categorical variables,
#but in this notebook only Sex and Embarked will be looked at as such.
#Start with a baseline: cross-validate a model that predicts Survived from Pclass alone,
#and then move on to a pipeline.
X = df[['Pclass']]
y = df['Survived']

In [11]:
X.shape  # a single feature column

(889, 1)

In [12]:
y.shape  # one target value per row

(889,)

In [13]:
#Since this is Classification Problem, we will use Logistic Regression
from sklearn.linear_model import LogisticRegression

In [14]:
# The solver is passed explicitly as 'lbfgs'. Author's advice: prefer this solver,
# although it can sometimes raise errors.
logreg = LogisticRegression(solver = 'lbfgs')


In [15]:
#lets evaluate our model
from sklearn.model_selection import cross_val_score

In [16]:
#Cross-validate the logistic regression model using just one feature (Pclass).
cross_val_score(logreg, X, y, cv = 5, scoring = 'accuracy').mean()
#Accuracy is about 68% (0.678).
#This is the mean over the 5 cross-validation folds.

0.6783406335301212

In [17]:
y.value_counts(normalize = True)#compare with the null accuracy, which is about 62% (0.6175)
#Null accuracy is the accuracy obtained by always predicting the most frequent class.
#In most cases you do not have to do this check.

0    0.617548
1    0.382452
Name: Survived, dtype: float64

In [18]:
#So far a cross-validated baseline model has been built.

#Next, add more features to the model and cross-validate it again.
#A pipeline makes it possible to add more features,
#but before that the Sex and Embarked columns have to be encoded.
#We will use dummy encoding, also known as one-hot encoding:
#scikit-learn calls it one-hot encoding,
#pandas calls it dummy encoding,
#and they are the same thing.
#Here it is done in scikit-learn.

from sklearn.preprocessing import OneHotEncoder

#Instantiate OneHotEncoder to get an encoder object.
ohe = OneHotEncoder(sparse = False)#sparse=False gives a dense array that is readable here (teaching only); the author advises against it in real work
#NOTE(review): newer scikit-learn releases rename this argument to sparse_output - confirm against the installed version



In [19]:
ohe.fit_transform(df[['Sex']])  # fit the encoder on the Sex column and return its one-hot encoding


array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [20]:
#The first output column represents female, the second represents male.
ohe.categories_
#Therefore the first three rows of the encoded array are:
#male
#female
#female
#This can be confirmed against the original dataset.
#The fit_transform cell and this categories_ check are for teaching purposes only.
#The same can be done for the other columns.

#I first learnt dummy encoding with pandas; doing it in scikit-learn used to be painful,
#but now it is easier to do in scikit-learn.



[array(['female', 'male'], dtype=object)]

In [21]:
#Earlier approach: the original column would be removed and X would be given the dummy columns by hand;
#from here on a pipeline is used instead.

#Redefine X with three features
#(the previous X held a single feature): every column of df except the target.
X = df.drop(columns = 'Survived')

In [22]:
X.head()  # preview the three feature columns

Unnamed: 0,Pclass,Sex,Embarked
0,3,male,S
1,1,female,C
2,3,female,S
3,1,female,S
4,3,male,S


In [23]:
#now we do column transform
from sklearn.compose import make_column_transformer


In [24]:
#A column transformer is used when only some of the features need preprocessing.
#Dummy (one-hot) encoding is such a preprocessing step:
#it should be applied only to Sex and Embarked, not to Pclass,
#because Pclass is treated as a numeric column here.

#Hence the column transformer:
column_trans = make_column_transformer(
    (OneHotEncoder(), ['Sex', 'Embarked']), #one-hot encode just these two columns
    remainder = 'passthrough') #every remaining column (here only Pclass) is passed through unchanged



In [25]:
column_trans.fit_transform(X)  # apply the column transformer to X to inspect the encoded matrix

array([[0., 1., 0., 0., 1., 3.],
       [1., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 3.],
       ...,
       [1., 0., 0., 0., 1., 3.],
       [0., 1., 1., 0., 0., 1.],
       [0., 1., 0., 1., 0., 3.]])

In [None]:
#The first two columns are the one-hot encoding of Sex (female, male),
#the next three columns are the one-hot encoding of Embarked (0, 0, 1 in the first row),
#and the final column is Pclass, passed through unchanged.


In [26]:
#we are finally at our pipeline step
from sklearn.pipeline import make_pipeline

In [27]:
#Remember: a pipeline chains steps together - here the column transformer followed by the logistic regression.
pipe = make_pipeline(column_trans, logreg)
#This pipeline is the model being built from here on.


In [28]:
#Pass the entire pipeline, rather than a bare model, to cross_val_score.
cross_val_score(pipe, X, y, cv = 5, scoring='accuracy').mean()
#What is cross-validated here is not just a model:
#the pipeline includes both the preprocessing of the data and the model building.
#cross_val_score splits X and y into folds and then runs the pipeline on each split.
#Here X is a matrix
#and y is a vector.

#The point of cross-validation is to evaluate your model, so that you can then predict on new data.


0.7727924839713071

In [None]:
#Adding the two features improved the model's accuracy to about 77%, up from about 68% with Pclass alone.


In [29]:
#Use some "new" data to look at the predictions.
#NOTE: these 5 rows are sampled from X itself; training rows should not really be used
#as new data, but it is done here for illustration.
X_new = X.sample(5, random_state=99)
X_new 

Unnamed: 0,Pclass,Sex,Embarked
599,1,male,C
512,1,male,S
273,1,male,C
215,1,female,C
790,3,male,Q


In [30]:
#Normally one would call model.fit, but here there is no standalone model,
#only a pipeline, so the pipeline itself is fitted on X and y.
pipe.fit(X, y)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categorical_features=None,
                                                                categories=None,
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                n_values=None,
                                                                sparse=True),
                                                  ['Sex', 'Embarked'])],
                      

In [31]:
pipe.predict(X_new)  # run the sampled rows through the fitted pipeline to get one prediction per row

array([1, 0, 1, 1, 0])

In [None]:
#X_new still has raw strings in it
#remember, the one-hot encoder does not make changes to our original dataset
#when new data comes in, you do not have to dummy-encode it by hand
#suppose our in-sample data had the Embarked values C, Q and S,
#and the out-of-sample data had only C and S:
#dummy-encoding the new data on its own would then not create a matching dataframe

#if you do preprocessing outside of the pipeline, cross-validation scores become less reliable
