
# Agenda

    Why should you use a Pipeline?   
    How do you encode categorical features with OneHotEncoder?
    How do you apply OneHotEncoder to selected columns with ColumnTransformer?
    How do you build and cross-validate a Pipeline?
    How do you make predictions on new data using a Pipeline?
    Why should you use scikit-learn (rather than pandas) for preprocessing?



In [369]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer

In [370]:
df = pd.read_csv("https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/titanic_train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [371]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [372]:
imp = SimpleImputer()  ##default strategy is "mean", fills n/a values with mea 
ohe = OneHotEncoder()
new_df = pd.read_csv("https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/titanic_train.csv")

ct = make_column_transformer(
    (imp, ["Age"]),
    remainder="passthrough"
    )
ct_transformed = ct.fit_transform(new_df)
ct_transformed

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


array([[22.0, 1, 0, ..., 7.25, nan, 'S'],
       [38.0, 2, 1, ..., 71.2833, 'C85', 'C'],
       [26.0, 3, 1, ..., 7.925, nan, 'S'],
       ...,
       [29.69911764705882, 889, 0, ..., 23.45, nan, 'S'],
       [26.0, 890, 1, ..., 30.0, 'C148', 'C'],
       [32.0, 891, 0, ..., 7.75, nan, 'Q']], dtype=object)

In [411]:
imp = SimpleImputer(add_indicator=True)

new_df2 = pd.read_csv("https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/titanic_train.csv")

ct = make_column_transformer(
    (imp, ["Age"]),
    )
ct_transformed = ct.fit_transform(new_df2)
ct_transformed

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


array([[22.        ,  0.        ],
       [38.        ,  0.        ],
       [26.        ,  0.        ],
       ...,
       [29.69911765,  1.        ],
       [26.        ,  0.        ],
       [32.        ,  0.        ]])

In [376]:
## different ways of doing make_column_transform
ct = make_column_transformer((imp,["Age"]))
# ct = make_column_transformer((imp,[5]))
# ct = make_column_transformer((imp,slice(5)))
# ct = make_column_transformer((imp,[False,False,False,False,False,True,False,False,False,False,False,False]))
# ct = make_column_transformer((imp, make_column_selector(dtype_include="number")))  ##use transformer on all numeric columns
# ct = make_column_transformer((imp,make_column_selector(dtype_exclude=object)))
# ct = make_column_transformer((imp, make_column_selector(pattern="A|Si"))) ##regex select age and SibSp columns
ct.fit_transform(new_df)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


array([[22.        ],
       [38.        ],
       [26.        ],
       [35.        ],
       [35.        ],
       [29.69911765],
       [54.        ],
       [ 2.        ],
       [27.        ],
       [14.        ],
       [ 4.        ],
       [58.        ],
       [20.        ],
       [39.        ],
       [14.        ],
       [55.        ],
       [ 2.        ],
       [29.69911765],
       [31.        ],
       [29.69911765],
       [35.        ],
       [34.        ],
       [15.        ],
       [28.        ],
       [ 8.        ],
       [38.        ],
       [29.69911765],
       [19.        ],
       [29.69911765],
       [29.69911765],
       [40.        ],
       [29.69911765],
       [29.69911765],
       [66.        ],
       [28.        ],
       [42.        ],
       [29.69911765],
       [21.        ],
       [18.        ],
       [14.        ],
       [40.        ],
       [27.        ],
       [29.69911765],
       [ 3.        ],
       [19.        ],
       [29

In [377]:
df = df.loc[:,['Survived', 'Pclass', 'Sex', 'Embarked']]
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Embarked    2
dtype: int64

In [378]:
###let's get rid of na values
df["Embarked"].notna()

0      True
1      True
2      True
3      True
4      True
       ... 
886    True
887    True
888    True
889    True
890    True
Name: Embarked, Length: 891, dtype: bool

In [379]:
df = df.loc[df["Embarked"].notna(),['Survived', 'Pclass', 'Sex', 'Embarked']]
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Embarked    0
dtype: int64

In [380]:
df

Unnamed: 0,Survived,Pclass,Sex,Embarked
0,0,3,male,S
1,1,1,female,C
2,1,3,female,S
3,1,1,female,S
4,0,3,male,S
...,...,...,...,...
886,0,2,male,S
887,1,1,female,S
888,0,3,female,S
889,1,1,male,C


# Cross-validate a Pipeline with all features

In [381]:
X = df.iloc[:,[1]]
X

Unnamed: 0,Pclass
0,3
1,1
2,3
3,1
4,3
...,...
886,2
887,1
888,3
889,1


In [382]:
y = df["Survived"]
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 889, dtype: int64

In [383]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

logreg = LogisticRegression()

In [384]:
cross_val_score(logreg, X, y, cv=5, scoring='accuracy').mean()


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


0.6783406335301212

In [385]:
y.value_counts(normalize=True)

Survived
0    0.617548
1    0.382452
Name: proportion, dtype: float64

In [386]:
ohe = OneHotEncoder()  ##use sparse=False to show output as array for tutorial purposes 
ohe.fit_transform(df[["Embarked"]])  ##or use .toarray()  

<889x3 sparse matrix of type '<class 'numpy.float64'>'
	with 889 stored elements in Compressed Sparse Row format>

In [387]:
ohe.categories_

[array(['C', 'Q', 'S'], dtype=object)]

In [388]:
ohe.fit_transform(df[["Sex"]])
ohe.categories_

[array(['female', 'male'], dtype=object)]

# Cross-validate a Pipeline with all features

In [389]:
df

Unnamed: 0,Survived,Pclass,Sex,Embarked
0,0,3,male,S
1,1,1,female,C
2,1,3,female,S
3,1,1,female,S
4,0,3,male,S
...,...,...,...,...
886,0,2,male,S
887,1,1,female,S
888,0,3,female,S
889,1,1,male,C


In [390]:
X = df.drop(columns="Survived",axis=1)
X

Unnamed: 0,Pclass,Sex,Embarked
0,3,male,S
1,1,female,C
2,3,female,S
3,1,female,S
4,3,male,S
...,...,...,...
886,2,male,S
887,1,female,S
888,3,female,S
889,1,male,C


In [391]:
y = df["Survived"]
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 889, dtype: int64

In [392]:
column_trans = make_column_transformer((ohe,["Sex","Embarked"]),remainder="passthrough")
column_trans.fit_transform(X)


array([[0., 1., 0., 0., 1., 3.],
       [1., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 3.],
       ...,
       [1., 0., 0., 0., 1., 3.],
       [0., 1., 1., 0., 0., 1.],
       [0., 1., 0., 1., 0., 3.]])

In [393]:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(column_trans,logreg)
pipe

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Sex', 'Embarked'])])),
                ('logisticregression', LogisticRegression())])

In [394]:
cross_val_score(pipe, X, y, cv=4, scoring="accuracy").mean()

0.7739162929745889

In [395]:
###extra 
# X = column_trans.fit_transform(X)
# X

In [396]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
# logreg.fit(X_train,y_train)

In [397]:
# y_pred = logreg.predict(X_test)

In [398]:
# from sklearn.metrics import accuracy_score
# accuracy_score(y_test,y_pred)

In [399]:
X_new = X.sample(5,random_state=0)
X_new

Unnamed: 0,Pclass,Sex,Embarked
14,3,female,S
159,3,male,S
763,1,female,S
741,1,male,S
483,3,female,S


In [400]:
pipe.fit(X,y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Sex', 'Embarked'])])),
                ('logisticregression', LogisticRegression())])

In [401]:
pred = pipe.predict(X_new)
pred

array([1, 0, 1, 0, 1], dtype=int64)

In [402]:
###check the results for practice
df.loc[599,["Survived"]]

Survived    1
Name: 599, dtype: object

In [403]:
truth_sample_labels = []
for row in X_new.iterrows():
    truth_sample_labels.append(df.loc[row[0],"Survived"])
truth_sample_labels
truth_sample_labels = np.array(truth_sample_labels)
truth_sample_labels

array([0, 0, 1, 0, 1], dtype=int64)

In [404]:
from sklearn.metrics import accuracy_score
accuracy_score(truth_sample_labels,pred)

0.8

In [405]:
###quick recap
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

df = pd.read_csv('http://bit.ly/kaggletrain')
df = df.loc[df.Embarked.notna(), ['Survived', 'Pclass', 'Sex', 'Embarked']]
X = df.drop('Survived', axis='columns')
y = df.Survived

column_trans = make_column_transformer(
    (OneHotEncoder(), ['Sex', 'Embarked']),
    remainder='passthrough')
logreg = LogisticRegression(solver='lbfgs')

pipe = make_pipeline(column_trans, logreg)

cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

0.7727924839713071

X_new = X.sample(5, random_state=99)

pipe.fit(X, y)
pipe.predict(X_new)



array([1, 0, 1, 1, 0], dtype=int64)