In [91]:
import numpy as np
import pandas as pd

from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

In [92]:
# Titanic training CSV, fetched over HTTP from GitHub.
TITANIC_TRAIN_URL = "https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/titanic_train.csv"
df = pd.read_csv(TITANIC_TRAIN_URL)
# Preview the first rows to check the columns loaded as expected.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [93]:
# Peek at six random rows. The seed is fixed so that Restart & Run All
# reproduces the same rows (the later X.sample call is seeded the same way).
df.sample(6, random_state=1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
115,116,0,3,"Pekoniemi, Mr. Edvard",male,21.0,0,0,STON/O 2. 3101294,7.925,,S
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
468,469,0,3,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q
631,632,0,3,"Lundahl, Mr. Johan Svensson",male,51.0,0,0,347743,7.0542,,S
429,430,1,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32.0,0,0,SOTON/O.Q. 392078,8.05,E10,S
411,412,0,3,"Hart, Mr. Henry",male,,0,0,394140,6.8583,,Q


In [94]:
# Feature columns used by the model.
X = df.loc[:, ["Age", "Pclass", "Sex"]]
# Target column to predict.
y = df.loc[:, "Survived"]

In [95]:
# One-hot encode "Sex"; remainder="passthrough" forwards the other columns
# (Age, Pclass) untouched, placed after the encoded columns.
ct = make_column_transformer((OneHotEncoder(), ["Sex"]), remainder="passthrough")

# Sanity check: inspect the transformed matrix (Age still contains NaN here).
ct.fit_transform(X)


array([[ 0.,  1., 22.,  3.],
       [ 1.,  0., 38.,  1.],
       [ 1.,  0., 26.,  3.],
       ...,
       [ 1.,  0., nan,  3.],
       [ 0.,  1., 26.,  1.],
       [ 0.,  1., 32.,  3.]])

In [96]:
# Chain the steps: encode columns -> impute missing values -> logistic regression.
# The imputer runs after the column transformer because Age is passed through
# with its NaN values intact. fit() returns the pipeline itself, so it can be
# chained and the fitted pipeline bound to `pipe`.
pipe = make_pipeline(ct, SimpleImputer(), LogisticRegression()).fit(X, y)
pipe

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(), ['Sex'])])),
                ('simpleimputer', SimpleImputer()),
                ('logisticregression', LogisticRegression())])

In [97]:
# Show the feature frame; the rendered table is truncated (note the "..." row).
X

Unnamed: 0,Age,Pclass,Sex
0,22.0,3,male
1,38.0,1,female
2,26.0,3,female
3,35.0,1,female
4,35.0,3,male
...,...,...,...
886,27.0,2,male
887,19.0,1,female
888,,3,female
889,26.0,1,male


In [98]:
# Draw five rows to act as "new" passengers; the fixed seed keeps the draw repeatable.
X_new = X.sample(n=5, random_state=1)
X_new

Unnamed: 0,Age,Pclass,Sex
862,48.0,1,female
223,,3,male
84,17.0,2,female
680,,3,female
535,7.0,2,female


In [99]:
# True labels for the sampled rows, looked up by their index labels.
y.loc[X_new.index]

862    1
223    0
84     1
680    0
535    1
Name: Survived, dtype: int64

In [100]:
# Predict survival for the five sampled passengers with the fitted pipeline.
preds = pipe.predict(X_new)
# Display the predicted labels for comparison with the true labels.
preds

array([1, 0, 1, 1, 1], dtype=int64)

In [101]:
# 5-fold cross-validated accuracy of the whole pipeline. Passing the pipeline
# (not a pre-transformed matrix) means the encoding and imputation steps are
# re-fit inside each training fold.
# cross_val_score is imported in the notebook's top import cell.
cross_val_score(pipe, X, y, cv=5, scoring="accuracy")

array([0.7877095 , 0.78651685, 0.79213483, 0.7752809 , 0.79213483])