In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_predict, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

  from pandas.core import (


In [2]:
df = pd.read_csv('train_titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df['family_size'] = df['SibSp'] + df['Parch']
df.drop(['Parch', 'SibSp',], axis = 1, inplace = True)
df['Isalone'] = 1
df.loc[df['family_size'] > 1, 'Isalone'] = 0

In [4]:
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,family_size,Isalone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.25,,S,1,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,C85,C,1,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.925,,S,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1,C123,S,1,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,373450,8.05,,S,0,1


In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,family_size,Isalone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.25,,S,1,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,C85,C,1,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.925,,S,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1,C123,S,1,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,373450,8.05,,S,0,1


In [6]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,family_size,Isalone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.2500,,S,1,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,C85,C,1,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.9250,,S,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1000,C123,S,1,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,373450,8.0500,,S,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,211536,13.0000,,S,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,112053,30.0000,B42,S,0,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,W./C. 6607,23.4500,,S,3,0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,111369,30.0000,C148,C,0,1


In [7]:
features = ['Age', 'Pclass', 'Isalone', 'Sex', 'Fare', 'family_size', 'Embarked']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df[features],df['Survived'], test_size = 0.33, random_state = 42)

In [9]:
df.shape, X_train.shape, X_test.shape, y_train.shape

((891, 12), (596, 7), (295, 7), (596,))

In [10]:
cat_cols = ['Embarked', 'Sex', 'Pclass', 'Isalone']
cat_transformer = Pipeline(steps =[ ('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown = 'ignore'))])

In [11]:
cat_transformer

In [12]:
cat_transformer.fit_transform(df[['Embarked']])

<891x3 sparse matrix of type '<class 'numpy.float64'>'
	with 891 stored elements in Compressed Sparse Row format>

In [13]:
pd.DataFrame((cat_transformer.fit_transform(df[['Embarked']])).toarray())

Unnamed: 0,0,1,2
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0
...,...,...,...
886,0.0,0.0,1.0
887,0.0,0.0,1.0
888,0.0,0.0,1.0
889,1.0,0.0,0.0


In [14]:
num_cols = ['Age', 'Fare', 'Family_size']
num_transformer = Pipeline(steps = [('imputer', KNNImputer(n_neighbors= 5)), ('scaler', RobustScaler())])

In [15]:
num_transformer

In [16]:
preproccesor = ColumnTransformer(transformers= 
[ ('num', num_transformer, num_cols), 
  ('cat', cat_transformer, cat_cols) ])

In [17]:
preproccesor

In [18]:
clf = Pipeline(steps = [('preproccesor', preproccesor), ('classifier', DecisionTreeClassifier())])

In [19]:
clf

In [23]:
clf.fit(X_train, y_train)

ValueError: A given column is not a column of the dataframe