In [136]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline



In [137]:
df_train = pd.read_csv("titanic/train.csv", index_col="PassengerId")
x_test = pd.read_csv("titanic/test.csv", index_col="PassengerId").drop("Name", axis=1).drop("Ticket", axis=1).drop("Cabin", axis=1).drop("Fare", axis=1)
y_test = pd.read_csv("titanic/gender_submission.csv", index_col="PassengerId")

In [138]:
x_train = df_train.drop("Survived", axis=1).drop("Name", axis=1).drop("Ticket", axis=1).drop("Cabin", axis=1).drop("Fare", axis=1)
y_train = df_train[["Survived"]]

In [139]:
x_train.head(2)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,male,22.0,1,0,S
2,1,female,38.0,1,0,C


In [140]:
x_test.head(2)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
892,3,male,34.5,0,0,Q
893,3,female,47.0,1,0,S


In [141]:
numeric_features = ["Pclass", "Age", "SibSp", "Parch"]
categorical_features = ["Sex", "Embarked"]

In [142]:
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])
numeric_transformer

0,1,2
,steps,"[('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [143]:
categorical_transformer = Pipeline(steps=[
    ("encoder", OrdinalEncoder())
])
categorical_transformer


0,1,2
,steps,"[('encoder', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [144]:
preprocessor = ColumnTransformer(
    transformers=[ 
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)
preprocessor

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [145]:
random_forest_model = RandomForestClassifier()
random_forest_model

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [146]:
full_pipeline = Pipeline(
    steps=(
        ("preprocessor", preprocessor),
        ("classifier", random_forest_model)
    )
)

In [147]:
full_pipeline

0,1,2
,steps,"(('preprocessor', ...), ...)"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [148]:
full_pipeline.fit(x_train, y_train)

  return fit_method(estimator, *args, **kwargs)


0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [150]:
x_test.head(2)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
892,3,male,34.5,0,0,Q
893,3,female,47.0,1,0,S


In [152]:
list(x_test.iloc[0])

[np.int64(3), 'male', np.float64(34.5), np.int64(0), np.int64(0), 'Q']

In [160]:
full_pipeline.predict(x_test[0:1])

array([0])

In [161]:
full_pipeline.predict(x_test[0:2])

array([0, 0])