In [57]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [58]:
train_df = pd.read_csv("titanic/train.csv", index_col='PassengerId') 

In [59]:
# train
train_features_df = train_df.drop(axis=1, columns=['Survived'])
train_output_df = train_df[['Survived']]

# test
test_features_df = pd.read_csv("titanic/test.csv", index_col='PassengerId') 
test_output_df = pd.read_csv("titanic/gender_submission.csv", index_col='PassengerId') 

In [60]:

def drop_columns(df, columns_to_drop=["Name", "Ticket"]):
    return df.drop("Name", axis=1).drop("Ticket", axis=1)

def convert_cabin (df):
    df['Cabin'] = df["Cabin"].astype(np.str_).str[0]
    return df

def column_encoder(df, column_name=["Sex", "Embarked", "Cabin"]):
    for column in column_name:
        df[column] = LabelEncoder().fit_transform(df[column])
    return df
    



In [61]:
test_features_df.head(2)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [62]:
test_features_df = test_features_df.pipe(drop_columns).pipe(convert_cabin).pipe(column_encoder)
test_features_df.head(2)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,1,34.5,0,0,7.8292,7,1
893,3,0,47.0,1,0,7.0,7,2


In [63]:
train_features_df = train_features_df.pipe(drop_columns).pipe(convert_cabin).pipe(column_encoder)
train_features_df.head(20)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,1,22.0,1,0,7.25,8,2
2,1,0,38.0,1,0,71.2833,2,0
3,3,0,26.0,0,0,7.925,8,2
4,1,0,35.0,1,0,53.1,2,2
5,3,1,35.0,0,0,8.05,8,2
6,3,1,,0,0,8.4583,8,1
7,1,1,54.0,0,0,51.8625,4,2
8,3,1,2.0,3,1,21.075,8,2
9,3,0,27.0,0,2,11.1333,8,2
10,2,0,14.0,1,0,30.0708,8,0


In [64]:
standar_scaler = StandardScaler()
standar_scaler

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [66]:
scaled_df =  pd.DataFrame(standar_scaler.fit_transform(train_features_df), columns=train_features_df.keys())
scaled_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0.827377,0.737695,-0.530377,0.432793,-0.473674,-0.502445,0.522067,0.581114
1,-1.566107,-1.355574,0.571831,0.432793,-0.473674,0.786845,-1.917594,-1.938460
2,0.827377,-1.355574,-0.254825,-0.474545,-0.473674,-0.488854,0.522067,0.581114
3,-1.566107,-1.355574,0.365167,0.432793,-0.473674,0.420730,-1.917594,0.581114
4,0.827377,0.737695,0.365167,-0.474545,-0.473674,-0.486337,0.522067,0.581114
...,...,...,...,...,...,...,...,...
886,-0.369365,0.737695,-0.185937,-0.474545,-0.473674,-0.386671,0.522067,0.581114
887,-1.566107,-1.355574,-0.737041,-0.474545,-0.473674,-0.044381,-2.324204,0.581114
888,0.827377,-1.355574,,0.432793,2.008933,-0.176263,0.522067,0.581114
889,-1.566107,0.737695,-0.254825,-0.474545,-0.473674,-0.044381,-1.917594,-1.938460


In [67]:
scaled_df.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0
mean,-8.772133e-17,-1.156327e-16,2.388379e-16,4.3860660000000004e-17,5.3829000000000005e-17,3.9873330000000004e-18,1.056643e-16,-3.5886e-17
std,1.000562,1.000562,1.000701,1.000562,1.000562,1.000562,1.000562,1.000562
min,-1.566107,-1.355574,-2.016979,-0.4745452,-0.4736736,-0.6484217,-2.730814,-1.93846
25%,-0.3693648,-1.355574,-0.6595416,-0.4745452,-0.4736736,-0.4891482,0.5220674,-0.6786732
50%,0.8273772,0.7376951,-0.1170488,-0.4745452,-0.4736736,-0.3573909,0.5220674,0.5811139
75%,0.8273772,0.7376951,0.571831,0.4327934,-0.4736736,-0.02424635,0.5220674,0.5811139
max,0.8273772,0.7376951,3.465126,6.784163,6.974147,9.667167,0.5220674,1.840901


In [70]:
from sklearn.pipeline import Pipeline


In [73]:
from sklearn.tree import DecisionTreeClassifier

In [78]:
standar_scaler = StandardScaler()
standar_scaler

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [79]:
tree_model = DecisionTreeClassifier ()
tree_model

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


# 1. Scalare
    (standard_scaler)
# 2. Antrenare
    (tree_model)

In [80]:
pipeline_model = Pipeline(steps=(("scaler", standar_scaler), ("model", tree_model)))
pipeline_model

0,1,2
,steps,"(('scaler', ...), ...)"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [81]:
pipeline_model.fit(train_features_df, train_output_df)

0,1,2
,steps,"[('scaler', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [82]:
train_features_df.head(2)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,1,22.0,1,0,7.25,8,2
2,1,0,38.0,1,0,71.2833,2,0


In [84]:
test_predictions_df = pipeline_model.predict(test_features_df)
test_predictions_df

array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [85]:
pipeline_model.score(test_features_df, test_output_df)

0.7799043062200957

In [89]:
predict_vs_real =  pd.DataFrame({
    "real": test_output_df["Survived"],
    "predict" : test_predictions_df,
    "correct":  test_output_df["Survived"] == test_predictions_df
})
predict_vs_real

Unnamed: 0_level_0,real,predict,correct
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
892,0,0,True
893,1,0,False
894,0,1,False
895,0,1,False
896,1,0,False
...,...,...,...
1305,0,0,True
1306,1,1,True
1307,0,0,True
1308,0,0,True


In [90]:
predict_vs_real["correct"].value_counts(normalize=True) 

correct
True     0.779904
False    0.220096
Name: proportion, dtype: float64

In [91]:
type(pipeline_model)

sklearn.pipeline.Pipeline

In [92]:
import pickle

In [93]:
with open("titanic.pkl", "wb") as file_writer:
    pickle.dump(pipeline_model, file_writer)