In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.layers import Dense, Input, BatchNormalization
from tensorflow.keras.models import Sequential
import keras
from keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier
from xgboost import XGBClassifier
import pickle
import flask
from flask import request

In [21]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [3]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Filling NaN

In [51]:
train_df.isna().sum().sort_values()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
Age            177
Cabin          687
dtype: int64

In [22]:
train_df = train_df.drop(columns=['Cabin', 'Ticket', 'Name'])
test_df = test_df.drop(columns=['Cabin', 'Ticket', 'Name'])

In [7]:
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000,S
887,888,1,1,female,19.0,0,0,30.0000,S
888,889,0,3,female,,1,2,23.4500,S
889,890,1,1,male,26.0,0,0,30.0000,C


In [23]:
x = train_df.drop(columns=['PassengerId','Survived'])
y = train_df['Survived']

In [9]:
y.shape

(891,)

In [24]:
ohe_columns = ['Sex','Embarked']
num_columns = train_df.select_dtypes(exclude='object').columns
num_columns = num_columns.drop(['PassengerId','Survived'])

In [11]:
num_columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [25]:
ohe_pipeline = Pipeline(steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('ohe',OneHotEncoder(handle_unknown='ignore',sparse_output=False))
])

In [26]:
num_pipeline = Pipeline(steps=[
    ('impute',SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [27]:
cols_trans = ColumnTransformer(transformers=[
    ('ohe',ohe_pipeline, ohe_columns),
    ('nums', num_pipeline, num_columns)
], remainder='passthrough',n_jobs=-1)

In [30]:
pipeline = Pipeline(steps=[
    ('preprocessing', cols_trans)
])

In [31]:
x_preprocessed = pipeline.fit_transform(x)

In [32]:
x_preprocessed.shape

(891, 10)

In [33]:
x_train, x_test, y_train, y_test = train_test_split(x_preprocessed, y, test_size=0.2, random_state=23)

In [18]:
logreg = LogisticRegression()

In [19]:
logreg.fit(x_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [20]:
y_pred = logreg.predict(x_test)
accuracy_score(y_pred=y_pred, y_true=y_test)

0.7486033519553073

## NN

In [34]:
NN_model = Sequential([
    Input((x_preprocessed.shape[1],)),
    Dense(units=32, activation='relu'),
    BatchNormalization(),
    Dense(units=16, activation='relu'),
    BatchNormalization(),
    Dense(units=1, activation='sigmoid')
])
NN_model.compile(optimizer=Adam(learning_rate=0.001), loss=keras.losses.BinaryCrossentropy(), metrics=['accuracy'])


In [35]:
NN_model.fit(x_preprocessed,y,epochs=20)

Epoch 1/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6384 - loss: 0.7235
Epoch 2/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7632 - loss: 0.5409
Epoch 3/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8035 - loss: 0.4562
Epoch 4/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8038 - loss: 0.4281
Epoch 5/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8220 - loss: 0.4125
Epoch 6/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8284 - loss: 0.4131
Epoch 7/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8136 - loss: 0.4094
Epoch 8/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8233 - loss: 0.4159
Epoch 9/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x7f931c507e00>

In [25]:
test_x = test_df.drop(columns=['PassengerId'])
test_preprocessed = pipeline.fit_transform(test_x)

### XGBoost

In [26]:
xgb = XGBClassifier(n_estimators=500, learning_rate=0.01, random_state=92, verobose=1, early_stopping_rounds=50)
xgb.fit(x_train,y_train, eval_set=[(x_test,y_test)])

[0]	validation_0-logloss:0.65021
[1]	validation_0-logloss:0.64622
[2]	validation_0-logloss:0.64232
[3]	validation_0-logloss:0.63850
[4]	validation_0-logloss:0.63477
[5]	validation_0-logloss:0.63112
[6]	validation_0-logloss:0.62754
[7]	validation_0-logloss:0.62413
[8]	validation_0-logloss:0.62062
[9]	validation_0-logloss:0.61735
[10]	validation_0-logloss:0.61398
[11]	validation_0-logloss:0.61077
[12]	validation_0-logloss:0.60763
[13]	validation_0-logloss:0.60454
[14]	validation_0-logloss:0.60153
[15]	validation_0-logloss:0.59858
[16]	validation_0-logloss:0.59569
[17]	validation_0-logloss:0.59285
[18]	validation_0-logloss:0.59007
[19]	validation_0-logloss:0.58735
[20]	validation_0-logloss:0.58477


Parameters: { "verobose" } are not used.

  self.starting_round = model.num_boosted_rounds()


[21]	validation_0-logloss:0.58207
[22]	validation_0-logloss:0.57959
[23]	validation_0-logloss:0.57700
[24]	validation_0-logloss:0.57462
[25]	validation_0-logloss:0.57208
[26]	validation_0-logloss:0.56977
[27]	validation_0-logloss:0.56747
[28]	validation_0-logloss:0.56525
[29]	validation_0-logloss:0.56310
[30]	validation_0-logloss:0.56081
[31]	validation_0-logloss:0.55881
[32]	validation_0-logloss:0.55664
[33]	validation_0-logloss:0.55472
[34]	validation_0-logloss:0.55259
[35]	validation_0-logloss:0.55074
[36]	validation_0-logloss:0.54885
[37]	validation_0-logloss:0.54692
[38]	validation_0-logloss:0.54511
[39]	validation_0-logloss:0.54336
[40]	validation_0-logloss:0.54144
[41]	validation_0-logloss:0.53978
[42]	validation_0-logloss:0.53807
[43]	validation_0-logloss:0.53638
[44]	validation_0-logloss:0.53473
[45]	validation_0-logloss:0.53322
[46]	validation_0-logloss:0.53150
[47]	validation_0-logloss:0.53000
[48]	validation_0-logloss:0.52851
[49]	validation_0-logloss:0.52704
[50]	validatio

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,50
,enable_categorical,False


# Exporting

In [None]:
test_df['Survived'] = logreg.predict(test_preprocessed)

In [None]:
test_df['Survived'] = NN_model.predict(test_preprocessed)
test_df['Survived'] = (test_df['Survived']>0.5).astype(int)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [19]:
test_df['Survived'] = xgb.predict(test_preprocessed)

In [20]:
test_df[['PassengerId','Survived']].to_csv('xgboost.csv', index=False)

# Model Saving

In [42]:
sample = train_df.iloc[[1]].copy()
print(sample)
sample.drop(columns=['PassengerId','Survived'],inplace=True)
print(sample)

   PassengerId  Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
1            2         1       1  female  38.0      1      0  71.2833        C
   Pclass     Sex   Age  SibSp  Parch     Fare Embarked
1       1  female  38.0      1      0  71.2833        C


### Logistic Regression pipeline

In [29]:
logreg_pipeline = Pipeline(steps=[
    ('preprocessing',cols_trans),
    ('logistic regression',LogisticRegression())
])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=23)
logreg_pipeline.fit(x_train,y_train)

0,1,2
,steps,"[('preprocessing', ...), ('logistic regression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('ohe', ...), ('nums', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,-1
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [43]:
logreg_pipeline.predict(sample)

array([1])

In [None]:
with open('api/LogReg.pkl','wb') as file:
    pickle.dump(logreg_pipeline,file)

---

### NN Pipeline

In [40]:
model = Sequential([
    Input((10,)),
    Dense(units=32,activation='relu'),
    BatchNormalization(),
    Dense(units=16,activation='relu'),
    BatchNormalization(),
    Dense(units=1,activation='sigmoid')
])
model.compile(optimizer=Adam(learning_rate=0.001),loss=keras.losses.BinaryCrossentropy(),metrics=['accuracy'])
NN_model = KerasClassifier(model=model,epochs=20)
NN_pipeline = Pipeline([
    ('preprocessing',cols_trans),
    ('NN model',NN_model)
])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=23)
NN_pipeline.fit(x_train,y_train)

Epoch 1/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.4958 - loss: 0.8780
Epoch 2/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7167 - loss: 0.5834
Epoch 3/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7549 - loss: 0.5250
Epoch 4/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7643 - loss: 0.5059
Epoch 5/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8002 - loss: 0.4528
Epoch 6/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7737 - loss: 0.4550
Epoch 7/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7771 - loss: 0.4727
Epoch 8/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8078 - loss: 0.4544
Epoch 9/20
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

0,1,2
,steps,"[('preprocessing', ...), ('NN model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('ohe', ...), ('nums', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,-1
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,model,"<Sequential n...8, built=True>"
,build_fn,
,warm_start,False
,random_state,
,optimizer,'rmsprop'
,loss,
,metrics,
,batch_size,
,validation_batch_size,
,verbose,1


In [44]:
NN_pipeline.predict(sample)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step


array([1])

In [None]:
with open('api/NN.pkl','wb') as file:
    pickle.dump(NN_pipeline,file)

---

### XGBClassifier Pipeline

In [50]:
xgb = XGBClassifier(n_estimators=500, learning_rate=0.01, random_state=92, verobose=1)
XGB_pipeline = Pipeline([
    ('preprocessing',cols_trans),
    ('XGB model',xgb)
])
XGB_pipeline.fit(x_train,y_train)

Parameters: { "verobose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,steps,"[('preprocessing', ...), ('XGB model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('ohe', ...), ('nums', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,-1
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [51]:
XGB_pipeline.predict(sample)

array([1])

In [None]:
with open('api/XGB.pkl','wb') as file:
    pickle.dump(XGB_pipeline,file)