In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier

from sklearn.impute import SimpleImputer

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)


## Data Preparation

In [None]:
# Read the competition's train and test tables into DataFrames.
train_path = '/kaggle/input/tabular-playground-series-apr-2021/train.csv'
test_path = '/kaggle/input/tabular-playground-series-apr-2021/test.csv'
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Preview the first rows of the training table.
train_df.head()

In [None]:
# Candidate features: every column except the label and the identifier-like ones.
feature_cols = train_df.columns.drop(['Survived', 'PassengerId', 'Name'])

# Partition the candidates by dtype: int64/float64 columns vs. everything else.
numeric_dtypes = ['int64', 'float64']
candidate_features = train_df[feature_cols]
num_cols = candidate_features.select_dtypes(include=numeric_dtypes).columns
cat_cols = candidate_features.select_dtypes(exclude=numeric_dtypes).columns

print(num_cols)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
train_df.info()
# The label column is referenced by name here: `target_col` is only assigned in
# a later cell, so using it at this point raises NameError on a fresh
# Restart & Run All.
print('The share of survived in the train data: ' + str(np.mean(train_df['Survived'])))

There are missing values in Age, Ticket, Fare, Cabin (a lot of them are missing, but this variable doesn't seem to be useful) and Embarked.
The share of survivors in the train data is not far from 50%, so the sample is roughly balanced.

## Basic Data Visualisation

In [None]:
# One histogram per numerical feature, each drawn in its own numbered figure.
for fig_number, column in enumerate(num_cols):
    plt.figure(fig_number)
    column_values = train_df[column]
    sns.histplot(data=column_values)

To be continued

## Encoding of Categorical Variables

In [None]:
# Encode Sex as a 0/1 indicator column: 1 where Sex == 'male', 0 otherwise.
is_male = train_df['Sex'] == 'male'
train_df['Male'] = is_male.astype('int64')

In [None]:
# Show the distinct values of 'Embarked' before encoding it.
print(train_df['Embarked'].unique())

# One-hot encode Pclass and Embarked; each dummy column is prefixed with the
# name of the column it was derived from.
Pclass_1hot, Embarked_1hot = (
    pd.get_dummies(train_df[source_col], prefix=source_col)
    for source_col in ('Pclass', 'Embarked')
)

# Append the dummy columns to the right of the existing training columns.
train_df = pd.concat([train_df, Pclass_1hot, Embarked_1hot], axis=1)

In [None]:
# 'Sex' and 'Embarked' are now represented by dummy columns, so remove the originals.
encoded_source_cols = ['Sex', 'Embarked']
train_df = train_df.drop(columns=encoded_source_cols)

In [None]:
# Preview the first ten rows of the training table after encoding.
train_df.head(n=10)

## Baseline Model

### Logistic Regression (without penalty)

In [None]:
# Name of the label column.
target_col = 'Survived'
# Model features: all remaining columns except the label, the identifier-like
# columns and the Ticket / Cabin columns.
excluded_cols = [target_col, 'PassengerId', 'Name', 'Ticket', 'Cabin']
feature_cols = train_df.columns.drop(excluded_cols)

In [None]:
# Drop incomplete rows for now, but only inspect the columns the models use.
# A bare dropna() would also discard every row with a missing Cabin or Ticket,
# although neither column is part of feature_cols.
model_cols = [*feature_cols, target_col]
train_df_wtNaN = train_df.dropna(axis=0, subset=model_cols)

# Split the complete rows 80/20 into train and validation subsamples;
# random_state is fixed so the split is the same on every run.
y = train_df_wtNaN[target_col]
X = train_df_wtNaN[feature_cols]
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [None]:
# Baseline: logistic regression without a penalty term.
# penalty=None is the current spelling (requires scikit-learn >= 1.2); the
# string 'none' was deprecated in 1.2 and is rejected by later releases.
model_Logistic = LogisticRegression(penalty=None, max_iter=1000, random_state=0)
# Fit once on the training subsample.
model_Logistic.fit(X_train, y_train)

# Accuracy on the held-out validation subsample.
y_pred = model_Logistic.predict(X_valid)
score_model_Logistic = accuracy_score(y_valid, y_pred)
print(score_model_Logistic)

The achieved score is 75%.

## Other models
### Gaussian Naive Bayes

In [None]:
# Gaussian Naive Bayes with default settings; fit() returns the fitted estimator.
model_gnb = GaussianNB().fit(X_train, y_train)

# Accuracy on the validation subsample.
y_pred = model_gnb.predict(X_valid)
score_gnb = accuracy_score(y_valid, y_pred)
print(score_gnb)

The resulting score is 72.4%

### DecisionTreeClassifier

In [None]:
# Single decision tree, limited in depth and minimum leaf size, seeded for reproducibility.
tree_params = dict(max_depth=10, min_samples_leaf=5, random_state=0)
model_tree = tree.DecisionTreeClassifier(**tree_params).fit(X_train, y_train)

# Accuracy on the validation subsample.
y_pred = model_tree.predict(X_valid)
score_tree = accuracy_score(y_valid, y_pred)
print(score_tree)

The achieved score is 73.5% (small improvement!). Let's try with more trees then.

### RandomForestClassifier

In [None]:
# Random forest using the same depth / leaf-size limits as the single tree.
forest_params = dict(max_depth=10, min_samples_leaf=5, random_state=0)
model_RF = RandomForestClassifier(**forest_params).fit(X_train, y_train)

# Accuracy on the validation subsample.
y_pred = model_RF.predict(X_valid)
score_RF = accuracy_score(y_valid, y_pred)
print(score_RF)

Ok, now we have 75%, small improvement again, but the baseline is still the best.

### KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier voting over the 12 closest training points.
model_kNN = KNeighborsClassifier(n_neighbors=12).fit(X_train, y_train)

# Accuracy on the validation subsample.
y_pred = model_kNN.predict(X_valid)
score_kNN = accuracy_score(y_valid, y_pred)
print(score_kNN)

The accuracy is only 68%.

### Support Vector Classification

In [None]:
# Support vector classifier; features are standardised inside the pipeline
# before they reach the SVC.
svm_steps = (StandardScaler(), SVC(gamma='auto'))
model_SVM = make_pipeline(*svm_steps).fit(X_train, y_train)

# Accuracy on the validation subsample.
y_pred = model_SVM.predict(X_valid)
score_SVM = accuracy_score(y_valid, y_pred)
print(score_SVM)

Results in accuracy 74.8%.

Another implementation of SVM: faster, but less accurate (accuracy is only 71.6%):

In [None]:
# Linear classifier trained with stochastic gradient descent on standardised
# features. random_state is pinned, like the other seeded models in this
# notebook, so the data shuffling inside SGD and therefore the reported score
# are reproducible between runs.
mode_SVM_SGD = make_pipeline(
    StandardScaler(),
    SGDClassifier(max_iter=1000, tol=1e-3, random_state=0),
)
mode_SVM_SGD.fit(X_train, y_train)

# Accuracy on the validation subsample.
y_pred = mode_SVM_SGD.predict(X_valid)
score_SVM_SGD = accuracy_score(y_valid, y_pred)
print(score_SVM_SGD)

### XGBClassifier

In [None]:
# Gradient-boosted trees (XGBoost) with default hyperparameters.
model_XGB = XGBClassifier()
model_XGB.fit(X_train, y_train)

# Accuracy on the validation subsample.
xgb_valid_pred = model_XGB.predict(X_valid)
y_pred = xgb_valid_pred
score_XGB = accuracy_score(y_valid, y_pred)
print(score_XGB)

The achieved accuracy is 74.4%.

Nothing outperforms the simplest baseline model so far, but I have to work on hyperparameters for all models that I tried.
To be continued.

## Final model and submission

In [None]:
# Refit the baseline configuration on all rows of X / y (train + validation).
# A fresh estimator with the same hyperparameters is built instead of aliasing
# model_Logistic, so the model that produced the validation score is not
# silently overwritten by this refit.
final_model = LogisticRegression(**model_Logistic.get_params())
final_model.fit(X, y)

In [None]:
# Apply the same encoding steps to the test table as were applied to the
# training table.

# 0/1 indicator for Sex == 'male'.
test_df['Male'] = (test_df['Sex'] == 'male').astype('int64')

# Show the distinct values of 'Embarked' in the test data before encoding it.
print(test_df['Embarked'].unique())

# One-hot encode Pclass and Embarked, prefixing each dummy with its source column.
Pclass_1hot_test, Embarked_1hot_test = (
    pd.get_dummies(test_df[source_col], prefix=source_col)
    for source_col in ('Pclass', 'Embarked')
)

# Append the dummies, then remove the original text columns they replace.
test_df = pd.concat(
    [test_df, Pclass_1hot_test, Embarked_1hot_test], axis=1
).drop(columns=['Sex', 'Embarked'])



In [None]:
# Build the test matrix from exactly the training feature columns, in the
# training order. Deriving the column list from test_df alone can yield a
# different set or order (e.g. a category level that never occurs in the test
# data produces no dummy column); such a missing dummy is filled with 0 here.
feature_cols_test = pd.Index(feature_cols)
X_test = test_df.reindex(columns=feature_cols_test, fill_value=0)

# Fill remaining gaps with SimpleImputer's default strategy (column mean),
# keeping the column labels and row index of X_test.
# NOTE(review): the imputer is fitted on the test data itself, as before —
# confirm whether training-set statistics should be used instead.
my_imputer = SimpleImputer()
imp_X_test = pd.DataFrame(
    my_imputer.fit_transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

preds_test = final_model.predict(imp_X_test)

In [None]:
# Start from the sample submission file, overwrite its 'Survived' column with
# the test predictions, and save the result without the index column.
sample_submission_path = '/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv'
submission = pd.read_csv(sample_submission_path).assign(Survived=preds_test)
submission.to_csv('submission.csv', index=False)