In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, jaccard_score, log_loss, accuracy_score

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
def read_file(name: str, data_dir: str = "./data") -> pd.DataFrame:
    """Load ``<data_dir>/<name>.csv`` into a DataFrame.

    Parameters
    ----------
    name : str
        File name without the ``.csv`` extension.
    data_dir : str, default "./data"
        Directory that holds the CSV files. The default keeps the original
        behaviour of reading from ``./data`` relative to the working directory.

    Returns
    -------
    pd.DataFrame
        Whatever ``pd.read_csv`` parses from the file, with default options.
    """
    return pd.read_csv(f"{data_dir}/{name}.csv")

In [4]:
def convert(df: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw passenger frame into an all-numeric feature frame.

    The following steps are applied to a new frame (``df`` is never modified):

    * drop the ``Cabin`` and ``Ticket`` columns;
    * encode ``Sex`` as 1.0 for ``"male"`` and 0.0 for anything else;
    * encode ``Embarked`` as 1.0, 2.0, ... in order of descending frequency
      within ``df``; missing values receive the code of the most frequent port;
    * fill missing ``Age`` and ``Fare`` values with that column's median;
    * drop ``Survived`` when present, so labelled and unlabelled frames are
      both accepted.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``Cabin``, ``Ticket``, ``Sex``, ``Embarked``, ``Age`` and
        ``Fare``. Any other column is passed through unchanged.

    Returns
    -------
    pd.DataFrame
        New frame carrying the same index as ``df``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.

    Notes
    -----
    The ``Embarked`` codes and the medians are derived from ``df`` itself, so
    two frames whose ports rank differently by frequency are encoded
    differently.
    """
    # drop() returns a new frame, so nothing below touches the caller's data.
    data = df.drop(columns=["Cabin", "Ticket"])

    # Vectorised comparison: independent of the index labels (the previous
    # data["Sex"][i] lookup only worked for a 0..n-1 index).
    data["Sex"] = data["Sex"].eq("male").astype(float)

    # value_counts() skips NaN and sorts by descending frequency, so the most
    # common port always receives code 1.
    port_counts = data["Embarked"].value_counts()
    port_codes = {port: code for code, port in enumerate(port_counts.index, start=1)}
    most_common_code = 1
    data["Embarked"] = (
        data["Embarked"].map(port_codes).fillna(most_common_code).astype(float)
    )

    # Assign the filled column back rather than calling fillna(inplace=True) on
    # data[col]; the chained in-place form is not guaranteed to write through
    # to the frame.
    for col in ("Age", "Fare"):
        data[col] = data[col].fillna(data[col].median())

    return data.drop(columns=["Survived"], errors="ignore")

In [5]:
# Load both splits and strip the identifier columns before any feature work.
train = read_file("train").drop(columns=["PassengerId", "Name"])
test = read_file("test").drop(columns=["PassengerId", "Name"])

In [6]:
# Display the training frame after PassengerId and Name were dropped.
train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.2500,,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,1,female,35.0,1,0,113803,53.1000,C123,S
4,0,3,male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,211536,13.0000,,S
887,1,1,female,19.0,0,0,112053,30.0000,B42,S
888,0,3,female,,1,2,W./C. 6607,23.4500,,S
889,1,1,male,26.0,0,0,111369,30.0000,C148,C


In [7]:
# Display the test frame; at this point it has no Survived column.
test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,male,34.5,0,0,330911,7.8292,,Q
1,3,female,47.0,1,0,363272,7.0000,,S
2,2,male,62.0,0,0,240276,9.6875,,Q
3,3,male,27.0,0,0,315154,8.6625,,S
4,3,female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...
413,3,male,,0,0,A.5. 3236,8.0500,,S
414,1,female,39.0,0,0,PC 17758,108.9000,C105,C
415,3,male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,3,male,,0,0,359309,8.0500,,S


In [8]:
# Attach reference labels to the test frame so it carries a Survived column
# like train does.
# NOTE(review): gender_submission.csv looks like a sample submission rather
# than true outcomes (in the rows displayed, Survived is 1 exactly for the
# female passengers) — confirm before reading downstream metrics as accuracy.
yte = read_file("gender_submission").drop(columns=["PassengerId"])
# Remove any Survived column left by an earlier run so that re-executing this
# cell does not append a duplicate.
test = pd.concat([test.drop(columns=["Survived"], errors="ignore"), yte], axis=1)
test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,3,male,34.5,0,0,330911,7.8292,,Q,0
1,3,female,47.0,1,0,363272,7.0000,,S,1
2,2,male,62.0,0,0,240276,9.6875,,Q,0
3,3,male,27.0,0,0,315154,8.6625,,S,0
4,3,female,22.0,1,1,3101298,12.2875,,S,1
...,...,...,...,...,...,...,...,...,...,...
413,3,male,,0,0,A.5. 3236,8.0500,,S,0
414,1,female,39.0,0,0,PC 17758,108.9000,C105,C,1
415,3,male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0
416,3,male,,0,0,359309,8.0500,,S,0


In [9]:
# Display the labels read from gender_submission.csv (single Survived column).
yte

Unnamed: 0,Survived
0,0
1,1
2,0
3,0
4,1
...,...
413,0
414,1
415,0
416,0


In [10]:
# Feature frames: convert() encodes/imputes and removes the Survived column.
xtr = convert(train)
xte = convert(test)

# Targets as 1-D Series, taken from the same frames the features came from
# (replaces the former no-op `yte = yte`).
ytr = train["Survived"]
yte = test["Survived"]

In [11]:
# Logistic regression with scikit-learn's default settings, fitted on the
# converted training features.
lg_model = LogisticRegression()
lg_model.fit(X=xtr, y=ytr)

In [12]:
# Hard class predictions for the converted test features.
y_preds = lg_model.predict(X=xte)

In [13]:
# Share of test rows where the prediction equals the label in yte.
print(accuracy_score(y_true=yte, y_pred=y_preds))

0.9330143540669856


In [14]:
# F1 and Jaccard scores of the predictions against yte.
print(f"f1 score = {f1_score(y_true=yte, y_pred=y_preds)}")
print(f"jaccard score = {jaccard_score(y_true=yte, y_pred=y_preds)}")

f1 score = 0.9096774193548387
jaccard score = 0.834319526627219


In [15]:
# Per-class probabilities for the test features (input to log_loss below).
yprob = lg_model.predict_proba(X=xte)

In [16]:
# Log loss of the predicted probabilities against yte.
print(f"LogLoss = {log_loss(y_true=yte, y_pred=yprob)}")

LogLoss = 0.28030452057312755


In [17]:
# Per-class precision / recall / F1 summary of the predictions against yte.
print(classification_report(y_true=yte, y_pred=y_preds))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       266
           1       0.89      0.93      0.91       152

    accuracy                           0.93       418
   macro avg       0.93      0.93      0.93       418
weighted avg       0.93      0.93      0.93       418

