In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score,confusion_matrix, classification_report


import warnings
warnings.filterwarnings("ignore")

In [3]:
file_path_train = "/kaggle/input/titanic/train.csv"
file_path_test = "/kaggle/input/titanic/test.csv"

df = pd.read_csv(file_path_train)
df_test = pd.read_csv(file_path_test)

In [4]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [5]:
df = df.drop(columns=["Name", "Ticket", "Cabin"], axis=1)
df_test = df_test.drop(columns=["Name", "Ticket", "Cabin"], axis=1)

In [6]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
df["Sex"].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [8]:
df["Embarked"].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [9]:
# check for class imbalance 
df["Survived"].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [10]:
# encode categorical values

ctg_data_train= df[["Sex", 'Embarked']]
ctg_encoded_train = pd.get_dummies(ctg_data_train).astype(int)
df = pd.concat([df, ctg_encoded_train], axis=1)

In [11]:
ctg_data_test = df_test[['Sex', 'Embarked']]
ctg_encoded_test = pd.get_dummies(ctg_data_test).astype(int)
df_test = pd.concat([df_test, ctg_encoded_test], axis=1)

In [12]:
# impute missing values (train)

imputer = KNNImputer(n_neighbors=5)
X = df.drop(columns=["Embarked", "Sex"], axis=1)
X_imp = imputer.fit_transform(X)
X = pd.DataFrame(columns=X.columns, data=X_imp)

# adding imputed age values to the original dataset (df)

df = X.copy()

X = None

In [13]:
df_test

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,3,male,34.5,0,0,7.8292,Q,0,1,0,1,0
1,893,3,female,47.0,1,0,7.0000,S,1,0,0,0,1
2,894,2,male,62.0,0,0,9.6875,Q,0,1,0,1,0
3,895,3,male,27.0,0,0,8.6625,S,0,1,0,0,1
4,896,3,female,22.0,1,1,12.2875,S,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,male,,0,0,8.0500,S,0,1,0,0,1
414,1306,1,female,39.0,0,0,108.9000,C,1,0,1,0,0
415,1307,3,male,38.5,0,0,7.2500,S,0,1,0,0,1
416,1308,3,male,,0,0,8.0500,S,0,1,0,0,1


In [14]:
# impute missing values (test)

imputer = KNNImputer(n_neighbors=5)
Y = df_test.drop(columns=["Embarked", "Sex"], axis=1)
Y_imp = imputer.fit_transform(Y)
Y = pd.DataFrame(columns=Y.columns, data=Y_imp)

df_test = Y.copy()

Y = None

In [15]:
len(df)

891

In [16]:
len(df_test)

418

In [17]:
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1.0,0.0,3.0,22.0,1.0,0.0,7.25,0.0,1.0,0.0,0.0,1.0
1,2.0,1.0,1.0,38.0,1.0,0.0,71.2833,1.0,0.0,1.0,0.0,0.0
2,3.0,1.0,3.0,26.0,0.0,0.0,7.925,1.0,0.0,0.0,0.0,1.0
3,4.0,1.0,1.0,35.0,1.0,0.0,53.1,1.0,0.0,0.0,0.0,1.0
4,5.0,0.0,3.0,35.0,0.0,0.0,8.05,0.0,1.0,0.0,0.0,1.0


In [18]:
df_test.head(5)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892.0,3.0,34.5,0.0,0.0,7.8292,0.0,1.0,0.0,1.0,0.0
1,893.0,3.0,47.0,1.0,0.0,7.0,1.0,0.0,0.0,0.0,1.0
2,894.0,2.0,62.0,0.0,0.0,9.6875,0.0,1.0,0.0,1.0,0.0
3,895.0,3.0,27.0,0.0,0.0,8.6625,0.0,1.0,0.0,0.0,1.0
4,896.0,3.0,22.0,1.0,1.0,12.2875,1.0,0.0,0.0,0.0,1.0


In [19]:
# normalizing the data for train and test

df.iloc[:, 1:] = MinMaxScaler().fit_transform(df.iloc[:, 1:])

df_test.iloc[:, 1:] = MinMaxScaler().fit_transform(df_test.iloc[:, 1:])

In [20]:
X = df.drop("Survived", axis=1)
y = df["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
lr_model = LogisticRegression()
lr_model.fit(X_train.drop("PassengerId", axis=1), y_train)

pred = lr_model.predict(X_test.drop("PassengerId", axis=1))
accuracy_score(y_test, pred)

0.7932960893854749

In [22]:
print(accuracy_score(y_test, pred))
print(precision_score(y_test, pred))
print(recall_score(y_test, pred))

0.7932960893854749
0.7605633802816901
0.7297297297297297


In [23]:
lgr_model =  LogisticRegression(solver='liblinear', max_iter=20000)

param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.9, 0.95, 1, 1.5, 1.55, 2, 2.5, 2.55, 3, 3.5, 3.55, 4, 4.5, 4.55]
}

g_search = GridSearchCV(lgr_model, param_grid, cv=5, scoring="accuracy",n_jobs=-1)
g_search.fit(X_train.drop("PassengerId", axis=1), y_train)
g_search.best_params_

{'C': 4, 'penalty': 'l1'}

In [24]:
best_params = g_search.best_params_

lgr_best_model = g_search.best_estimator_

y_pred = lgr_best_model.predict(X_test.drop("PassengerId", axis=1))

In [25]:
print('Confusion Matrix: "\n"', confusion_matrix(y_test, y_pred))
print("\n")

print(f'Accuracy: {round(accuracy_score(y_test, y_pred), 3)} %')
print("\n")

print('Classification Report: \n', classification_report(y_test, y_pred))

Confusion Matrix: "
" [[90 15]
 [20 54]]


Accuracy: 0.804 %


Classification Report: 
               precision    recall  f1-score   support

         0.0       0.82      0.86      0.84       105
         1.0       0.78      0.73      0.76        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.80      0.80      0.80       179



In [26]:
y_pred_final = lgr_best_model.predict(df_test.drop("PassengerId", axis=1))
df_test["Survived"] = y_pred_final