In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv("Titanic-Dataset.csv")

# Check for missing values
print(df.isnull().sum())

# Preprocess the data
df.drop(['PassengerId', 'Ticket'], axis=1, inplace=True)
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Embarked'].fillna('S', inplace=True)
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Embarked'] = le.fit_transform(df['Embarked'])

# Extract titles from names
df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
df['Title'] = le.fit_transform(df['Title'])

# Drop the Name column
df.drop('Name', axis=1, inplace=True)

# Feature engineering
df['FamilySize'] = df['SibSp'] + df['Parch']

# Drop Cabin column (too many unique values)
df.drop('Cabin', axis=1, inplace=True)

# Split the data into training and testing sets
X = df.drop(['Survived'], axis=1)
y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Evaluate the model
y_pred = log_reg.predict(X_test)
print("Accuracy (LR):", accuracy_score(y_test, y_pred))
print("Classification Report (LR):")
print(classification_report(y_test, y_pred))
print("Confusion Matrix (LR):")
print(confusion_matrix(y_test, y_pred))
print("AUC-ROC (LR):", roc_auc_score(y_test, y_pred))
print("Precision (LR):", precision_score(y_test, y_pred))
print("Recall (LR):", recall_score(y_test, y_pred))
print("F1-score (LR):", f1_score(y_test, y_pred))

# Try other models
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
print("Accuracy (DT):", accuracy_score(y_test, y_pred_dt))

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Accuracy (RF):", accuracy_score(y_test, y_pred_rf))

svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("Accuracy (SVM):", accuracy_score(y_test, y_pred_svm))

# Cross-validation
scores_lr = cross_val_score(log_reg, X, y, cv=5)
scores_dt = cross_val_score(dt, X, y, cv=5)
scores_rf = cross_val_score(rf, X, y, cv=5)
scores_svm = cross_val_score(svm, X, y, cv=5)

print("Cross-validation scores (LR):", scores_lr)
print("Cross-validation scores (DT):", scores_dt)
print("Cross-validation scores (RF):", scores_rf)
print("Cross-validation scores (SVM):", scores_svm)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
Accuracy (LR): 0.7988826815642458
Classification Report (LR):
              precision    recall  f1-score   support

           0       0.82      0.85      0.83       105
           1       0.77      0.73      0.75        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

Confusion Matrix (LR):
[[89 16]
 [20 54]]
AUC-ROC (LR): 0.7886743886743887
Precision (LR): 0.7714285714285715
Recall (LR): 0.7297297297297297
F1-score (LR): 0.75
Accuracy (DT): 0.7932960893854749
Accuracy (RF): 0.8324022346368715


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy (SVM): 0.7821229050279329


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validation scores (LR): [0.7877095  0.7752809  0.78651685 0.75842697 0.8258427 ]
Cross-validation scores (DT): [0.78212291 0.7752809  0.8258427  0.73595506 0.80337079]
Cross-validation scores (RF): [0.7877095  0.79775281 0.86516854 0.7752809  0.80337079]
Cross-validation scores (SVM): [0.80446927 0.80898876 0.78651685 0.75280899 0.78651685]
