In [71]:
%matplotlib notebook
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
# Load the training data from train.csv in the working directory.
# NOTE(review): the next cell re-reads the same file before using it, so this load looks redundant.
titanic = pd.read_csv("train.csv")



In [68]:

# --- Load and engineer features -------------------------------------------
# Re-read the raw file so this cell always starts from unmodified data.
titanic = pd.read_csv("train.csv")
titanic = titanic.assign(
    # Text between the comma and the first period of Name (presumably the honorific).
    Title=titanic["Name"].str.extract(r',\s*([^\.]*)\.', expand=False).str.strip(),
    # Bucket Age into four labelled groups; rows with a missing Age get a missing AgeGroup.
    AgeGroup=pd.cut(
        titanic["Age"],
        bins=[0, 12, 18, 60, 100],
        labels=['Child', 'Teen', 'Adult', 'Senior'],
    ),
).drop(columns=["Name", "Ticket", "PassengerId"])

# --- Target / features and hold-out split ---------------------------------
y = titanic["Survived"]
X = titanic.drop(columns="Survived")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Preprocessing --------------------------------------------------------
# Only the columns listed here are routed through the ColumnTransformer below.
numerical_columns = ["Age"]
categorical_columns = ["Pclass", "Sex", "Cabin", "Embarked", "Title", "AgeGroup"]

# Numeric columns: fill gaps with the median.
numerical_transform = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Categorical columns: fill gaps with the literal "missing", then one-hot encode.
# Categories unseen during fit are ignored at transform time instead of raising.
categorical_transform = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer([
    ("num", numerical_transform, numerical_columns),
    ("cat", categorical_transform, categorical_columns),
])

# --- Candidate classifiers ------------------------------------------------
models = {
    "LinearSVC": LinearSVC(max_iter=10000),
    "KNN": KNeighborsClassifier(),
    "SVC": SVC(),
    "LogisticRegression": LogisticRegression(max_iter=10000),
}

# --- Compare models -------------------------------------------------------
# For each classifier: 5-fold CV accuracy on the full data, then a fit on the
# training split and metrics on the hold-out split.
for model_name, model in models.items():
    print(f"\n=== {model_name} ===")
    pipeline = Pipeline([("preprocessor", preprocessor), ("classifier", model)])

    scores = cross_val_score(pipeline, X, y, cv=5, scoring="accuracy")
    print(f"Cross-Validation Accuracy: {scores.mean():.4f}")

    # Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)

    print(f"Test Accuracy:  {acc}")
    print(f"Precision:      {prec}")
    print(f"Recall:         {rec}")
    print("Confusion Matrix:")
    print(cm)
  



=== LinearSVC ===
Cross-Validation Accuracy: 0.8013
Test Accuracy:  0.8044692737430168
Precision:      0.7671232876712328
Recall:         0.7567567567567568
Confusion Matrix:
[[88 17]
 [18 56]]

=== KNN ===
Cross-Validation Accuracy: 0.7991
Test Accuracy:  0.8100558659217877
Precision:      0.803030303030303
Recall:         0.7162162162162162
Confusion Matrix:
[[92 13]
 [21 53]]

=== SVC ===
Cross-Validation Accuracy: 0.6420
Test Accuracy:  0.5977653631284916
Precision:      0.625
Recall:         0.06756756756756757
Confusion Matrix:
[[102   3]
 [ 69   5]]

=== LogisticRegression ===
Cross-Validation Accuracy: 0.8048
Test Accuracy:  0.8268156424581006
Precision:      0.7945205479452054
Recall:         0.7837837837837838
Confusion Matrix:
[[90 15]
 [16 58]]


In [None]:

# Tune the regularisation strength of a logistic-regression pipeline with a
# 5-fold grid search on the training split.

# Seed NumPy's global RNG so repeated runs of this cell are comparable.
np.random.seed(42)

pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', LogisticRegression(max_iter=1000))])

# 20 values of C spaced logarithmically from 1e-4 to 1e4, all with the
# liblinear solver.
param_grid = {
    'classifier__C': np.logspace(-4, 4, 20),
    'classifier__solver': ['liblinear']
}

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    verbose=1
)

gs.fit(X_train, y_train)

# A bare expression is only displayed when it is the last line of a cell, so
# the winning parameters are printed explicitly; the best mean CV score stays
# as the cell's output value.
print(f"Best parameters: {gs.best_params_}")
gs.best_score_




Fitting 5 folds for each of 20 candidates, totalling 100 fits


Survived
0    549
1    342
Name: count, dtype: int64

In [84]:
# Class balance of the target: number of rows for each value of Survived.
titanic['Survived'].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [None]:
# Score the rows of test.csv with the best pipeline found by the grid search
# and write the features plus predictions to predictions.csv.
test = pd.read_csv("test.csv")

# Apply the same feature engineering that was applied to the training data.
test['Title'] = test['Name'].str.extract(r',\s*([^\.]*)\.', expand=False).str.strip()
test['AgeGroup'] = pd.cut(test['Age'], bins=[0, 12, 18, 60, 100], labels=['Child', 'Teen', 'Adult', 'Senior'])
test = test.drop(columns=["Name", "Ticket", "PassengerId"])

# Use a dedicated name and an independent copy for the feature matrix:
# - reusing X_test would overwrite the held-out split, leaving it out of sync
#   with y_test for any evaluation code that runs afterwards;
# - a copy keeps the features unchanged when the prediction column is added
#   to `test` below.
X_new = test.copy()

best_model = gs.best_estimator_
predictions = best_model.predict(X_new)

test['Predicted_Survived'] = predictions
test.to_csv("predictions.csv", index=False)

# Predicted class balance, shown as the cell output.
test['Predicted_Survived'].value_counts()

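# The model predicts that about 39% of the passengers in test.csv survived (165 of 418), which is close to
# the 38% survival rate in the training data (342 of 891). This is only a sanity check on the predicted
# class balance; it does not by itself show that the individual predictions are accurate.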

Predicted_Survived
0    253
1    165
Name: count, dtype: int64