In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

def preprocess(df, median_age, median_fare):
    """
    Takes a dataframe and performs all the cleaning and feature engineering steps.
    """

    df_processed = df.copy()

    df_processed['Age'].fillna(median_age, inplace=True)
    df_processed['Fare'].fillna(median_fare, inplace=True)
    df_processed['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

    df_processed['FamilySize'] = df_processed['SibSp'] + df_processed['Parch'] + 1
    df_processed['IsAlone'] = 0
    df_processed.loc[df_processed['FamilySize'] == 1, 'IsAlone'] = 1

    df_processed['HasCabin'] = df_processed['Cabin'].notna().astype(int)

    df_processed['Title'] = df_processed['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df_processed['Title'] = df_processed['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df_processed['Title'] = df_processed['Title'].replace('Mlle', 'Miss')
    df_processed['Title'] = df_processed['Title'].replace('Ms', 'Miss')
    df_processed['Title'] = df_processed['Title'].replace('Mme', 'Mrs')
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    df_processed['Title'] = df_processed['Title'].map(title_mapping)
    df_processed['Title'].fillna(0, inplace=True) #

    age_bins = [0, 12, 18, 60, 100]
    age_labels = ['Child', 'Teen', 'Adult', 'Senior']
    df_processed['AgeGroup'] = pd.cut(df_processed['Age'], bins=age_bins, labels=age_labels)

    age_group_mapping = {'Child': 0, 'Teen': 1, 'Adult': 2, 'Senior': 3}
    df_processed['AgeGroup'] = df_processed['AgeGroup'].map(age_group_mapping)

    df_processed['Sex'] = df_processed['Sex'].map({'male': 0, 'female': 1})
    df_processed = pd.get_dummies(df_processed, columns=['Embarked'], drop_first=True)

    cols_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']
    df_processed.drop(cols_to_drop, axis=1, inplace=True)

    return df_processed

df_train_raw = pd.read_csv('train.csv')


train_median_age = df_train_raw['Age'].median()
train_median_fare = df_train_raw['Fare'].median()

df_train_processed = preprocess(df_train_raw, train_median_age, train_median_fare)

print("--- Processed Training Data Head ---")
print(df_train_processed.head())

X = df_train_processed.drop('Survived', axis=1)
y = df_train_processed['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

gbr = GradientBoostingClassifier(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 4]
}

grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train_scaled, y_train)

print("\n--- Tuning Complete ---")
print("Best Parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}%".format(grid_search.best_score_ * 100))

best_model = grid_search.best_estimator_

y_pred_tuned = best_model.predict(X_test_scaled)
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)

print(f"\nAccuracy on the test set with the tuned model: {accuracy_tuned * 100:.2f}%")

print("\nGenerating submission file for Kaggle...")

df_test_raw = pd.read_csv('test.csv')

passenger_ids = df_test_raw['PassengerId']

df_test_processed = preprocess(df_test_raw, train_median_age, train_median_fare)

X_train_cols = X.columns
df_test_processed = df_test_processed[X_train_cols]

X_test_final_scaled = scaler.transform(df_test_processed)

final_predictions = best_model.predict(X_test_final_scaled)

submission_df = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': final_predictions
})

submission_df.to_csv('final_submission.csv', index=False)

print("Submission file 'final_submission.csv' created successfully!")
print("You are now ready to submit this file to the Titanic competition on Kaggle.")


--- Processed Training Data Head ---
   Survived  Pclass  Sex   Age     Fare  FamilySize  IsAlone  HasCabin  Title  \
0         0       3    0  22.0   7.2500           2        0         0      1   
1         1       1    1  38.0  71.2833           2        0         1      3   
2         1       3    1  26.0   7.9250           1        1         0      2   
3         1       1    1  35.0  53.1000           2        0         1      3   
4         0       3    0  35.0   8.0500           1        1         0      1   

  AgeGroup  Embarked_Q  Embarked_S  
0        2       False        True  
1        2       False       False  
2        2       False        True  
3        2       False        True  
4        2       False        True  
Fitting 5 folds for each of 12 candidates, totalling 60 fits


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['Age'].fillna(median_age, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['Fare'].fillna(median_fare, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are 


--- Tuning Complete ---
Best Parameters found:  {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
Best cross-validation score: 83.43%

Accuracy on the test set with the tuned model: 83.80%

Generating submission file for Kaggle...
Submission file 'final_submission.csv' created successfully!
You are now ready to submit this file to the Titanic competition on Kaggle.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['Age'].fillna(median_age, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['Fare'].fillna(median_fare, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are 