In [1]:
import pandas as pd
# import zipfile
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import warnings

In [2]:
# Install a global filter that ignores every warning raised after this cell runs.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings emitted by the
# libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# with zipfile.ZipFile('C:/Users/shree/Desktop/Kaggle/titanic.zip', 'r') as zip_ref:
#     zip_ref.extractall('titanic')

In [3]:
import os

# Read every CSV found directly inside the 'titanic' folder into a dict
# keyed by the file name (e.g. 'train.csv').
dataframes = {
    csv_name: pd.read_csv(os.path.join('titanic', csv_name))
    for csv_name in os.listdir('titanic')
    if csv_name.endswith('.csv')
}

In [4]:
# Pull the two frames used throughout the notebook out of the loaded dict.
train, test = dataframes['train.csv'], dataframes['test.csv']

In [5]:
# Summarize the training frame: column dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
# Summarize the test frame: column dtypes and non-null counts.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [7]:
# Number of training rows (counted via PassengerId) for each value of Survived.
train.groupby('Survived')['PassengerId'].count()

Survived
0    549
1    342
Name: PassengerId, dtype: int64

In [8]:
# Share of training passengers with Survived == 1, expressed as a percentage.
survived_col = train["Survived"]
print(survived_col.sum() / len(survived_col) * 100)

38.38383838383838


In [9]:
# Number of training rows for every (Sex, Survived) combination.
train.groupby(['Sex','Survived'])['PassengerId'].count()

Sex     Survived
female  0            81
        1           233
male    0           468
        1           109
Name: PassengerId, dtype: int64

In [10]:
# Number of training rows sharing each Ticket value.
train.groupby('Ticket')['PassengerId'].count()

Ticket
110152         3
110413         3
110465         2
110564         1
110813         1
              ..
W./C. 6608     4
W./C. 6609     1
W.E.P. 5734    1
W/C 14208      1
WE/P 5735      2
Name: PassengerId, Length: 681, dtype: int64

In [11]:
# Feature engineering for the training frame.
#
# Missing values are filled by assigning the result back to the column.
# `train[col].fillna(..., inplace=True)` mutates an intermediate Series
# (chained assignment): pandas 2.x warns about it and with Copy-on-Write it
# no longer updates `train` at all, which would silently leave the NaNs in place.
train['Fare'] = train['Fare'].fillna(train['Fare'].median())
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])

# Keep only the first character of Cabin; 'X' marks rows with no cabin value.
# (The former `.replace("n", "U")` could never match once NaNs are filled with 'X'.)
train['Cabin'] = train['Cabin'].fillna('X').str[0]

# Family features derived from sibling/spouse and parent/child counts.
train["FamilySize"] = train["SibSp"] + train["Parch"] + 1
train["IsAlone"] = (train["FamilySize"] == 1).astype(int)

# Title is the word that directly precedes a '.' in Name; several titles are then
# folded into 'Miss' and the remaining listed ones into 'Rare'.
train['Title'] = train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
train['Title'] = train['Title'].replace(['Mlle', 'Ms', 'Lady', 'Countess', 'Mme', 'Dona'], 'Miss')
train['Title'] = train['Title'].replace(['Rev', 'Don', 'Capt', 'Dr', 'Major', 'Sir', 'Col', 'Jonkheer'], 'Rare')

# Fill missing ages with the median age of the passenger's (Title, Pclass) group.
train['Age'] = train.groupby(["Title","Pclass"])['Age'].transform(lambda x: x.fillna(x.median()))
# Fallback: a group with no observed ages (or a row without a Title) would still be
# NaN here, so use the overall median for whatever is left.
train['Age'] = train['Age'].fillna(train['Age'].median())

# Boolean flag: title is Master/Miss and age is below 18.
train['Is_Child'] = (train['Title'].isin(['Master', 'Miss'])) & (train['Age'] < 18)

In [12]:
# Feature engineering for the test frame (mirrors the training-frame cell).
#
# Missing values are filled by assigning the result back to the column.
# `test[col].fillna(..., inplace=True)` mutates an intermediate Series
# (chained assignment): pandas 2.x warns about it and with Copy-on-Write it
# no longer updates `test` at all, which would silently leave the NaNs in place.
# NOTE(review): the fill values below are computed from the test frame itself,
# not from the training frame — confirm that this is intended.
test['Fare'] = test['Fare'].fillna(test['Fare'].median())
test['Embarked'] = test['Embarked'].fillna(test['Embarked'].mode()[0])

# Keep only the first character of Cabin; 'X' marks rows with no cabin value.
# (The former `.replace("n", "U")` could never match once NaNs are filled with 'X'.)
test['Cabin'] = test['Cabin'].fillna('X').str[0]

# Family features derived from sibling/spouse and parent/child counts.
test["FamilySize"] = test["SibSp"] + test["Parch"] + 1
test["IsAlone"] = (test["FamilySize"] == 1).astype(int)

# Title is the word that directly precedes a '.' in Name; several titles are then
# folded into 'Miss' and the remaining listed ones into 'Rare'.
test['Title'] = test['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
test['Title'] = test['Title'].replace(['Mlle', 'Ms', 'Lady', 'Countess', 'Mme', 'Dona'], 'Miss')
test['Title'] = test['Title'].replace(['Rev', 'Don', 'Capt', 'Dr', 'Major', 'Sir', 'Col', 'Jonkheer'], 'Rare')

# Fill missing ages with the median age of the passenger's (Title, Pclass) group.
test['Age'] = test.groupby(["Title","Pclass"])['Age'].transform(lambda x: x.fillna(x.median()))
# Fallback: a group with no observed ages (or a row without a Title) would still be
# NaN here, so use the overall median for whatever is left.
test['Age'] = test['Age'].fillna(test['Age'].median())

# Boolean flag: title is Master/Miss and age is below 18.
test['Is_Child'] = (test['Title'].isin(['Master', 'Miss'])) & (test['Age'] < 18)

In [13]:
# Names of the feature columns fed to the models, and of the target column.
X = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',   # columns present in the raw data
    'Cabin', 'Embarked',                                # filled / shortened categoricals
    'Title', 'Is_Child', 'FamilySize', 'IsAlone',       # engineered columns
]
Y = 'Survived'

In [14]:
# Feature matrix and target taken from the training frame.
X0_train, Y0_train = train.loc[:, X], train.loc[:, Y]

# The same feature columns taken from the test frame.
X1_test = test.loc[:, X]

# Row counts of features and target (shown as the cell output).
len(X0_train), len(Y0_train)

(891, 891)

In [15]:
# Column set (and order) kept for modelling.
# An earlier, now disabled, variant of this cell used 'Age_buckets' in place of 'Age'
# and did not include 'FamilySize' / 'IsAlone'.
model_columns = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    'Cabin', 'Embarked', 'Title', 'Is_Child', 'FamilySize', 'IsAlone',
]
X0_train = X0_train[model_columns]
X1_test = X1_test[model_columns]

# Row counts of both feature frames (shown as the cell output).
len(X0_train), len(X1_test)

(891, 418)

In [16]:
# Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

encoder.fit(X0_train[['Sex', 'Cabin', 'Embarked', 'Title']])

# Fit and transform the categorical columns
one_hot_encoded = encoder.transform(X0_train[['Sex', 'Cabin', 'Embarked', 'Title']])

one_hot_encoded_X1_test = encoder.transform(X1_test[['Sex', 'Cabin', 'Embarked', 'Title']])

# Create a DataFrame with the encoded columns
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out())

one_hot_df_X1_test = pd.DataFrame(one_hot_encoded_X1_test, columns=encoder.get_feature_names_out())

# Concatenate the one-hot encoded columns with the original DataFrame
X0_train_encoded = pd.concat([X0_train.drop(['Sex', 'Cabin', 'Embarked', 'Title'], axis=1).reset_index(drop=True), one_hot_df.reset_index(drop=True)], axis=1)
X1_test_encoded = pd.concat([X1_test.drop(['Sex', 'Cabin', 'Embarked', 'Title'], axis=1).reset_index(drop=True), one_hot_df_X1_test.reset_index(drop=True)], axis=1)

In [17]:
# Sanity check: the encoded training array's shape and the number of generated
# feature names (the column count should equal the second printed number).
print(one_hot_encoded.shape)
print(len(encoder.get_feature_names_out()))

(891, 19)
19


In [18]:
# Row counts of the encoded training and test feature frames.
len(X0_train_encoded), len(X1_test_encoded)

(891, 418)

In [19]:
# Hold out 20% of the labelled rows for validation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X0_train_encoded,
    Y0_train,
    test_size=0.20,
    random_state=12345,
)

In [20]:
# Baseline model: random forest with 100 trees of depth at most 5 and a fixed seed.
model0 = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=1,
)

In [21]:
# Train the random forest on the training split.
model0.fit(X_train, y_train)

# Predicted labels for the hold-out split, converted to a list of plain ints.
y_pred = model0.predict(X_test)
predictions = list(map(round, y_pred))

# Hold-out accuracy.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Predicted labels for the encoded test frame.
y1_pred = model0.predict(X1_test_encoded)
predictions1 = list(map(round, y1_pred))

# Hold-out ROC AUC, computed from the second column of predict_proba.
y_pred_proba = model0.predict_proba(X_test)[:, 1]
print("Valid",roc_auc_score(y_test,  y_pred_proba))

Accuracy: 81.01%
Valid 0.8472329472329473


In [22]:
# CatBoost classifier with library-default hyper-parameters; verbose=0 is passed
# to keep training output quiet.
model1 = CatBoostClassifier(verbose=0)

In [28]:
# Train the default CatBoost model on the training split.
model1.fit(X_train, y_train)

# Predicted labels for the hold-out split, converted to a list of plain ints.
y_pred = model1.predict(X_test)
predictions = list(map(round, y_pred))

# Hold-out accuracy.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Predicted labels for the encoded test frame.
y1_pred = model1.predict(X1_test_encoded)
predictions1 = list(map(round, y1_pred))

# Feature importances computed by CatBoost on the training split.
feature_importances = model1.get_feature_importance(Pool(X_train, label = y_train))
feature_names = X_train.columns

# One row per feature, most important first.
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
).sort_values(by="Importance", ascending=False)

Accuracy: 80.45%


In [24]:
# Candidate values per hyper-parameter; every combination is one grid-search candidate
# (3 depths x 3 learning rates x 2 iteration counts = 18).
param_grid = dict(
    depth=[4, 6, 8],
    learning_rate=[0.01, 0.05, 0.1],
    iterations=[100, 200],
)

In [25]:
# Cross-validation setup: 5 shuffled, stratified folds with a fixed seed.
folds = 5
# NOTE(review): param_comb is not referenced by any cell visible here.
param_comb = 1000
skf = StratifiedKFold(
    n_splits=folds,
    shuffle=True,
    random_state=1001,
)

In [26]:
# Exhaustive search over `param_grid` for the CatBoost model, scored by ROC AUC.
# The splitter object itself is passed as `cv` rather than the one-shot generator
# returned by `skf.split(...)`: GridSearchCV calls `skf.split(X_train, y_train)` on
# its own (same folds, because `skf` has a fixed random_state), and `grid` can then
# be fitted again without running into an exhausted generator.
grid = GridSearchCV(estimator=model1, param_grid=param_grid, scoring='roc_auc', n_jobs=4, cv=skf, verbose=3)
grid.fit(X_train, y_train)
print('\n All results:')
print(grid.cv_results_)
print('\n Best estimator:')
print(grid.best_estimator_)
# `best_score_` is the mean cross-validated value of the scoring metric (ROC AUC here).
print('\n Best score (ROC AUC):')
print(grid.best_score_)
# The value previously printed under "Best score" was this AUC-derived quantity,
# so it now gets its own, accurate label.
print('\n Best normalized Gini (2 * AUC - 1):')
print(grid.best_score_ * 2 - 1)
print('\n Best parameters:')
print(grid.best_params_)
# Tabular copy of the per-candidate results for later inspection.
results = pd.DataFrame(grid.cv_results_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits

 All results:
{'mean_fit_time': array([0.22581415, 0.09981103, 0.10105505, 0.1784317 , 0.17909002,
       0.18311567, 0.14266725, 0.14824605, 0.14683485, 0.25794291,
       0.26098604, 0.26483531, 0.23887868, 0.26506777, 0.25743461,
       0.48057399, 0.52018733, 0.52124805]), 'std_fit_time': array([0.06349005, 0.00625912, 0.00364487, 0.00499654, 0.00353631,
       0.00372998, 0.00592451, 0.00370319, 0.00430498, 0.00197092,
       0.00840887, 0.00547405, 0.00674122, 0.00447254, 0.00740108,
       0.01368157, 0.01520493, 0.01122116]), 'mean_score_time': array([0.00344534, 0.00351715, 0.00327768, 0.00314312, 0.00295   ,
       0.00279393, 0.00297594, 0.00294762, 0.00311146, 0.0031951 ,
       0.00316572, 0.00310359, 0.00316262, 0.00293279, 0.00334501,
       0.00293436, 0.0036202 , 0.00310736]), 'std_score_time': array([0.00077501, 0.00040489, 0.00034121, 0.00050561, 0.0004858 ,
       0.00048941, 0.00048421, 0.00066369, 0.000

In [23]:
# CatBoost classifier with explicitly fixed hyper-parameters.
# NOTE(review): presumably these are the grid-search winners — confirm against
# grid.best_params_.
model1_1 = CatBoostClassifier(
    depth=4,
    iterations=200,
    learning_rate=0.05,
    verbose=0,
)

In [24]:
# Train the fixed-parameter CatBoost model on the training split.
model1_1.fit(X_train, y_train)

# Predicted labels for the hold-out split, converted to a list of plain ints.
y_pred = model1_1.predict(X_test)
predictions = list(map(round, y_pred))

# Hold-out accuracy.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Predicted labels for the encoded test frame.
y1_pred = model1_1.predict(X1_test_encoded)
predictions1 = list(map(round, y1_pred))

# Feature importances computed by CatBoost on the training split.
feature_importances = model1_1.get_feature_importance(Pool(X_train, label = y_train))
feature_names = X_train.columns

# One row per feature, most important first.
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
).sort_values(by="Importance", ascending=False)

Accuracy: 79.89%


In [25]:
# Show the feature-importance table (already sorted in descending order of importance).
print(feature_importance_df)

         Feature  Importance
24      Title_Mr   15.206488
1            Age   13.378852
0         Pclass   11.724882
4           Fare    9.692597
9       Sex_male    9.572254
8     Sex_female    8.809809
6     FamilySize    6.827408
2          SibSp    4.306505
18       Cabin_X    3.834336
22  Title_Master    2.319374
14       Cabin_E    2.292321
25     Title_Mrs    1.924685
21    Embarked_S    1.736390
3          Parch    1.480672
5       Is_Child    1.353554
13       Cabin_D    1.003656
19    Embarked_C    0.722495
12       Cabin_C    0.641809
7        IsAlone    0.598882
26    Title_Rare    0.563218
11       Cabin_B    0.516261
20    Embarked_Q    0.498731
23    Title_Miss    0.439665
10       Cabin_A    0.254237
15       Cabin_F    0.169066
16       Cabin_G    0.130336
17       Cabin_T    0.001518


In [26]:
# Ensemble Model
model2 = LogisticRegression(max_iter=200)

ensemble = VotingClassifier(estimators=[
    ("catboost", model1_1),
    ("rf", model0),
    ("lr", model2)
], voting="hard")

ensemble.fit(X_train, y_train)

# make predictions for test data
y_pred = ensemble.predict(X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# make predictions for test data
y1_pred = ensemble.predict(X1_test_encoded)
predictions1 = [round(value) for value in y1_pred]

Accuracy: 82.12%


In [29]:
# Pair each test PassengerId with its predicted label and write the result to CSV
# without the index column.
output = pd.DataFrame(
    {
        'PassengerId': test['PassengerId'],
        'Survived': predictions1,
    }
)
output.to_csv('submission_titanic_cat_v4.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
