In [None]:
import os
import warnings

import numpy as np
import pandas as pd
# import zipfile
from catboost import Pool, CatBoostClassifier
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
# Optional third-party models; re-enable the matching import before running their cells.
# from xgboost import XGBClassifier
# import lightgbm as lgb
# from keras import Sequential
# from keras.layers import Dense

In [None]:
# Ignore every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides deprecation / FutureWarning messages raised by the
# libraries used in later cells.
warnings.filterwarnings('ignore')

In [None]:
# with zipfile.ZipFile('titanic.zip', 'r') as zip_ref:
#     zip_ref.extractall('titanic')

In [None]:
# Read every CSV file in the data directory into a dict keyed by file name
# (e.g. dataframes['train.csv']). `os` is imported in the import cell.
DATA_DIR = 'titanic'
dataframes = {
    filename: pd.read_csv(os.path.join(DATA_DIR, filename))
    # sorted() keeps the load order independent of the directory listing order
    for filename in sorted(os.listdir(DATA_DIR))
    if filename.endswith('.csv')
}

In [None]:
# Bind the two frames used by the rest of the notebook.
train, test = (dataframes[name] for name in ('train.csv', 'test.csv'))

In [None]:
# Print the column summary of the training frame.
train.info()

In [None]:
# Print the column summary of the test frame.
test.info()

In [None]:
# Number of non-null PassengerId values for each value of Survived.
train.groupby('Survived')['PassengerId'].count()

In [None]:
# Sum of the Survived column divided by the row count, as a percentage
# (the survival rate, assuming Survived is coded 0/1).
print(sum(train["Survived"]) / len(train) * 100)

In [None]:
# Number of non-null PassengerId values for each (Sex, Survived) combination.
train.groupby(['Sex','Survived'])['PassengerId'].count()

In [None]:
# Number of non-null PassengerId values sharing each Ticket value.
train.groupby('Ticket')['PassengerId'].count()

In [None]:
# --- Missing values (training frame) ---
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# train[col]: the chained in-place form acts on an intermediate Series and is
# not guaranteed to update `train` (it is a no-op under pandas Copy-on-Write).
train['Age'] = train['Age'].fillna(train['Age'].median())
train['Fare'] = train['Fare'].fillna(train['Fare'].median())
train['Embarked'] = train['Embarked'].fillna('X')
# Missing cabins become 'X'; only the first character of the cabin code is kept.
train['Cabin'] = train['Cabin'].fillna('X').str[0]

# --- Engineered features ---
# Title: the word that follows a space and ends with a period inside Name.
train['Title'] = train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Collapse the listed variants into 'Miss' and the listed honorifics into 'Rare'.
train['Title'] = train['Title'].replace(['Mlle', 'Ms', 'Lady', 'Countess', 'Mme', 'Dona'], 'Miss')
train['Title'] = train['Title'].replace(['Rev', 'Don', 'Capt', 'Major', 'Sir', 'Col', 'Jonkheer'], 'Rare')
# Is_Child: title is Master/Miss AND (imputed) age is under 18.
train['Is_Child'] = (train['Title'].isin(['Master', 'Miss'])) & (train['Age'] < 18)

In [None]:
# --- Missing values (test frame) ---
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# test[col]: the chained in-place form acts on an intermediate Series and is
# not guaranteed to update `test` (it is a no-op under pandas Copy-on-Write).
# NOTE(review): Age/Fare are imputed with the test frame's own medians rather
# than the training medians — confirm this is intended.
test['Age'] = test['Age'].fillna(test['Age'].median())
test['Fare'] = test['Fare'].fillna(test['Fare'].median())
test['Embarked'] = test['Embarked'].fillna('X')
# Missing cabins become 'X'; only the first character of the cabin code is kept.
test['Cabin'] = test['Cabin'].fillna('X').str[0]

# --- Engineered features (same rules as the training frame) ---
# Title: the word that follows a space and ends with a period inside Name.
test['Title'] = test['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Collapse the listed variants into 'Miss' and the listed honorifics into 'Rare'.
test['Title'] = test['Title'].replace(['Mlle', 'Ms', 'Lady', 'Countess', 'Mme', 'Dona'], 'Miss')
test['Title'] = test['Title'].replace(['Rev', 'Don', 'Capt', 'Major', 'Sir', 'Col', 'Jonkheer'], 'Rare')
# Is_Child: title is Master/Miss AND (imputed) age is under 18.
test['Is_Child'] = (test['Title'].isin(['Master', 'Miss'])) & (test['Age'] < 18)

In [None]:
# Input columns: raw passenger fields plus the engineered Title / Is_Child features.
X = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
    'Fare', 'Cabin', 'Embarked', 'Title', 'Is_Child',
]
# Target column.
Y = 'Survived'

In [None]:
# Feature frames are explicit copies: a later cell adds a column to them, and a
# copy makes it unambiguous that `train` / `test` are not affected.
X0_train = train[X].copy()
Y0_train = train[Y]

X1_test = test[X].copy()
# Displayed as the cell output: features and target must have the same length.
len(X0_train), len(Y0_train)

In [None]:
# Age bin edges and the label for each of the eight resulting intervals.
Bucket = [0, 10, 18, 25, 30, 40, 60, 80, 100]
Label = ['0-10', '11-18', '19-25',
         '26-30', '31-40', '41-60', '61-80', '>80']


def _bucket_ages(ages):
    """Bin an Age series into the labelled intervals defined by Bucket / Label."""
    return pd.cut(ages, bins=Bucket, labels=Label, include_lowest=True)


# The column selection in the following cell does not keep Age_buckets unless
# its commented-out alternative is enabled.
X0_train['Age_buckets'] = _bucket_ages(X0_train['Age'])
X1_test['Age_buckets'] = _bucket_ages(test['Age'])

In [None]:
# Final model columns. The continuous 'Age' is used here; replace 'Age' with
# 'Age_buckets' in this list to model on the binned ages instead.
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'Is_Child']

X0_train = X0_train[feature_cols]
X1_test = X1_test[feature_cols]

In [None]:
# Columns treated as categorical; every other column passes through unchanged.
categorical_cols = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'Title', 'Is_Child']

# Dense output; categories that were not seen during fit are ignored at transform time.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Learn the categories from the training rows only, then encode both frames.
encoder.fit(X0_train[categorical_cols])
one_hot_encoded = encoder.transform(X0_train[categorical_cols])
one_hot_encoded_X1_test = encoder.transform(X1_test[categorical_cols])

# Wrap the encoded arrays in DataFrames that carry the generated column names.
encoded_col_names = encoder.get_feature_names_out()
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoded_col_names)
one_hot_df_X1_test = pd.DataFrame(one_hot_encoded_X1_test, columns=encoded_col_names)

# Replace the categorical columns with their one-hot expansion; indexes are
# reset so the two halves line up row by row.
X0_train_encoded = pd.concat(
    [
        X0_train.drop(categorical_cols, axis=1).reset_index(drop=True),
        one_hot_df.reset_index(drop=True),
    ],
    axis=1,
)
X1_test_encoded = pd.concat(
    [
        X1_test.drop(categorical_cols, axis=1).reset_index(drop=True),
        one_hot_df_X1_test.reset_index(drop=True),
    ],
    axis=1,
)

In [None]:
# Sanity check: shape of the encoded training matrix, then the number of
# generated one-hot column names.
for summary in (one_hot_encoded.shape, len(encoder.get_feature_names_out())):
    print(summary)

In [None]:
# Hold out 20% of the encoded training rows for validation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X0_train_encoded,
    Y0_train,
    test_size=0.20,
    random_state=12345,
)

In [None]:
# Random forest: 100 trees of depth at most 5, fixed seed.
model0 = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=1,
)

In [None]:
# Fit the random forest on the training split.
model0.fit(X_train, y_train)

# Hold-out class predictions and accuracy.
y_pred = model0.predict(X_test)
predictions = list(map(round, y_pred))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Class predictions for the test frame.
y1_pred = model0.predict(X1_test_encoded)
predictions1 = list(map(round, y1_pred))

# Hold-out ROC AUC, using the second column of predict_proba.
y_pred_proba = model0.predict_proba(X_test)[:, 1]
print("Valid",roc_auc_score(y_test,  y_pred_proba))

In [None]:
# XGBoost classifier with hand-set hyper-parameters.
# NOTE(review): `from xgboost import XGBClassifier` is commented out in the
# import cell, so this cell raises NameError unless that import is re-enabled.
# NOTE(review): both `alpha` and `reg_alpha` are passed — presumably they name
# the same L1 regularisation term and only one is intended; verify.
# NOTE(review): `silent` and `seed` are passed alongside `verbosity` and
# `random_state` — confirm they are accepted by the installed xgboost version.
model1 = XGBClassifier(
    alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0.0,
              learning_rate=0.11, max_delta_step=0.5, max_depth=15,
              min_child_weight=2, missing=np.nan, n_estimators=300, n_jobs=1,
              objective='binary:logistic', random_state=0,
              reg_alpha=0.3, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=False, subsample=0.8, verbosity=1
)

In [None]:
# Search space for the grid search over model1.
# NOTE(review): 'iterations' is the name used in the CatBoost grid further
# down; confirm the estimator searched with this grid accepts it.
params = {
    'min_child_weight': list(range(1, 11)),
    'gamma': [0, 0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5],
    'iterations': [10, 20],
    'learning_rate': [0.1, 0.01, 0.001],
}

In [None]:
# Cross-validation setup: shuffled, stratified folds with a fixed seed.
folds = 5
# Not referenced by any visible cell.
param_comb = 1000
skf = StratifiedKFold(
    n_splits=folds,
    shuffle=True,
    random_state=1001,
)

In [None]:
# Exhaustive search over `params` for model1, scored by cross-validated ROC AUC
# on the full encoded training set.
grid = GridSearchCV(
    estimator=model1,
    param_grid=params,
    scoring='roc_auc',
    n_jobs=4,
    # Pass the splitter itself; GridSearchCV calls split() on the data given to
    # fit(), so the folds are the same and the object stays reusable.
    cv=skf,
    verbose=3,
)
grid.fit(X0_train_encoded, Y0_train)
print('\n All results:')
print(grid.cv_results_)
print('\n Best estimator:')
print(grid.best_estimator_)
# The printed value is 2 * best_score_ - 1, not the raw ROC AUC, so the label says so.
print('\n Best score (Gini = 2 * ROC AUC - 1):')
print(grid.best_score_ * 2 - 1)
print('\n Best parameters:')
print(grid.best_params_)
# Tabular view of the per-candidate results for later inspection.
results = pd.DataFrame(grid.cv_results_)

In [None]:
# Fit model1 (with the hyper-parameters it was constructed with) on the training split.
model1.fit(X_train, y_train)

# Hold-out class predictions and accuracy.
y_pred = model1.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Test-frame probabilities come from the grid-search winner, not from model1.
# NOTE(review): the validation metrics in this cell describe model1 while these
# probabilities describe grid.best_estimator_ — confirm the mix is intended.
y1_pred = grid.best_estimator_.predict_proba(X1_test_encoded)
results_df = pd.DataFrame(data={'PassengerId':test['PassengerId'], 'Survived':y1_pred[:,1]})

# Hold-out ROC AUC, using the second column of predict_proba.
# roc_auc_score is already imported in the import cell.
y_pred_proba = model1.predict_proba(X_test)[:, 1]
print("Valid",roc_auc_score(y_test,  y_pred_proba))

In [None]:
# AdaBoost over unrestricted decision trees. AdaBoostClassifier and `tree` are
# provided by the import cell.
model2 = AdaBoostClassifier(
    estimator=tree.DecisionTreeClassifier(
        class_weight=None, criterion='gini', max_depth=None,
        max_features=None, max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_samples_leaf=1, min_samples_split=2,
        min_weight_fraction_leaf=0.0,
        random_state=None, splitter='best',
    ),
    # Fixed seed so repeated runs give the same model, like model0 and the data split.
    random_state=1,
)

In [None]:
# Fit the AdaBoost model on the training split.
model2.fit(X_train, y_train)

# Hold-out class predictions and accuracy.
y_pred = model2.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Class predictions for the test frame.
y1_pred = model2.predict(X1_test_encoded)
predictions1 = [round(value) for value in y1_pred]

# Hold-out ROC AUC, using the second column of predict_proba.
# roc_auc_score is already imported in the import cell.
y_pred_proba = model2.predict_proba(X_test)[:, 1]
print("Valid",roc_auc_score(y_test,  y_pred_proba))

In [None]:
# Single decision tree with default settings; `tree` is provided by the import
# cell. Fixed seed so repeated runs give the same tree, like model0.
model3 = tree.DecisionTreeClassifier(random_state=1)

In [None]:
# Fit the decision tree on the training split.
model3.fit(X_train, y_train)

# Hold-out class predictions and accuracy.
y_pred = model3.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Class predictions for the test frame.
y1_pred = model3.predict(X1_test_encoded)
predictions1 = [round(value) for value in y1_pred]

# Hold-out ROC AUC, using the second column of predict_proba.
# roc_auc_score is already imported in the import cell.
y_pred_proba = model3.predict_proba(X_test)[:, 1]
print("Valid",roc_auc_score(y_test,  y_pred_proba))

In [None]:
# LightGBM classifier with fixed hyper-parameters.
# NOTE(review): `import lightgbm as lgb` is commented out in the import cell,
# so this cell raises NameError unless that import is re-enabled.
model4 = lgb.LGBMClassifier(
    colsample_bytree=0.952164731370897,
    min_child_samples=111,
    min_child_weight=0.01,
    num_leaves=38,
    reg_alpha=0,
    reg_lambda=0.1,
    subsample=0.3029313662262354,
)

In [None]:
# Fit the LightGBM model on the training split.
model4.fit(X_train, y_train)

# Hold-out class predictions and accuracy.
y_pred = model4.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Class predictions for the test frame.
y1_pred = model4.predict(X1_test_encoded)
predictions1 = [round(value) for value in y1_pred]

# Hold-out ROC AUC, using the second column of predict_proba.
# roc_auc_score is already imported in the import cell.
y_pred_proba = model4.predict_proba(X_test)[:, 1]
print("Valid",roc_auc_score(y_test,  y_pred_proba))

In [None]:
# Fully connected network for binary classification.
# NOTE(review): the keras imports (Sequential, Dense) are commented out in the
# import cell, so this cell raises NameError unless they are re-enabled.
model5 = Sequential()
# First hidden layer. The input width is read from the training matrix so the
# network always matches the current number of encoded columns (it used to be
# hard-coded to 729).
model5.add(Dense(729, activation='tanh', kernel_initializer='random_normal', input_dim=X_train.shape[1]))
# Second hidden layer
model5.add(Dense(360, activation='tanh', kernel_initializer='random_normal'))
# Third hidden layer
model5.add(Dense(120, activation='tanh', kernel_initializer='random_normal'))
# Fourth hidden layer
model5.add(Dense(60, activation='tanh', kernel_initializer='random_normal'))
# Fifth hidden layer (sigmoid activation)
model5.add(Dense(14, activation='sigmoid', kernel_initializer='random_normal'))
# Output layer: one sigmoid unit
model5.add(Dense(1, activation='sigmoid', kernel_initializer='random_normal'))
# Compile the network
model5.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])

# Fit on the training split
model5.fit(X_train,y_train, batch_size=50, epochs=25)

# Note: this evaluates on the same rows the network was trained on.
eval_model=model5.evaluate(X_train, y_train)

# Hold-out predictions, thresholded at 0.7 to obtain booleans.
y_pred =model5.predict(X_test)
y_pred =(y_pred>0.7)

# confusion_matrix is imported in the import cell.
cm1 = confusion_matrix(y_test, y_pred)
print(cm1)

In [None]:
# Predict on the test frame and round the first value of each output row to 0/1.
# NOTE(review): round() cuts at 0.5, whereas the validation cell for this model
# thresholds at 0.7 — confirm which cut-off is intended.
y1_pred = model5.predict(X1_test_encoded)
predictions1 = [round(row[0]) for row in y1_pred.tolist()]

In [None]:
# CatBoost classifier working on the un-encoded frame; the listed columns are
# declared categorical and training output is silenced.
model6 = CatBoostClassifier(
    cat_features=['Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'Title', 'Is_Child'],
    verbose=0,
)

In [None]:
# Same 80/20 split settings as for the encoded data, applied to the un-encoded
# feature frame used by CatBoost.
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
    X0_train,
    Y0_train,
    test_size=0.20,
    random_state=12345,
)

In [None]:
# Columns handed to CatBoost as categorical features.
cat_cols = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'Title', 'Is_Child']

# Fit on the training split.
model6.fit(X_train_cat, y_train_cat, cat_features=cat_cols)

# Hold-out class predictions and accuracy.
y_pred = model6.predict(X_test_cat)
predictions = list(map(round, y_pred))
accuracy = accuracy_score(y_test_cat, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Class predictions for the test frame.
y1_pred = model6.predict(X1_test)
predictions1 = list(map(round, y1_pred))

# Feature importance computed on the training split.
train_pool = Pool(X_train_cat, label=y_train_cat, cat_features=cat_cols)
feature_importances = model6.get_feature_importance(train_pool)
feature_names = X_train_cat.columns

# One row per feature, most important first.
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by="Importance", ascending=False)

In [None]:
# Search space for the CatBoost grid search.
param_grid = dict(
    depth=[4, 6, 8],
    learning_rate=[0.01, 0.05, 0.1],
    iterations=[100, 200],
)

In [None]:
# Cross-validation setup for the CatBoost search: shuffled, stratified folds
# with a fixed seed.
folds = 5
# Not referenced by any visible cell.
param_comb = 1000
skf = StratifiedKFold(
    n_splits=folds,
    shuffle=True,
    random_state=1001,
)

In [None]:
# Exhaustive search over `param_grid` for model6, scored by cross-validated
# ROC AUC on the CatBoost training split.
grid = GridSearchCV(
    estimator=model6,
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=4,
    # Pass the splitter itself; GridSearchCV calls split() on the data given to
    # fit(), so the folds are the same and the object stays reusable.
    cv=skf,
    verbose=3,
)
grid.fit(X_train_cat, y_train_cat)
print('\n All results:')
print(grid.cv_results_)
print('\n Best estimator:')
print(grid.best_estimator_)
# The printed value is 2 * best_score_ - 1, not the raw ROC AUC, so the label says so.
print('\n Best score (Gini = 2 * ROC AUC - 1):')
print(grid.best_score_ * 2 - 1)
print('\n Best parameters:')
print(grid.best_params_)
# Tabular view of the per-candidate results for later inspection.
results = pd.DataFrame(grid.cv_results_)

In [None]:
# CatBoost model with fixed depth / iterations / learning rate.
# NOTE(review): presumably these are the values reported by the grid search
# above — confirm against grid.best_params_.
model6_1 = CatBoostClassifier(
    depth=4,
    iterations=200,
    learning_rate=0.1,
    verbose=0,
)

In [None]:
# Columns handed to CatBoost as categorical features.
cat_cols = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'Title', 'Is_Child']

# Fit on the training split.
model6_1.fit(X_train_cat, y_train_cat, cat_features=cat_cols)

# Hold-out class predictions and accuracy.
y_pred = model6_1.predict(X_test_cat)
predictions = list(map(round, y_pred))
accuracy = accuracy_score(y_test_cat, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Class predictions for the test frame.
y1_pred = model6_1.predict(X1_test)
predictions1 = list(map(round, y1_pred))

# Feature importance computed on the training split.
train_pool = Pool(X_train_cat, label=y_train_cat, cat_features=cat_cols)
feature_importances = model6_1.get_feature_importance(train_pool)
feature_names = X_train_cat.columns

# One row per feature, most important first.
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by="Importance", ascending=False)

In [None]:
# Print the feature-importance table built by the most recently run model cell.
print(feature_importance_df)

In [None]:
# Build the submission from model6_1 explicitly. The global `predictions1` is
# reassigned by several model cells, so relying on it would make the file
# depend on which cell happened to run last.
submission_predictions = [round(value) for value in model6_1.predict(X1_test)]
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': submission_predictions})
output.to_csv('submission_titanic_cat_v3.csv', index=False)
print("Your submission was successfully saved!")