In [None]:
import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the Titanic training and test splits.
train = pd.read_csv("../input/titanic/train.csv")
test = pd.read_csv("../input/titanic/test.csv")

# Keep an independent snapshot of the raw test frame; its PassengerId column is
# read later when the submission is built. A plain `test2 = test` would only be
# an alias, so every in-place edit made to `test` would show up in `test2` too.
test2 = test.copy()

# Target column, stored separately because "Survived" is dropped from `train`
# during cleaning.
train_res = train["Survived"].reset_index(drop = True)

In [None]:
# Show the raw training frame as this cell's output.
train

In [None]:
#Clean Training Data
import re

#Fill Na values
# Assign the filled column back to the frame. Calling fillna(inplace=True) on a
# column selection is deprecated and leaves the frame untouched once pandas
# Copy-on-Write is active.
train['Age'] = train['Age'].fillna(train['Age'].median())
train['Embarked'] = train['Embarked'].fillna("S")

# Family size on board and a flag for passengers travelling with no relatives.
# (Finer family-size buckets were sketched here earlier but are not part of the
# feature set.)
train['T_partner'] = train["SibSp"] + train["Parch"]
train['Alone'] = np.where(train['T_partner'] > 0, 0, 1)

#Extract title from name and map
train['Words_Count'] = train['Name'].apply(lambda x: len(x.split()))
# Raw string so that `\.` reaches the regex engine as an escaped dot instead of
# being parsed as an (invalid) Python string escape.
train['Title'] = train.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
train['Title'] = train['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
train['Title'] = train['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
# Titles missing from the mapping become 0.
train['Title'] = train['Title'].map(title_mapping).fillna(0)

#extract and map cabin number
# Missing cabins become 'U'; the leading run of letters of each cabin string is
# then mapped to a numeric group. The pattern is compiled once rather than once
# per row.
cabin_letters = re.compile("([a-zA-Z]+)")
cabin_category = {'A':9, 'B':8, 'C':7, 'D':6, 'E':5, 'F':4, 'G':3, 'T':2, 'U':1}
train['Cabin'] = (
    train['Cabin']
    .fillna('U')
    .map(lambda cabin: cabin_letters.search(cabin).group())
    .map(cabin_category)
)

#convert nonnumerical data to numerical
g = pd.get_dummies(train['Sex'], drop_first = True)
e = pd.get_dummies(train['Embarked'], drop_first = True)
p = pd.get_dummies(train['Pclass'], drop_first = True)
# Pclass levels are integers, so their dummy columns would be labelled with ints
# while every other column label is a string. scikit-learn rejects frames with
# mixed label types, so the labels are converted to strings.
p.columns = p.columns.astype(str)
train = pd.concat([g, train, e, p], axis = 1)

#create minor column for age as minors have greatly increased chance of survival
train['is_minor'] = np.where(train['Age'] <= 16, 1, 0)

#clean tickets
# Integer code of the first three characters of the ticket string.
# NOTE(review): the codes come from the prefixes present in this frame only; the
# test frame is encoded separately, so the same integer need not denote the same
# prefix in both frames.
train['Ticket_type'] = train['Ticket'].str[:3].astype('category').cat.codes

#drop extra columns
# Raw/helper columns that have been replaced by engineered features, plus the
# target, which is kept separately in `train_res`.
train = train.drop(
    columns=[
        'SibSp', 'Parch', 'T_partner',
        "PassengerId", "Name", "Ticket", 'Pclass', 'Sex', 'Embarked', "Survived",
    ]
)

In [None]:
#clean test data
import re

# Hard-coded fill values.
# NOTE(review): 28 and 14.45 are presumably the training-set medians for Age and
# Fare — confirm and consider deriving them from the training data.
# The filled columns are assigned back to the frame: fillna(inplace=True) on a
# column selection is deprecated and leaves the frame untouched once pandas
# Copy-on-Write is active.
test["Age"] = test["Age"].fillna(28)
test["Embarked"] = test["Embarked"].fillna("S")
test["Fare"] = test["Fare"].fillna(14.45)

# Family size on board and a flag for passengers travelling with no relatives.
test['T_partner'] = test["SibSp"] + test["Parch"]
test['Alone'] = np.where(test['T_partner'] > 0, 0, 1)

# Name-derived features: word count and a numeric title code.
test['Words_Count'] = test['Name'].apply(lambda x: len(x.split()))
# Raw string so that `\.` reaches the regex engine as an escaped dot instead of
# being parsed as an (invalid) Python string escape.
test['Title'] = test.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
test['Title'] = test['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
test['Title'] = test['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
# Titles missing from the mapping become 0.
test['Title'] = test['Title'].map(title_mapping).fillna(0)

# Missing cabins become 'U'; the leading run of letters of each cabin string is
# then mapped to a numeric group. The pattern is compiled once rather than once
# per row.
cabin_letters = re.compile("([a-zA-Z]+)")
cabin_category = {'A':9, 'B':8, 'C':7, 'D':6, 'E':5, 'F':4, 'G':3, 'T':2, 'U':1}
test['Cabin'] = (
    test['Cabin']
    .fillna('U')
    .map(lambda cabin: cabin_letters.search(cabin).group())
    .map(cabin_category)
)

# One-hot encode the categorical columns (first level dropped).
g = pd.get_dummies(test['Sex'], drop_first = True)
e = pd.get_dummies(test['Embarked'], drop_first = True)
p = pd.get_dummies(test['Pclass'], drop_first = True)
# Pclass levels are integers, so their dummy columns would be labelled with ints
# while every other column label is a string; convert them so the frame has
# uniformly string labels.
p.columns = p.columns.astype(str)
test = pd.concat([g, test, e, p], axis = 1)

test['is_minor'] = np.where(test['Age'] <= 16, 1, 0)

# Integer code of the first three characters of the ticket string.
# NOTE(review): the codes come from the prefixes present in this frame only; the
# training frame is encoded separately, so the same integer need not denote the
# same prefix in both frames.
test['Ticket_type'] = test['Ticket'].str[:3].astype('category').cat.codes

# Drop raw/helper columns that have been replaced by engineered features.
test = test.drop(
    columns=[
        'SibSp', 'Parch', 'T_partner',
        "PassengerId", "Name", "Ticket", 'Pclass', 'Sex', 'Embarked',
    ]
)


In [None]:
from sklearn.model_selection import train_test_split

# Split the cleaned features and the target into training and validation parts.
# No test_size is passed, so the library default fraction applies; the fixed
# random_state makes the shuffle repeatable.
split_parts = train_test_split(train, train_res, random_state=0)
train_x, val_x, train_y, val_y = split_parts

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 200-tree random forest on the training split. The seed is fixed so the
# reported score is the same on every run.
random_forest = RandomForestClassifier(n_estimators=200, random_state=0)
random_forest.fit(train_x, train_y)

rand_pred = random_forest.predict(val_x)
# The original printed 1 - MAE under the label "Mean Absolute Error". For 0/1
# labels that quantity is the share of correct predictions, so it is computed
# and labelled as accuracy here.
print("Validation accuracy: " + str(metrics.accuracy_score(val_y, rand_pred)))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a single decision tree as a baseline. The seed is fixed so the reported
# score is the same on every run.
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(train_x, train_y)
pred6 = decision_tree.predict(val_x)
# The original printed 1 - MAE under the label "Mean Absolute Error". For 0/1
# labels that quantity is the share of correct predictions, so it is computed
# and labelled as accuracy here.
print("Validation accuracy: " + str(metrics.accuracy_score(val_y, pred6)))

In [None]:
#pred = random_forest.predict(test)
#output = pd.DataFrame({"PassengerId" : test2.PassengerId, "Survived" : pred})
#output.to_csv("submission.csv", index = False)
#output

In [None]:
#ensemble
from lightgbm import LGBMClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
import catboost as catboost

# Train three gradient-boosting classifiers on the same training split and keep
# their class probabilities on the validation split for blending.

# eval_metric is set on the estimator: passing it to fit() is deprecated in
# recent XGBoost releases and no longer accepted by XGBoost 2.x.
xgb_clf = XGBClassifier(eval_metric=["auc", "logloss"])
xgb_clf.fit(train_x, train_y, verbose=True)
xgb_val_prob = xgb_clf.predict_proba(val_x)

lgb_clf = LGBMClassifier()
lgb_clf.fit(train_x, train_y)
lgb_val_prob = lgb_clf.predict_proba(val_x)

# verbose=0 silences CatBoost's per-iteration training log.
cat_clf = CatBoostClassifier(verbose=0)
cat_clf.fit(train_x, train_y)
cat_val_prob = cat_clf.predict_proba(val_x)

In [None]:
# Search blend weights for the three boosters in steps of 0.1 and report the
# validation accuracy of every combination whose weights sum to 1.

# Decision threshold applied to the blended positive-class probability.
# NOTE(review): the original comment said 0.5 while the code used 0.4, and the
# submission cell thresholds at 0.5 — confirm which value is intended.
GRID_THRESHOLD = 0.4

best_acc, best_weights = -1.0, None
for x in range(11):              # CatBoost weight, in tenths
    for y in range(11 - x):      # LightGBM weight, in tenths; capped so z stays >= 0
        z = 10 - x - y           # XGBoost weight takes the remainder, so x + y + z == 10
        ens_prob = 0.1*x*cat_val_prob + 0.1*y*lgb_val_prob + 0.1*z*xgb_val_prob

        # Column 1 holds the positive-class probability.
        ens_prob1 = (ens_prob[:, 1] >= GRID_THRESHOLD).astype(int)

        # The original printed 1 - MAE under the label "Mean Absolute Error";
        # for 0/1 labels that quantity is accuracy, so it is reported as such.
        acc = metrics.accuracy_score(val_y, ens_prob1)
        print("x:"+str(x)+" y:"+str(y)+" z:"+str(z)+"   Validation accuracy: " + str(acc))

        if acc > best_acc:
            best_acc, best_weights = acc, (x, y, z)

# Summarise the grid so the best combination does not have to be picked out of
# the listing by eye (ties keep the first combination encountered).
print("Best (x, y, z): " + str(best_weights) + "   Validation accuracy: " + str(best_acc))

In [None]:
def _second_proba_column(model, frame):
    """Return column 1 of ``model.predict_proba(frame)`` as a pandas Series."""
    return pd.DataFrame(model.predict_proba(frame))[1]


# Score the cleaned test frame with each fitted booster, keeping only the
# second probability column of each.
lgb_prob = _second_proba_column(lgb_clf, test)
xgb_prob = _second_proba_column(xgb_clf, test)
cat_prob = _second_proba_column(cat_clf, test)

In [None]:
def make_submission_file(filename, probab, test_id, IdCol, targetCol, threshold=None):
    """Build a two-column submission frame, write it to CSV and return it.

    Parameters
    ----------
    filename : path the CSV is written to (without the index column).
    probab : iterable of scores, one per row.
    test_id : row identifiers stored under ``IdCol``.
    IdCol : name of the identifier column.
    targetCol : name of the prediction column.
    threshold : optional cut-off. When given, scores ``>= threshold`` are
        written as 1 and all others as 0; when ``None`` the raw scores are
        written unchanged.

    Returns
    -------
    pandas.DataFrame with columns ``IdCol`` and ``targetCol``.
    """
    submit = pd.DataFrame()
    submit[IdCol] = test_id
    # `is None` is an identity check, so a threshold of 0 (or an array-like
    # value) is never mistaken for "no threshold".
    if threshold is None:
        submit[targetCol] = probab
    else:
        submit[targetCol] = [1 if score >= threshold else 0 for score in probab]
    submit.to_csv(filename, index=False)
    return submit

In [None]:
# Weighted blend of the three test-set probability series.
cat_part = 0.3 * cat_prob
lgb_part = 0.5 * lgb_prob
xgb_part = 0.2 * xgb_prob
ens_prob = cat_part + lgb_part + xgb_part

# Threshold the blend at 0.5, write submission.csv and show the resulting frame.
ens_sub = make_submission_file(
    filename="submission.csv",
    probab=ens_prob,
    test_id=test2.PassengerId,
    IdCol='PassengerId',
    targetCol='Survived',
    threshold=0.5,
)
ens_sub