# Import Libraries

In [2]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

## Load Data

In [3]:
# Read the Titanic training and test splits from the Kaggle input directory.
df_train = pd.read_csv("/kaggle/input/titanic/train.csv")
df_test = pd.read_csv("/kaggle/input/titanic/test.csv")

# Keep a separate reference to the label column; at this point `df_train`
# still contains "Survived" as well.
target = df_train["Survived"]     # Label to predict

# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Remark: `PassengerId` is a unique row identifier, so it carries no predictive signal and can be removed from the features.

## Explore data

In [4]:
# Number of passengers per distinct Cabin value, most frequent first.
df_train['Cabin'].value_counts()

Cabin
B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: count, Length: 147, dtype: int64

In [11]:
# Mean of Survived for each Cabin value, sorted from highest to lowest.
df_train[["Cabin", "Survived"]].groupby(['Cabin']).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
C62 C64,1.0
D21,1.0
D17,1.0
D19,1.0
C148,1.0
...,...
C46,0.0
C30,0.0
C128,0.0
C124,0.0


In [None]:
# Percentage of missing values in each training column.
print(df_train.isnull().sum() / len(df_train) * 100)

# Drop columns that are not used as model features. TODO: revisit this step.
# errors="ignore" keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns have already been removed.
df_train = df_train.drop(columns=["Ticket", "Cabin", "PassengerId", "Survived"], errors="ignore")
df_test = df_test.drop(columns=["Ticket", "Cabin", "PassengerId"], errors="ignore")

## Reasoning
# 1. PassengerId is a unique row identifier, so it carries no signal for the model.
#    Name is unique as well, but it is deliberately NOT dropped in this cell.
# 2. Survived is the label and must not stay among the training features;
#    it was copied into `target` when the data was loaded.
# 3. Ticket and Cabin are dropped for now (Cabin alone has 147 distinct values);
#    this choice is what the TODO above refers to.

In [None]:
# Module-level LabelEncoder instance. Every `fit_transform` call re-fits it,
# so it only ever holds the classes of the most recently encoded column.
le = LabelEncoder()

def df_clean(dataframe):
    """Impute missing values and encode the categorical columns as integers.

    The frame is modified in place and also returned, so callers may either
    rebind the result or rely on the mutation.

    Missing values are filled *before* encoding: ``Age`` and ``Fare`` with the
    column mean, ``Embarked`` with its most frequent value. Encoding first
    would turn a missing ``Embarked`` into a class of its own (or fail,
    depending on the encoder version), and a mean of category codes is not a
    meaningful fill value.

    ``Sex`` and ``Embarked`` are encoded with fixed code tables that use the
    alphabetical ordering a label encoder would produce when every category
    is present. Fixed tables guarantee that the training and test frames
    share the same codes even if one of them lacks a category.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``Sex``, ``Embarked``, ``Age`` and ``Fare``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, cleaned.

    Raises
    ------
    ValueError
        If ``Sex`` or ``Embarked`` contains a value without a known code.
    """
    category_codes = {
        "Sex": {"female": 0, "male": 1},
        "Embarked": {"C": 0, "Q": 1, "S": 2},
    }

    # Fill null/missing numeric values with the column mean.
    dataframe["Age"] = dataframe["Age"].fillna(dataframe["Age"].mean())
    dataframe["Fare"] = dataframe["Fare"].fillna(dataframe["Fare"].mean())

    # Categorical column: fill with the most frequent port (skipped when the
    # column is entirely missing and therefore has no mode).
    embarked_mode = dataframe["Embarked"].mode()
    if not embarked_mode.empty:
        dataframe["Embarked"] = dataframe["Embarked"].fillna(embarked_mode.iloc[0])

    # Convert categorical to numerical.
    for column, codes in category_codes.items():
        # Already numeric means the cell was re-run on an encoded frame.
        if pd.api.types.is_numeric_dtype(dataframe[column]):
            continue
        encoded = dataframe[column].map(codes)
        if encoded.isna().any():
            unknown = sorted(set(dataframe.loc[encoded.isna(), column].astype(str)))
            raise ValueError(f"Unexpected values in column {column!r}: {unknown}")
        dataframe[column] = encoded.astype("int64")

    return dataframe

# Clean both splits; each frame is processed independently of the other.
df_train = df_clean(df_train)
df_test = df_clean(df_test)

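# Observation:
- `Age` and `Fare` are on a much larger scale than the other features. Scaling them would be required for distance- or gradient-based models; the tree-based models trained below are insensitive to feature scale.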

# Feature Engineering

In [None]:
# Combining 'parch' and 'sibsp' to create new feature named as 'family_size'

def feat_engg(dataframe, min_title_count=10):
    """Derive family and title features, replacing the raw source columns.

    The frame is modified in place and also returned.

    Features created
    ----------------
    family_size : ``Parch + SibSp`` (relatives aboard, passenger excluded).
    is_alone    : 1 when ``family_size`` is 0, else 0.
    title       : integer code of the honorific parsed from ``Name``
                  (the text between the first comma and the following period).
                  ``Miss`` is merged into ``Mrs``; any title occurring fewer
                  than ``min_title_count`` times in this frame, or that cannot
                  be parsed, becomes ``rare``. Codes follow the sorted order
                  of the remaining labels.

    ``Parch``, ``SibSp`` and ``Name`` are dropped.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``Parch``, ``SibSp`` and ``Name``.
    min_title_count : int, default 10
        Minimum number of occurrences for a title to keep its own category.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new features.

    Notes
    -----
    Title codes are derived from the labels present in the given frame, so two
    frames only share codes when they end up with the same set of labels.
    """
    # Combine 'Parch' and 'SibSp' into a single relatives-aboard count.
    dataframe['family_size'] = dataframe['Parch'] + dataframe['SibSp']
    dataframe.drop(columns=['Parch', 'SibSp'], inplace=True)

    # family_size does not count the passenger, so "alone" means zero
    # relatives aboard (a `> 1` test would mark a passenger travelling with
    # exactly one relative as alone).
    dataframe['is_alone'] = (dataframe['family_size'] == 0).astype(int)

    # Title = text between the first comma and the next period of the name.
    titles = dataframe['Name'].str.extract(r",\s*([^.]+)\.", expand=False).str.strip()

    # Remove the 'Name' feature now that the title has been extracted.
    dataframe.drop(columns=['Name'], inplace=True)

    # Merge 'Miss' into 'Mrs' BEFORE counting, so rarity is judged on the
    # merged category rather than on stale pre-merge counts.
    titles = titles.replace({'Miss': 'Mrs'})

    # Mark the title as 'rare' when it occurs fewer than `min_title_count`
    # times; unparseable names (NaN title) are treated as rare as well.
    occurrences = titles.map(titles.value_counts())
    is_rare = occurrences.isna() | (occurrences < min_title_count)
    titles = titles.mask(is_rare, 'rare')

    # Integer codes in sorted-label order.
    dataframe['title'] = pd.factorize(titles, sort=True)[0]

    return dataframe

In [None]:
# Run the feature-engineering step on both splits; each frame is processed independently.
df_train = feat_engg(df_train)
df_test = feat_engg(df_test)

In [None]:
from sklearn.preprocessing import StandardScaler

# Create an instance of the scaler
# scaler = StandardScaler()

# # Fit on training data
# scaler.fit(df_train)

# # Transform both training and test data
# X_train_scaled = scaler.transform(df_train)
# X_test_scaled = scaler.transform(df_test)


# df_train = pd.DataFrame(X_train_scaled, columns=df_train.columns)
# df_train.head()

In [None]:
from sklearn import tree

# Fixed seed: the grid includes splitter="random", so without it every run of
# the search could select different hyper-parameters.
SEED = 117

clf = tree.DecisionTreeClassifier(random_state=SEED)

# Define the parameter grid to tune the hyper-parameters.
# Plain Python ranges are used so the cell does not depend on numpy, which
# this notebook never imports.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 6, 8, 10, 12, 20, 30, None],
    'min_samples_split': list(range(2, 10)),
    'min_samples_leaf': list(range(1, 10)),
    'splitter': ["best", "random"],
}

# Exhaustive 5-fold cross-validated search scored on accuracy.
# n_jobs=-1 only parallelises the fits; the selected model is unchanged.
clf_dt = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
clf_dt.fit(df_train, target)

# Best estimator found by the search (refit on the full training data by
# GridSearchCV's default refit=True).
best_dtree_reg = clf_dt.best_estimator_

y_pred = best_dtree_reg.predict(df_test)

# best_params_ holds exactly the grid keys of the winning combination.
best_params = clf_dt.best_params_
print('Best Criterion:', best_params['criterion'])
print('Max depth:', best_params['max_depth'])
print('Min Samples split:', best_params['min_samples_split'])
print('Min Samples Leaf:', best_params['min_samples_leaf'])
print('Best Splitter:', best_params['splitter'])

print("\nBest score:", clf_dt.best_score_)

In [None]:
# Rebuild a decision tree from the winning grid-search combination and train
# it on the full training set. `best_params_` carries exactly the five tuned
# settings (criterion, max_depth, min_samples_split, min_samples_leaf, splitter).
tuned_settings = dict(clf_dt.best_params_)
clf = tree.DecisionTreeClassifier(**tuned_settings)
clf.fit(df_train, target)

# Predict the test split and show the raw label array.
pred = clf.predict(df_test)
print(pred)

In [None]:
# Experiment to improve accuracy
# Method: random forest (an ensemble of 100 trees) instead of a single decision tree.

# Fixed seed so the forest, and therefore the predictions, are reproducible.
clf_rf = RandomForestClassifier(n_estimators=100, random_state=117)
clf_rf.fit(df_train, target)

# NOTE: this rebinds `pred`, replacing the decision-tree predictions.
pred = clf_rf.predict(df_test)
# print("score:", accuracy_score(y_test, pred))


#clf_rf.fit(feats, target)
#pred = clf_rf.predict(df_test)

In [None]:
# print("feats:", feats.columns)
# print("test feats:",df_test.columns)

# #df_test.isnull().sum()

# pred = clf.predict(x_test)
# score = accuracy_score(y_test, pred)
# #pred = clf.predict(df_test)
# #print("predictions", pred)
# print("score:", score)

In [None]:
# Kaggle's example submission file; in the visible cells it is only previewed.
df_subm = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")
# Re-read the raw test split: PassengerId was dropped from `df_test` earlier
# but is needed to label the predictions.
y = pd.read_csv("/kaggle/input/titanic/test.csv")
df_subm.head()

In [None]:
# Pair every test PassengerId with its predicted label.
submission_columns = {
    "PassengerId": y["PassengerId"],
    "Survived": pred,
}
submission = pd.DataFrame(submission_columns)

# Persist without the index column, then preview the result.
submission.to_csv("submission.csv", index=False)
submission.head()