In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from seaborn import FacetGrid

#import data from kaggle library
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")


data = pd.concat([train_data.drop('Survived', axis=1), test_data], ignore_index=True)
print(data.isnull().sum())
data.head()




PassengerId       0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
#plots
g = sns.FacetGrid(train_data, col="Sex", row="Survived", margin_titles=True)

# Map a histogram plot
g.map(sns.histplot, "Age", kde=False)

plt.show()

sns.barplot(x='Sex', y ='Survived', data=train_data)




In [2]:
#Family Size calculation
data['Family Size'] = data['SibSp'] + data['Parch']
data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.')

#data['TicketPrefix'] = data['Ticket'].str.extract(r'(^[A-Za-z0-9/.]+)')
data['TicketPrefix'] = data['Ticket'].str.split(' ').str[0]
data['TicketPrefix'] = data['TicketPrefix'].apply(lambda x: x if x.isalpha() else 'None')

data['Age'] = data.groupby('Title')['Age'].apply(lambda x: x.fillna(x.mean())).reset_index(level=0, drop=True)
data['Fare'].fillna(data['Fare'].median())
data['Embarked'].fillna(data['Embarked'].mode())







Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family Size,Title,TicketPrefix
0,1,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,,S,1,Mr,
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C,1,Mrs,PC
2,3,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,,S,0,Miss,
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S,1,Mrs,
4,5,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,,S,0,Mr,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,3,"Spector, Mr. Woolf",male,32.252151,0,0,A.5. 3236,8.0500,,S,0,Mr,
1305,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.000000,0,0,PC 17758,108.9000,C105,C,0,Dona,PC
1306,1307,3,"Saether, Mr. Simon Sivertsen",male,38.500000,0,0,SOTON/O.Q. 3101262,7.2500,,S,0,Mr,
1307,1308,3,"Ware, Mr. Frederick",male,32.252151,0,0,359309,8.0500,,S,0,Mr,


In [3]:
#Ddropping all unnecessary columns
data.drop(['Name','Cabin','Ticket'], axis = 1)

#splitting the data 
X_train = data.iloc[:len(train_data)]
X_test = data.iloc[len(train_data):]
y_train = train_data['Survived']

#creating categorcial column list
categorical_cols = ['Embarked','Sex','Title','TicketPrefix']
print(X_train.shape, X_test.shape)



(891, 14) (418, 14)


In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

categorical_transformer = Pipeline(steps=[('onehot',OneHotEncoder(handle_unknown= 'ignore',sparse = False))])

preprocessor = ColumnTransformer(transformers = [('cat',categorical_transformer,categorical_cols)] )

#creating a Random Forest model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators= 35, max_depth= 9, min_samples_split=5,random_state= 0)
#Uncomment the following lines to try XGBoost model
#from xgboost import XGBClassifier
#model_xgb = XGBClassifier(n_estimators = 1000, learning_rate = 0.05)

my_pipeline = Pipeline(steps = [('preprocessor', preprocessor),('model',model)])

my_pipeline.fit(X_train, y_train)






In [13]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(my_pipeline, X_train,y_train, cv = 5, scoring = 'accuracy')
print("Cross Validation accuracy:", scores.mean())



Cross Validation accuracy: 0.7979725064339966




In [14]:
from sklearn.metrics import accuracy_score
predictions = my_pipeline.predict(X_train)
print("Training accuracy:", accuracy_score(y_train, predictions))


Training accuracy: 0.8080808080808081


In [9]:
#using GridSearch to get the best possible values to redefine the Random Forest Model
from sklearn.model_selection import GridSearchCV

param_grid = {
    'model__n_estimators': [35,40,45,50],
    'model__max_depth': [7,8,9,10],
    'model__min_samples_split': [2,3,4,5]
}

grid_search = GridSearchCV(my_pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)




{'model__max_depth': 9, 'model__min_samples_split': 5, 'model__n_estimators': 35}




In [15]:
#Final Predictions uding Test Data
predictions = my_pipeline.predict(X_test)


In [16]:
#Getting the output including just the Passenger Id and Survival state
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': predictions})
output.to_csv('Titanic_submission_5.csv', index = False)


