# **Titanic - Machine Learning from Disaster**

## **Importing Necessary Modules**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the competition training split from the Kaggle input directory.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
# Preview the first rows to check that the file parsed as expected.
train_data.head()

In [None]:
# Load the competition test split (the rows predictions are later generated for).
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
# Preview the first rows to check that the file parsed as expected.
test_data.head()

In [None]:
# Survival outcomes of every female passenger in the training data,
# selected with a single row/column .loc lookup.
women = train_data.loc[train_data['Sex'] == 'female', 'Survived']
# Sum of the outcomes divided by the number of rows; the printed value is a
# fraction, not a percentage (assumes Survived is coded 0/1 -- confirm).
rate_women = sum(women) / len(women)

print("% of women who survived:", rate_women)

In [None]:
# Survival outcomes of every male passenger in the training data,
# selected with a single row/column .loc lookup.
men = train_data.loc[train_data['Sex'] == 'male', 'Survived']
# Sum of the outcomes divided by the number of rows; the printed value is a
# fraction, not a percentage (assumes Survived is coded 0/1 -- confirm).
rate_men = sum(men) / len(men)

print("% of men who survived:", rate_men)

In [None]:
# Keep references to both frames so later cells can apply the same
# preprocessing to each in a loop. The list stores the DataFrame objects
# themselves (not copies), so the frames must only be modified in place;
# rebinding `train_data` or `test_data` to a new object would leave this
# list pointing at the old one.
dataset=[train_data,test_data]
# Print column dtypes and non-null counts for the training data.
train_data.info()

In [None]:
# Summary statistics of the training data columns.
train_data.describe()

## **Data Visualization And Analysis**

In [None]:
# Visualise where values are missing in the training data: the heatmap is
# drawn from the boolean null mask, one column per DataFrame column.
fig, ax = plt.subplots(figsize=(10, 5))

sns.heatmap(train_data.isnull(), yticklabels=False, cmap='viridis', ax=ax)
# Set the title on the heatmap's own axes (and fix the "Datset" typo).
ax.set_title('Training Dataset')
plt.show()

In the training dataset, the Age, Cabin and Embarked columns contain null values.

In [None]:
# Count each Survived value once and reuse the result for both the bar
# positions and the bar heights.
survival_counts = train_data['Survived'].value_counts()

plt.figure(figsize=(5, 5))
plt.bar(
    list(survival_counts.keys()),
    list(survival_counts),
    color=["r", "b"],
)
plt.title('Survival Ratio')
plt.show()

In [None]:
# Exact number of rows per Survived value (the numbers behind the bar chart).
train_data['Survived'].value_counts()

From the graph and the counts above, we can see that 342 passengers survived and 549 did not.

In [None]:
# Passenger counts per Sex, split into one bar per Survived value.
sns.countplot(data=train_data,x='Sex',hue='Survived')

In [None]:
# Passenger counts per ticket class.
# Pass the frame and column by keyword: in seaborn >= 0.12 the first positional
# argument of countplot is `data`, so a bare Series is no longer read as `x`.
sns.countplot(data=train_data, x='Pclass')

In [None]:
# Histogram of passenger ages, drawn through an explicit Axes object.
fig, ax = plt.subplots(figsize=(5, 5))
ax.hist(train_data['Age'])
ax.set_title('Distribution of Age')
ax.set_xlabel("Age")
plt.show()

In [None]:
# Age distribution per Sex; columns are referenced by name against the frame.
sns.boxplot(data=train_data, x='Sex', y='Age')

In [None]:
# Passenger counts per Embarked value, split into one bar per Survived value.
sns.countplot(data=train_data,x='Embarked',hue='Survived')

In [None]:
# Pairwise correlation of the numeric columns, annotated with the coefficients.
# numeric_only=True is required here: the frame still contains text columns at
# this point (Name, Sex, Ticket, Cabin, Embarked), and pandas >= 2.0 raises on
# them instead of silently skipping them.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(train_data.corr(numeric_only=True), annot=True, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

## **Data Cleaning**

In [None]:
# Names of the columns that contain at least one missing value, per dataset.
train_null_cols = [col for col in train_data.columns if train_data[col].isna().any()]
test_null_cols = [col for col in test_data.columns if test_data[col].isna().any()]
print('train_null_cols : ', train_null_cols)
print('test_null_cols : ', test_null_cols)

In [None]:
# Remove the Name, Ticket and Cabin columns from the training data.
# The drop is done in place on purpose: `dataset` holds a reference to this
# same DataFrame object, so the frame must not be rebound to a new one.
train_data.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

In [None]:
# Confirm the remaining columns and their non-null counts after the drop.
train_data.info()

In [None]:
print(train_data['Embarked'].value_counts())

# Integer codes used for the two categorical features.
SEX_CODES = {'female': 0, 'male': 1}
EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}

for data in dataset:
    # Encode only while the column is still text. Mapping an already-encoded
    # column would look up 0/1/2 in the dictionaries above, find nothing and
    # turn every value into NaN, so this guard keeps the cell safe to re-run.
    if not pd.api.types.is_numeric_dtype(data['Sex']):
        data['Sex'] = data['Sex'].map(SEX_CODES)
    if not pd.api.types.is_numeric_dtype(data['Embarked']):
        data['Embarked'] = data['Embarked'].map(EMBARKED_CODES)

In [None]:
# Preview the training data after the Sex / Embarked encoding.
train_data.head()

In [None]:
# Remove the same three columns from the test data that were removed from the
# training data. Done in place because `dataset` references this same object.
test_data.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

## **Handling Missing Values**

In [None]:
# Non-null counts per column of the training data, before imputation.
train_data.info()

In [None]:
# Non-null counts per column of the test data, before imputation.
test_data.info()

In [None]:
# List the columns that still contain missing values in each dataset.
train_null_cols = [name for name in train_data.columns if train_data[name].isna().any()]
test_null_cols = [name for name in test_data.columns if test_data[name].isna().any()]
print('train_null_cols : ', train_null_cols)
print('test_null_cols : ', test_null_cols)

In [None]:
# Impute missing ages with the mean age of the same dataset.
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on the column selection: that chained in-place form
# acts on an intermediate object, emits a FutureWarning in pandas 2.x and
# leaves the frame unchanged once Copy-on-Write is enabled.
for data in dataset:
    data['Age'] = data['Age'].fillna(data['Age'].mean())

In [None]:
# Impute the Embarked column of the training dataset with its most frequent value.
# Assign the result back instead of using fillna(inplace=True) on the column
# selection, which does not update the frame under pandas Copy-on-Write.
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

In [None]:
# Impute the Fare column of the test dataset with its most frequent value.
# Assign the result back instead of using fillna(inplace=True) on the column
# selection, which does not update the frame under pandas Copy-on-Write.
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mode()[0])

In [None]:
# Check the non-null counts of the training data after imputation.
train_data.info()

In [None]:
# Check the non-null counts of the test data after imputation.
test_data.info()

In [None]:
# Target: the Survived column. Features: every other column of the training data.
y = train_data['Survived']
x = train_data.drop(columns=['Survived'])

## **Split Data In Train and Test**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

## **Applying XGBoost Classifier Model**

In [None]:
from xgboost import XGBClassifier

# PassengerId is a row identifier rather than a property of the passenger, so
# it is excluded from the model inputs. Selecting the columns by name also
# guarantees the test features are in the same order as the training features.
feature_cols = [col for col in x.columns if col != 'PassengerId']

xgb = XGBClassifier(
    objective='binary:logistic',
    booster='gbtree',
    eval_metric='logloss',
    gamma=5,
    learning_rate=0.1,
    max_depth=5,
    n_estimators=100,
    colsample_bytree=1,
)

# Use the hold-out split created earlier to estimate accuracy on unseen rows
# before building the submission (previously the split was never used).
xgb.fit(X_train[feature_cols], y_train)
print('Hold-out accuracy:', xgb.score(X_test[feature_cols], y_test))

# Refit on every labelled row for the final predictions.
xgb.fit(x[feature_cols], y)
predictions_xgb = xgb.predict(test_data[feature_cols])

# Submission file: one row per test passenger with its predicted Survived value.
Survived = pd.Series(predictions_xgb, name='Survived')
ans = pd.concat([test_data['PassengerId'], Survived], axis=1)
ans.to_csv('XGB_Ans.csv', index=False)