In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
# Load the Kaggle Titanic training and test splits into DataFrames.
train = pd.read_csv("../input/titanic/train.csv")
test = pd.read_csv("../input/titanic/test.csv")

# Train Data

In [None]:
# Basic exploration: count the missing values in each column.
# Other quick summaries worth trying here:
#   train.describe()
#   train.info()
#   train.isnull().any()
train.isna().sum()

In [None]:
# Drop the columns that are not used as model features.
unused_train_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
train.drop(columns=unused_train_columns, inplace=True)

In [None]:
# Plot the null mask as a heatmap: one row per record, one column per feature.
plt.figure(figsize=(5, 5))
sns.heatmap(train.isna(), yticklabels=False, cbar=False)

In [None]:
# Drop only the rows whose 'Embarked' value is missing. Related alternatives:
#   train[train['Embarked'].isnull()].index.tolist()   # list the affected rows
#   train = train[train['Embarked'].notna()]           # same filter via a mask

train.dropna(subset=['Embarked'], inplace=True)
# Fill missing ages with the mean age. The result is assigned back to the
# column instead of calling fillna(inplace=True) on `train.Age`: that chained
# in-place call operates on an intermediate Series and, on pandas versions
# with Copy-on-Write, never updates `train`.
train['Age'] = train['Age'].fillna(train['Age'].mean())

In [None]:
# Re-check the per-column null counts after cleaning.
train.isna().sum()

# EDA

In [None]:
# One histogram per numeric column, drawn on a grid of subplots.
train.hist(figsize=(10, 10), bins=29, color="#107009AA")
# Use a figure-level title: plt.title() would only retitle the current
# subplot and overwrite the column name that DataFrame.hist put there.
plt.suptitle("Features Distribution")
plt.show()

In [None]:
# Age distribution, one panel per value of 'Survived'.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; consider
# sns.histplot(..., kde=True) once the minimum seaborn version is confirmed.
age_grid = sns.FacetGrid(train, col='Survived')
g = age_grid.map(sns.distplot, "Age")

# Feature analysis

In [None]:
# Show the distinct values of the two categorical columns before encoding them.
for column in ("Sex", "Embarked"):
    print(train[column].unique())

In [None]:
# Replace the categorical labels with integer codes.
# Note: Series.map turns any label missing from the dict into NaN.
sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
train['Sex'] = train['Sex'].map(sex_codes)
train['Embarked'] = train['Embarked'].map(embarked_codes)

In [None]:
# Outlier analysis using z-scores
from scipy import stats
import numpy as np

# Mean and standard deviation of Fare, printed for reference.
a = np.mean(train.Fare)
b = np.std(train.Fare)
print(a)
print(b)

# Absolute z-score of every cell in the frame (computed per column).
z = np.abs(stats.zscore(train))
print(z)

# Cut-off on |z| used when filtering rows.
threshold = 3

In [None]:
# Keep only the rows whose absolute z-score is below `threshold` in every
# column. The named cut-off replaces a duplicated literal 3.
train = train[(z < threshold).all(axis=1)]

In [None]:
#oulier analysis using inter quartile range IQR
from collections import Counter
def detect_outliers(df,n,features):
    """Return index labels of rows that are IQR outliers in more than ``n`` features.

    A value is an outlier for a column when it lies below ``Q1 - 1.5 * IQR``
    or above ``Q3 + 1.5 * IQR``, where Q1/Q3 are that column's 25th/75th
    percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect.
    n : int
        A row is reported only if it is an outlier in strictly more than
        ``n`` of the inspected columns.
    features : iterable of str
        Names of the numeric columns to inspect.

    Returns
    -------
    list
        Index labels of the qualifying rows, in order of first detection.
    """
    outlier_counts = Counter()
    for col in features:
        values = df[col]
        # nanpercentile ignores missing values. With np.percentile a single
        # NaN makes both quartiles NaN, so every comparison below is False
        # and no outlier in this column is ever reported.
        q1, q3 = np.nanpercentile(values, [25, 75])
        outlier_step = 1.5 * (q3 - q1)
        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)
        # Tally one hit per column for every row flagged in that column.
        outlier_counts.update(df[is_outlier].index)

    # Keep the rows flagged in more than n columns.
    return [label for label, hits in outlier_counts.items() if hits > n]
# Rows that are IQR outliers in more than 2 of the listed numeric features.
Outliers_to_drop = detect_outliers(
    train,
    2,
    ["Age", "SibSp", "Parch", "Fare"],
)

In [None]:
# Remove the flagged rows, then renumber the index from 0.
train = train.drop(index=Outliers_to_drop).reset_index(drop=True)

# Test data

In [None]:
# Count the missing values in each column of the test set.
test.isna().sum()

In [None]:
# Drop the same non-feature columns that were removed from the training set.
unused_test_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
test.drop(columns=unused_test_columns, inplace=True)

In [None]:
# Apply the same integer encoding that was used for the training set.
test_sex_codes = {'male': 0, 'female': 1}
test_embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
test['Sex'] = test['Sex'].map(test_sex_codes)
test['Embarked'] = test['Embarked'].map(test_embarked_codes)

In [None]:
# Fill the missing test values: Fare with the test-set mean fare, Age with the
# training-set mean age. The results are assigned back to the columns instead
# of calling fillna(inplace=True) on `test.Fare` / `test.Age`: that chained
# in-place call operates on an intermediate Series and, on pandas versions
# with Copy-on-Write, never updates `test`.
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())
test['Age'] = test['Age'].fillna(train['Age'].mean())

# Split and Model

In [None]:
# Select the target and the features by name rather than by position, so a
# change in column order cannot silently swap the label for a feature.
ytrain = train["Survived"]
xtrain = train.drop(columns="Survived")
# Give the test features exactly the training columns, in the same order.
# This raises a KeyError if the test set lacks any training feature.
xtest = test[xtrain.columns]

In [None]:
from xgboost import XGBClassifier

# Fit a gradient-boosted tree classifier on the training features, then
# predict labels for the test features.
xgb = XGBClassifier(
    booster='gbtree',
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    gamma=5,
    colsample_bytree=1,
)
xgb.fit(xtrain, ytrain)
xgbpred = xgb.predict(xtest)

In [None]:
# Build the submission by overwriting the 'Survived' column of the sample
# submission file with the model's predictions.
# NOTE(review): assumes the rows of gender_submission.csv are in the same
# order as test.csv; confirm.
prediction = pd.DataFrame(xgbpred)
submission = pd.read_csv('../input/titanic/gender_submission.csv')
# Fail loudly on a row-count mismatch instead of writing a misaligned file.
if len(xgbpred) != len(submission):
    raise ValueError(
        "prediction count (%d) does not match submission rows (%d)"
        % (len(xgbpred), len(submission))
    )
# Assign the prediction array itself (positional). Assigning the `prediction`
# DataFrame would align on index and fill unmatched rows with NaN.
submission['Survived'] = xgbpred
submission.to_csv('xgbpred21%.csv', index=False)