# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBClassifier
import seaborn as sns
import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Import Data

In [None]:
# Read the Titanic training CSV into `data` and preview its first ten rows.
data = pd.read_csv("/kaggle/input/titanic/train.csv")
data.head(n=10)

# Preprocessing Data

In [None]:
# Fraction of missing values in each column of the training frame.
data.isna().mean()

In [None]:
# Remove the identifier / free-text columns from the training frame.
data = data.drop(columns=["PassengerId", "Cabin", "Name", "Ticket"])

# Fill missing ages with the median age and missing ports with the most
# frequent port.
age_median = data['Age'].median()
data['Age'] = data['Age'].fillna(age_median)
embarked_mode = data['Embarked'].mode()[0]
data['Embarked'] = data['Embarked'].fillna(embarked_mode)

# Display the fares above 400.
data.loc[data['Fare'] > 400, 'Fare']

In [None]:
# Encode Sex as 0 (female) / 1 (male), then one-hot encode the remaining
# categorical columns; drop_first=True omits the first level of each.
sex_codes = {"female": 0, "male": 1}
data["Sex"] = data["Sex"].map(sex_codes)
data = pd.get_dummies(data, drop_first=True)
data.head(n=10)

In [None]:
# Pairwise regression plots of the listed columns, coloured by Survived.
pairplot_columns = [
    "Pclass",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Sex",
    "Embarked_Q",
    "Embarked_S",
]
sns.pairplot(data, vars=pairplot_columns, kind="reg", hue="Survived")

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
correlation = data.corr()
plt.figure(figsize=(14, 12), dpi=80)
sns.heatmap(data=correlation, annot=True)

In [None]:
# Drop Fare, Embarked_Q and SibSp from the working frame, then preview it.
data = data.drop(columns=["Fare", "Embarked_Q", "SibSp"])
data.head()

# Normalization

In [None]:
# Min-max scale every column of `data` (the Survived target included) and
# wrap the result back into a DataFrame with the original column names.
mm_scale = MinMaxScaler()
scaled_values = mm_scale.fit_transform(data)
data_scaled = pd.DataFrame(scaled_values, columns=data.columns)
data_scaled.head()

In [None]:
# The target is taken from the unscaled frame; the features are the scaled
# columns with Survived removed.
yScaled = data["Survived"]
xScaled = data_scaled.drop(columns="Survived")
xScaled.head()

# Feature Selection

In [None]:
# Rank the features with recursive feature elimination (RFE) wrapped around
# a linear regression, keeping the 4 best ones.
lm = LinearRegression()
lm.fit(xScaled, yScaled)
# n_features_to_select is passed by keyword: current scikit-learn releases
# accept only the estimator positionally, so RFE(lm, 4) raises a TypeError.
rfe = RFE(lm, n_features_to_select=4)
rfe = rfe.fit(xScaled, yScaled)
# One (column, selected?, rank) tuple per feature.
list(zip(xScaled.columns, rfe.support_, rfe.ranking_))

In [None]:
# Variance inflation factor of each feature, rounded to 3 decimals and
# listed from highest to lowest.
feature_matrix = xScaled.values
vif = pd.DataFrame({
    'features': xScaled.columns,
    'VIF': [
        variance_inflation_factor(feature_matrix, column_index)
        for column_index in range(xScaled.shape[1])
    ],
})
vif['VIF'] = vif['VIF'].round(3)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Fit an ordinary-least-squares model on the features plus an intercept
# column and print the statsmodels summary table.
xc = sm.add_constant(xScaled)
ols_model = sm.OLS(yScaled, xc)
lm = ols_model.fit()
print(lm.summary())

In [None]:
# Remove Parch and the intercept column from `xc` in place, in a single call.
xc.drop(columns=["Parch", "const"], inplace=True)

In [None]:
# Bar chart of how many rows fall in each Survived class.
# The series is passed as `x=` because current seaborn releases treat the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=yScaled)

# Build Model

In [None]:
# Gradient-boosted tree classifier trained on the selected features.
xgb_params = {
    "learning_rate": 0.05,
    "max_depth": 4,
    "n_estimators": 81,
    "nthread": -1,
    "scale_pos_weight": 1,
    "random_state": 14,
}
model = XGBClassifier(**xgb_params)
model.fit(xc, yScaled)

In [None]:
# Read the Titanic test CSV into `test` and preview its first ten rows.
test = pd.read_csv("/kaggle/input/titanic/test.csv")
test.head(n=10)

In [None]:
# Keep the passenger ids for the submission file, then drop every column
# that is not fed to the model.
passengerId = test['PassengerId']
unused_columns = [
    "PassengerId",
    "Name",
    "SibSp",
    "Parch",
    "Ticket",
    "Fare",
    "Cabin",
]
test = test.drop(columns=unused_columns)
test.head()

In [None]:
# Same encoding as the training frame: Sex becomes 0 (female) / 1 (male) and
# the remaining categorical columns are one-hot encoded without their first level.
test_sex_codes = {"female": 0, "male": 1}
test['Sex'] = test['Sex'].map(test_sex_codes)
test = pd.get_dummies(test, drop_first=True)
test.head()

In [None]:
# Align the test columns with the features the model was trained on and
# fill the missing ages with the median age.
test.drop("Embarked_Q", inplace=True, axis=1)
test["Age"] = test["Age"].fillna(test["Age"].median())

# Scale the test features with the min/max of the TRAINING features.
# Re-fitting the scaler on the test set (fit_transform) would map the same
# raw value, e.g. an age, to a different scaled value than the model saw
# during training. `data` is the unscaled training frame and holds every
# column that remains in `test`.
feature_scaler = MinMaxScaler().fit(data[test.columns])
test_scaled = pd.DataFrame(feature_scaler.transform(test), columns=test.columns)
test_scaled.head()

# Prediction

In [None]:
# Predict the Survived class for every row of the scaled test set.
pred = model.predict(test_scaled)
pred

In [None]:
# Put the passenger ids and the predictions side by side in one frame.
pred_data = pd.DataFrame({'Survived': pred})
submission_parts = [passengerId, pred_data]
data_target = pd.concat(submission_parts, axis=1)
data_target

In [None]:
# Write the submission file without the index column and confirm on stdout.
submission_path = 'my_submission.csv'
data_target.to_csv(submission_path, index=False)
print("Your submission was successfully saved!")