In [None]:
# Importing 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
%matplotlib inline

In [None]:
# Load the competition CSVs: `train` carries the Survived label, `test` is the
# unlabeled frame that predictions are produced for later in the notebook.
train = pd.read_csv("../input/titanic-machine-learning-from-disaster/train.csv")
test = pd.read_csv("../input/titanic-machine-learning-from-disaster/test.csv")
test.head()

In [None]:
# Bar chart of passenger counts for each Sex, split by the Survived label.
survival_by_sex = pd.crosstab(index=train["Sex"], columns=train["Survived"])
survival_by_sex.plot(kind="bar")

In [None]:
# Count the missing entries in every column of the training frame.
train.isnull().sum()

In [None]:
# Summarize the training frame (column dtypes and non-null counts).
train.info()

In [None]:
# Pairwise correlation heatmap of the numeric columns.
# `train` still holds string columns (Name, Sex, Ticket, ...) at this point, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so the frame is
# restricted to numeric dtypes before correlating.
numeric_train = train.select_dtypes(include="number")
sns.heatmap(numeric_train.corr(), annot=True)

In [None]:
# Print the name of every column that pandas classifies as a string dtype.
for column_name in train.columns:
    if pd.api.types.is_string_dtype(train[column_name]):
        print(column_name)

In [None]:
# List the columns that are NOT numeric (note the `not` in the condition below);
# these are the columns the later encoding cell converts to integer codes.
for label,content in train.items():
    if not pd.api.types.is_numeric_dtype(content):
        print(label)

In [None]:
# Convert each string-typed column of `train` into an ordered pandas categorical.
text_columns = [
    column_name
    for column_name in train.columns
    if pd.api.types.is_string_dtype(train[column_name])
]
for column_name in text_columns:
    train[column_name] = train[column_name].astype("category").cat.as_ordered()

In [None]:
# Step 1: integer-encode every non-numeric column of `train`.
# A boolean "<column>is_missing" flag is stored first (this suffix has no
# underscore, unlike the numeric flags below; the test-frame cell uses the same
# names, so they are kept as-is). Codes are shifted by +1 so that missing
# values, which pd.Categorical encodes as -1, become 0.
non_numeric_columns = [
    column_name
    for column_name in train.columns
    if not pd.api.types.is_numeric_dtype(train[column_name])
]
for column_name in non_numeric_columns:
    raw_values = train[column_name]
    train[column_name + "is_missing"] = raw_values.isna()
    train[column_name] = pd.Categorical(raw_values).codes + 1

# Step 2: for numeric columns that contain gaps, record a "<column>_is_missing"
# flag and fill the gaps with that column's median.
numeric_columns_with_gaps = [
    column_name
    for column_name in train.columns
    if pd.api.types.is_numeric_dtype(train[column_name])
    and train[column_name].isna().any()
]
for column_name in numeric_columns_with_gaps:
    raw_values = train[column_name]
    train[column_name + "_is_missing"] = raw_values.isna()
    train[column_name] = raw_values.fillna(raw_values.median())
        

In [None]:
# Separate the label (y) from the feature matrix (X = every other column).
y = train["Survived"]
X = train.drop(columns=["Survived"])

In [None]:
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Candidate classifiers, keyed by the label shown in the comparison plot.
# random_state is pinned on the estimators that use randomness so the score
# comparison is reproducible from run to run. max_iter is raised for
# LogisticRegression because the features are unscaled and the default
# iteration cap is likely to be hit before the solver converges.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Extra Trees": ExtraTreesClassifier(random_state=42),
    "Gradiant Boosting": GradientBoostingClassifier(random_state=42),
    "XGB": XGBClassifier(random_state=42)
}
# Helper that trains every candidate model and collects its evaluation score
def fit_and_score(models,X_train,X_test,y_train,y_test):
    """Fit each model on the training split and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit(X, y)`` and
        ``score(X, y)``. The estimators are fitted in place.
    X_train, y_train
        Features and labels passed to ``fit``.
    X_test, y_test
        Features and labels passed to ``score``.

    Returns
    -------
    dict
        Maps each display name to the value returned by that model's
        ``score``, in the same order as ``models``.
    """
    def _train_then_evaluate(estimator):
        # Fit first, then score; the return value of fit() is deliberately ignored.
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {
        label: _train_then_evaluate(estimator)
        for label, estimator in models.items()
    }

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# random_state is fixed so that every run (and every model being compared)
# is scored on the same split; without it the reported scores change on each
# execution of the notebook.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# Fit every candidate model on the training split and record its hold-out score.
scores = fit_and_score(models, X_train, X_test, y_train, y_test)

In [None]:
# One-row frame of scores (row label "Accuracy"), transposed so that each
# model becomes one bar in the chart.
model_compare = pd.DataFrame(data=scores, index=["Accuracy"])
model_compare.transpose().plot(kind="bar")

In [None]:
# Display the raw score dictionary (model name -> hold-out score).
scores

In [None]:
# Train a fresh XGBClassifier (default settings) on the training split and
# report its score on the hold-out split. This is a new estimator, separate
# from the "XGB" entry in `models`; it is the one used for the final
# predictions below.
ideal_model = XGBClassifier()
ideal_model.fit(X_train,y_train)
ideal_model.score(X_test,y_test)

In [None]:
# Predict labels for the hold-out split.
# NOTE(review): `y_preds` is not referenced again in the cells visible here.
y_preds = ideal_model.predict(X_test)


In [None]:
# Prepare the test frame for prediction by repeating the encoding and
# imputation steps that were applied to `train`.
# Note: the category codes and medians below are computed from the test frame
# itself, independently of the values derived from `train`.

# Step 1: integer-encode every non-numeric column, storing a boolean
# "<column>is_missing" flag first; +1 shifts the -1 code for missing values to 0.
for column_name in [
    name for name in test.columns
    if not pd.api.types.is_numeric_dtype(test[name])
]:
    raw_values = test[column_name]
    test[column_name + "is_missing"] = raw_values.isna()
    test[column_name] = pd.Categorical(raw_values).codes + 1

# Step 2: for numeric columns with gaps, add a "<column>_is_missing" flag and
# fill the gaps with the column median. The column list is snapshotted so the
# flags added inside the loop are not visited.
for column_name in list(test.columns):
    values = test[column_name]
    if not pd.api.types.is_numeric_dtype(values):
        continue
    if values.isna().any():
        test[column_name + "_is_missing"] = values.isna()
        test[column_name] = values.fillna(values.median())
                

In [None]:
# Confirm that no missing values remain in the processed test frame.
test.isna().sum()

In [None]:
# Preview the processed test frame.
test.head()

In [None]:
# Preview the processed training frame (presumably to compare its columns with the test frame).
train.head()

In [None]:
# Remove the "Fare_is_missing" flag from the test frame before predicting
# (presumably because the training features have no such column).
# The frame is reassigned instead of using inplace=True, and errors="ignore"
# makes the cell safe to re-run: the original in-place drop raised KeyError on
# a second execution because the column was already gone.
test = test.drop(columns="Fare_is_missing", errors="ignore")

In [None]:
# Predict on the processed test frame with the fitted XGB model.
ypred = ideal_model.predict(test)
# Number of predictions produced (displayed as the cell output).
ypred.size

In [None]:
# Assemble the submission table (passenger id plus int32 prediction), write it
# to CSV without the index column, and display it.
submission = pd.DataFrame(
    {
        "PassengerId": test["PassengerId"],
        "Survived": ypred.astype(np.int32),
    }
)
submission.to_csv("submission3.csv", index=False)
submission