[View in Colaboratory](https://colab.research.google.com/github/shashank2806/kaggle-titanic/blob/improved-accuracy/titanic_xgboost.ipynb)


# Titanic: Machine Learning from Disaster

Predict survival on the Titanic with machine learning.

In [0]:
# Upload the dataset from the local machine to Colab.
# `files` is reused near the end of the notebook to download the submission file.
from google.colab import files
files.upload()

In [0]:
# Unzip the uploaded archive (shell command run through IPython's `!` escape)
!unzip titanic-kaggle.zip

In [5]:
# import dependencies
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

train_file_path = 'train.csv'

titanic_data = pd.read_csv(train_file_path)

# target object
y = titanic_data.Survived

# input features
# Keep each column in exactly one list so it is not carried into the model
# inputs twice when the numeric and categorical parts are recombined
# ('Parch' used to appear in both lists).
numeric_features = ['Pclass', 'Age', 'Parch', 'Fare']
categorical_features = ['Sex', 'SibSp', 'Embarked']
X_numeric = titanic_data[numeric_features]
X_categorical = titanic_data[categorical_features]
X_numeric.head()

Unnamed: 0,Pclass,Age,Parch,Fare
0,3,22.0,0,7.25
1,1,38.0,0,71.2833
2,3,26.0,0,7.925
3,1,35.0,0,53.1
4,3,35.0,0,8.05


In [4]:
# List the columns available in the training data
titanic_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# Number of missing values in each column
print(titanic_data.isnull().sum())
# Number of distinct values in each column.
# `nunique()` already returns the per-column counts; the previous `.sum` was an
# uncalled method reference, so a bound-method repr was printed instead.
print(titanic_data.nunique())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
<bound method Series.sum of PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64>


In [0]:
# One-hot encode the categorical training features
one_hot_encoded_features_train = pd.get_dummies(X_categorical)

# Load the test set and split it into the same numeric / categorical parts as the training set
test_file_path = 'test.csv'
test_data = pd.read_csv(test_file_path)
test_data_categorical = test_data[categorical_features]
test_data_numeric = test_data[numeric_features]
one_hot_encoded_features_test = pd.get_dummies(test_data_categorical)

# Align the encoded test columns to the encoded training columns
# (join='left' keeps the training frame's column set)
(aligned_one_hot_encoded_features_train,
 aligned_one_hot_encoded_features_test) = one_hot_encoded_features_train.align(
    one_hot_encoded_features_test, join='left', axis=1)

In [7]:
def _combine_features(numeric_part, one_hot_part):
    """Join numeric and encoded feature frames side by side.

    The frames are concatenated column-wise rather than added arithmetically:
    `DataFrame.add` sums any column present in both frames, which silently
    doubles a shared column. If a column name occurs in both inputs, only the
    first occurrence (the one from `numeric_part`) is kept.

    Columns are returned in sorted order and cast to float, matching the
    layout the arithmetic `add` used to produce.
    """
    combined = pd.concat([numeric_part, one_hot_part], axis=1)
    # Drop repeated column names, keeping the first occurrence.
    combined = combined.loc[:, ~combined.columns.duplicated()]
    return combined.sort_index(axis=1).astype(float)


X = _combine_features(X_numeric, aligned_one_hot_encoded_features_train)
X_test = _combine_features(test_data_numeric, aligned_one_hot_encoded_features_test)
# Remaining missing values per feature (to be imputed next)
X.isnull().sum()

Age           177
Embarked_C      0
Embarked_Q      0
Embarked_S      0
Fare            0
Parch           0
Pclass          0
Sex_female      0
Sex_male        0
SibSp           0
dtype: int64

In [0]:
# Impute missing values (the imputer's default strategy fills with the column mean).
# `Imputer` was removed from sklearn.preprocessing in newer scikit-learn releases;
# `SimpleImputer` is its replacement, so prefer it and fall back on older installs.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

my_imputer = Imputer()
# Learn the fill values from the training features only ...
X_imputed = my_imputer.fit_transform(X)
# ... and reuse them for the test features. Refitting on the test set would
# fill train and test with different values and use test-set statistics.
X_test_imputed = my_imputer.transform(X_test)

In [0]:
# The imputer returned NumPy arrays, so wrap both results in DataFrames again
X_imputed, X_test_imputed = pd.DataFrame(X_imputed), pd.DataFrame(X_test_imputed)

In [10]:
# The DataFrame rebuilt from the imputed array has plain integer column labels:
# the original column titles were lost.
X_imputed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,22.0,0.0,0.0,1.0,7.25,0.0,3.0,0.0,1.0,1.0
1,38.0,1.0,0.0,0.0,71.2833,0.0,1.0,1.0,0.0,1.0
2,26.0,0.0,0.0,1.0,7.925,0.0,3.0,1.0,0.0,0.0
3,35.0,0.0,0.0,1.0,53.1,0.0,1.0,1.0,0.0,1.0
4,35.0,0.0,0.0,1.0,8.05,0.0,3.0,0.0,1.0,0.0


In [0]:
# Imputation does not change the column order, so the original labels
# can be reattached directly.
X_imputed.columns, X_test_imputed.columns = X.columns, X_test.columns

In [13]:
# Confirm that the column titles are back in place
X_imputed.head()

Unnamed: 0,Age,Embarked_C,Embarked_Q,Embarked_S,Fare,Parch,Pclass,Sex_female,Sex_male,SibSp
0,22.0,0.0,0.0,1.0,7.25,0.0,3.0,0.0,1.0,1.0
1,38.0,1.0,0.0,0.0,71.2833,0.0,1.0,1.0,0.0,1.0
2,26.0,0.0,0.0,1.0,7.925,0.0,3.0,1.0,0.0,0.0
3,35.0,0.0,0.0,1.0,53.1,0.0,1.0,1.0,0.0,1.0
4,35.0,0.0,0.0,1.0,8.05,0.0,3.0,0.0,1.0,0.0


Now that we have taken care of the missing values, we are good to go further.

In [0]:
# Split into training and validation data (80% / 20%).
# A fixed random_state makes the split, and therefore the accuracies reported
# below, reproducible across kernel restarts.
train_X, val_X, train_y, val_y = train_test_split(
    X_imputed, y, test_size=0.2, random_state=0)

In [15]:
# Gradient-boosted classifier fitted on the training split.
# (A LogisticRegression(C=1e3) baseline was tried here earlier.)
from xgboost import XGBClassifier

titanic_model = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.005,
)
titanic_model.fit(train_X, train_y, verbose=False)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.005, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [14]:
# Report accuracy on the training split and on the held-out validation split
train_accuracy = titanic_model.score(train_X, train_y)
val_accuracy = titanic_model.score(val_X, val_y)
for label, accuracy in (('train_accuracy: ', train_accuracy),
                        ('val_accuracy: ', val_accuracy)):
    print(label, accuracy)

train_accuracy:  0.9353932584269663
val_accuracy:  0.7988826815642458


  if diff:
  if diff:


In [16]:
# Refit the same configuration on all labelled rows (training + validation)
# before predicting on the test set.
titanic_model_on_full_data = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.005,
)
titanic_model_on_full_data.fit(X_imputed, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.005, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [17]:
# Predict on the imputed test features with the full-data model;
# these predictions are what gets submitted.
test_preds = titanic_model_on_full_data.predict(X_test_imputed)

  if diff:


In [0]:
# Build the submission table in the format the competition scores:
# one row per test passenger, pairing its PassengerId with the prediction.
submission_columns = {
    'PassengerId': test_data['PassengerId'],
    'Survived': test_preds,
}
output = pd.DataFrame(submission_columns)

In [0]:
# Write the submission table to CSV, leaving out the DataFrame index
output.to_csv(path_or_buf='submission.csv', index=False)

In [0]:
# Download submission.csv from Colab, using the `files` helper imported
# in the upload cell at the top of the notebook.
files.download('submission.csv')

In [0]:
# List the files in the working directory
!ls

gender_submission.csv  submission.csv  titanic-kaggle.zip
sample_data	       test.csv        train.csv


This submission scored around 76% accuracy.


In [26]:
# Install the `kaggle` package.
# NOTE(review): the version is not pinned — consider pinning it for reproducibility.
!pip install kaggle

Collecting kaggle
[?25l  Downloading https://files.pythonhosted.org/packages/c6/78/832b9a9ec6b3baf8ec566e1f0a695f2fd08d2c94a6797257a106304bfc3c/kaggle-1.4.7.1.tar.gz (52kB)
[K    100% |████████████████████████████████| 61kB 4.1MB/s 
Collecting python-slugify (from kaggle)
  Downloading https://files.pythonhosted.org/packages/00/ad/c778a6df614b6217c30fe80045b365bfa08b5dd3cb02e8b37a6d25126781/python-slugify-1.2.6.tar.gz
Collecting Unidecode>=0.04.16 (from python-slugify->kaggle)
[?25l  Downloading https://files.pythonhosted.org/packages/59/ef/67085e30e8bbcdd76e2f0a4ad8151c13a2c5bce77c85f8cad6e1f16fb141/Unidecode-1.0.22-py2.py3-none-any.whl (235kB)
[K    100% |████████████████████████████████| 235kB 7.5MB/s 
[?25hBuilding wheels for collected packages: kaggle, python-slugify
  Running setup.py bdist_wheel for kaggle ... [?25l- \ done
[?25h  Stored in directory: /root/.cache/pip/wheels/44/2c/df/22a6eeb780c36c28190faef6252b739fdc47145fd87a6642d4
  Running setup.py bdist_wheel for

In [32]:
# List the files in the working directory again
!ls

gender_submission.csv  submission.csv  titanic-kaggle.zip
sample_data	       test.csv        train.csv
