In [80]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

In [59]:
train_data = pd.read_csv('../data/titanic_train.csv')
test_data = pd.read_csv('../data/titanic_test.csv')

test_data['Survived'] = 0
test_data['Test'] = False
train_data['Test'] = True

In [5]:
train_data.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [12]:
all_data = pd.concat([train_data, test_data])
all_data.shape

(1309, 13)

In [11]:
all_data.columns

Index([u'Age', u'Cabin', u'Embarked', u'Fare', u'Name', u'Parch',
       u'PassengerId', u'Pclass', u'Sex', u'SibSp', u'Survived', u'Test',
       u'Ticket'],
      dtype='object')

In [41]:
numeric_columns = ['Age', 'Pclass', 'Fare', 'Parch', 'SibSp']
categorical_columns = ['Sex', 'Embarked']
drop_columns = ['Name', 'Cabin']

In [22]:
all_data = all_data.drop(drop_columns)
all_data[all_data['Test']].dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


### Test on raw data

In [25]:
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Test             0
dtype: int64

In [28]:
train_data.dropna(inplace=True)

In [29]:
train_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
Test              bool
dtype: object

In [78]:
numeric = train_data[numeric_columns]
categ = train_data[categorical_columns]
categ = pd.get_dummies(categ)

xgb_data = pd.concat([numeric, categ], axis=1)
target = train_data['Survived']

data_train, data_test, target_train, target_test = train_test_split(
    xgb_data, target, test_size=0.2, random_state=42)

In [51]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams

In [64]:
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, show_progress=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['Disbursed'], eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print "\nModel Report"
    print "Accuracy : %.4g" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions)
    print "AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob)
                    
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [67]:
xgb_data.dtypes

0    object
dtype: object

In [81]:
xgb1 = XGBClassifier()

xgb1.fit(data_train, target_train, eval_metric='auc')

accuracy_score(xgb1.predict(data_test), target_test)

0.7988826815642458