In [1]:
import pandas as pd
import numpy as np

# preprocessing
from sklearn.preprocessing import LabelEncoder

# modeling
import xgboost as xgb

# validation
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import classification_report

ModuleNotFoundError: No module named 'xgboost'

In [48]:
def describe_categorical(X):
    print(X[X.columns[X.dtypes == 'object']].describe())
    
def cstats(y_test, y_test_pred):
    return roc_auc_score(y_test, y_test_pred)

def get_original_datasets(idx):
    global combined
    
    train0 = pd.read_csv('data/train.csv')
    
    targets = train0.Survived
    train = combined.head(idx)
    test = combined.iloc[idx:]
    
    return train, test, targets

def combined_dataset():
    train = pd.read_csv("data/train.csv")
    test = pd.read_csv("data/test.csv")
    targets = train.Survived
    train.drop('Survived', 1, inplace=True)
    combined = train.append(test)
    combined.reset_index(inplace=True)
    combined.drop('index', inplace=True, axis=1)
    
    return combined, train.shape[0]

combined, idx = combined_dataset()

In [36]:
types = combined.columns.to_series().groupby(combined.dtypes).groups
for k,v in types.items():
    print(k, v)

float64 ['Age', 'Fare']
int64 ['PassengerId', 'Pclass', 'SibSp', 'Parch']
object ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [37]:
describe_categorical(combined)

                        Name   Sex    Ticket        Cabin Embarked
count                   1309  1309      1309          295     1307
unique                  1307     2       929          186        3
top     Connolly, Miss. Kate  male  CA. 2343  C23 C25 C27        S
freq                       2   843        11            6      914


In [38]:
# missing values

combined.isnull().sum()

PassengerId       0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [39]:
combined.Age.fillna(combined.Age.mean(), inplace=True)
combined.Fare.fillna(combined.Fare.mean(), inplace=True)

freq_port = combined['Embarked'].mode()[0]
combined.Embarked.fillna(freq_port, inplace=True)

combined['Cabin'] = combined['Cabin'].fillna('X')

In [40]:
combined.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

convert categorical features with low unique values to numeric. I drop the others

In [41]:
combined.drop(['Name', 'Ticket', 'Cabin'], inplace=True, axis=1)

combined['Sex'] = combined['Sex'].map({'male':1,'female':0})
le = LabelEncoder()
combined['Embarked'] = le.fit_transform(combined['Embarked'])
combined['Embarked'].head()

0    2
1    0
2    2
3    2
4    2
Name: Embarked, dtype: int64

In [42]:
types = combined.columns.to_series().groupby(combined.dtypes).groups
for k,v in types.items():
    print(k, v)

float64 ['Age', 'Fare']
int64 ['PassengerId', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']


In [43]:
train, test, targets = get_original_datasets(idx)

In [44]:
X_train, X_test, y_train, y_test = train_test_split(train, targets, test_size=0.3, random_state=0)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(623, 8)
(268, 8)
(623,)
(268,)


In [46]:
model = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1).fit(X_train, y_train)

In [47]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [51]:
print('training set:', cstats(y_train, y_train_pred))
print('validation set:', cstats(y_test, y_test_pred))

training set: 0.976714171059
validation set: 0.790476190476


In [55]:
kfold = KFold(n_splits=10, random_state=7)
scores = cross_val_score(model, X_train, y_train, cv=kfold)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.78 (+/- 0.09)


In [58]:
print(classification_report(y_test, y_test_pred))

             precision    recall  f1-score   support

          0       0.83      0.88      0.86       168
          1       0.78      0.70      0.74       100

avg / total       0.81      0.81      0.81       268

