## Imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set()

## Get data

In [3]:
# Load the training features and labels, discarding the 'id' column of each file.
x_train_raw = pd.read_csv('../data/X_train.csv').drop(columns='id')
y_train_raw = pd.read_csv('../data/y_train.csv').drop(columns='id')
# Show the (rows, columns) shape of both frames, one per line.
print(x_train_raw.shape, y_train_raw.shape, sep='\n')

(4800, 1000)
(4800, 1)


In [4]:
# Number of training rows per value of the label column 'y'.
y_counts = y_train_raw['y'].value_counts()
y_counts

1    3600
2     600
0     600
Name: y, dtype: int64

## Cross validation

In [5]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score
import time

In [10]:
def score(true, pred):
    """Return the balanced accuracy of predictions ``pred`` against labels ``true``."""
    return balanced_accuracy_score(y_true=true, y_pred=pred)

def oversample(x_data, y_data, random_state=None):
    """Oversample every class except the majority class using SMOTE.

    Parameters
    ----------
    x_data : feature matrix passed to SMOTE.
    y_data : class labels aligned with the rows of ``x_data``.
    random_state : int or None, default None
        Seed forwarded to SMOTE. The default of None keeps the original
        unseeded behaviour; pass an int for reproducible synthetic samples.

    Returns
    -------
    The resampled ``(x, y)`` pair returned by SMOTE.
    """
    # `sampling_strategy` / `fit_resample` replace the deprecated (and since
    # removed) `ratio` / `fit_sample` spellings of the imbalanced-learn API.
    smote = SMOTE(sampling_strategy='not majority', random_state=random_state)
    return smote.fit_resample(x_data, y_data)

def run_fold(x_train, y_train, x_test, y_test, model):
    """Fit ``model`` on an oversampled training split and score it.

    Only the training split is passed through ``oversample``; the test split
    is used exactly as given. The train score is computed on the oversampled
    training data (the data the model was fitted on).

    Returns
    -------
    (test_score, train_score) as computed by ``score``.
    """
    x_resampled, y_resampled = oversample(x_train, y_train)
    model.fit(x_resampled, y_resampled)
    # Score on the fitted (oversampled) training data first, then on the held-out split.
    train_score = score(y_resampled, model.predict(x_resampled))
    test_score = score(y_test, model.predict(x_test))
    return test_score, train_score

def cross_validate(x_data, y_data, model, n_splits=10, random_state=None):
    """Run shuffled K-fold cross-validation, printing per-fold and summary results.

    Each fold is delegated to ``run_fold``, which receives the same ``model``
    object every time.

    Parameters
    ----------
    x_data : features; must support indexing with an integer index array.
    y_data : labels; must support indexing with an integer index array.
    model : estimator handed to ``run_fold`` for every fold.
    n_splits : int, default 10
        Number of folds.
    random_state : int or None, default None
        Seed for the KFold shuffle. It controls only the fold assignment;
        None keeps the original unseeded behaviour.

    Returns
    -------
    (test_scores, train_scores, times) : three lists with one entry per fold;
    ``times`` holds the unrounded elapsed seconds of each fold.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    test_scores = []
    train_scores = []
    times = []
    for split, (train_index, test_index) in enumerate(kf.split(x_data), start=1):
        print('Running split {}/{}'.format(split, n_splits))
        x_train, y_train = x_data[train_index], y_data[train_index]
        x_test, y_test = x_data[test_index], y_data[test_index]
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        test_score, train_score = run_fold(x_train, y_train, x_test, y_test, model)
        elapsed = time.perf_counter() - start_time
        test_scores.append(test_score)
        train_scores.append(train_score)
        times.append(elapsed)
        # Only the printed per-fold time is rounded; the stored value is exact.
        total_time = round(elapsed, ndigits=0)
        print('Test score = {}\nTrain score = {}\nTime = {}s\n'.format(test_score, train_score, total_time))
    print('Average test score: {}\nAverage train score: {}\nTotal time: {}s'.format(np.mean(test_scores), np.mean(train_scores), np.sum(times)))
    return test_scores, train_scores, times

In [7]:
# Plain numpy views of the training data: a 2-D feature array and a flattened 1-D label array.
x_data = x_train_raw.values
y_data = np.ravel(y_train_raw.values)

## Random forests

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Consider roughly one third of the features at each split. The feature count
# is taken from x_data (its number of columns), which is defined in an earlier
# cell, so this cell also runs on a fresh kernel.
rf = RandomForestClassifier(max_features=round(x_data.shape[1] / 3))
test_scores, train_scores, times = cross_validate(x_data, y_data, rf)

Running split 1/10




Test score = 0.5334557640674337
Train score = 0.9981441385709867
Time = 26.0s

Running split 2/10


## Boosting

In [12]:
import xgboost as xgb

In [None]:
# Cross-validate an XGBoost classifier with depth-3 trees and a 0.1 learning rate.
xgb_model = xgb.XGBClassifier(max_depth=3, learning_rate=0.1)
test_scores, train_scores, times = cross_validate(x_data, y_data, xgb_model)

Running split 1/10
Test score = 0.6736747634132997
Train score = 0.924994827229464
Time = 176.0s

Running split 2/10
Test score = 0.6112909921129099
Train score = 0.9256978653530378
Time = 181.0s

Running split 3/10


## Output predictions

In [None]:
# Disabled submission helper, kept for reference: it would read the test
# features, predict labels and write an id/y CSV to `name`.
# NOTE(review): as written, the body calls `rf.predict` instead of using the
# `model` argument, so the estimator passed in would be ignored if this is
# re-enabled unchanged.
# x_test_raw = pd.read_csv('../data/X_test.csv').drop('id', axis=1)
# def output_pred(model, name):
#     y_test_pred = rf.predict(x_test_raw)
#     output = pd.DataFrame({'id':[float(i) for i in range(0, len(x_test_raw))], 'y': y_test_pred})
#     output.to_csv(name,index=False)

In [None]:
# output_pred(rf, 'rf.csv')