# Ensemble of all the models

In [1]:
import os 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
import Cython

import torch
#import torch.nn as nn
#import torch.nn.functional as F
from torch.autograd import Variable
#from torch.utils.data import Dataset, DataLoader

from helpers import *
from data import create_csv_submission

In [2]:
# Data and model directories, relative to the notebook's working directory.
DATA_PATH = '../twitter-datasets/'
MODEL_PATH = '../models/'

# Full training corpora and the test file.
# NOTE(review): earlier comments claimed 2'500'000 tweets per file, but the
# combined training set loaded from both files reports 2'500'000 items in total.
TRAIN_NEG_PATH = os.path.join(DATA_PATH, 'train_neg_full.txt') # negative tweets
TRAIN_POS_PATH = os.path.join(DATA_PATH, 'train_pos_full.txt') # positive tweets
TEST_PATH = os.path.join(DATA_PATH, 'test_data.txt')

In [4]:
# Load the labelled training tweets (positive file passed first, then negative)
# and the test tweets. The loaders are not defined in this notebook; presumably
# they come from the star import of `helpers` — confirm there for the exact format.
x_text_train, y_train_full = load_data_and_labels(TRAIN_POS_PATH, TRAIN_NEG_PATH)
x_text_test = load_test_data(TEST_PATH)

In [5]:
len(x_text_train)

2500000

In [6]:
# Size handed to the prediction helpers together with `word_vectors`
# (presumably the embedding dimensionality — confirm against the saved vectors).
vector_length = 100

In [7]:
# Load the previously saved word vectors from the model directory.
word_vectors_path = os.path.join(MODEL_PATH, 'twitter_word_vectors.bin')
word_vectors = gensim.models.keyedvectors.KeyedVectors.load(word_vectors_path)

In [8]:
# Fit TF-IDF statistics on train and test tweets together. The identity analyzer
# makes the vectorizer use each tweet's items as its terms, without any further
# tokenisation by scikit-learn.
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=1)
matrix = vectorizer.fit_transform(x_text_train + x_text_test)

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Map every term to its inverse document frequency.
tfidf = dict(zip(feature_names, vectorizer.idf_))

In [9]:
# Keep a random 0.5% of the training data and discard the other 99.5%; the
# fixed seed makes the subsample reproducible.
# NOTE: the inputs are overwritten, so re-running this cell subsamples again.
train_split = train_test_split(
    x_text_train,
    y_train_full,
    test_size=0.995,
    random_state=42,
)
x_text_train, y_train_full = train_split[0], train_split[2]
del train_split

In [10]:
len(x_text_train)

12500

In [11]:
# Padded length shared by train and test tweets (hard-coded, not measured here).
sequence_length = 74
print('Maximum sequence length of train and test data:', sequence_length)

# Pad both splits with the same padding token and target length.
pad_options = dict(padding_word="<PAD/>", sequence_length=sequence_length)
x_text_train_pad = pad_sentences(x_text_train, **pad_options)
x_text_test_pad = pad_sentences(x_text_test, **pad_options)

# The unpadded tweets are not needed any more.
del x_text_train, x_text_test

Maximum sequence length of train and test data: 74


In [12]:
def get_predictions(model, x, word_vectors, vector_length):
    """Return the predicted class index for every tweet in `x`.

    The tweets are turned into an input tensor by `get_tweets_tensor`, pushed
    through `model` in one forward pass, and the index of the largest output
    along dimension 1 is taken as the prediction.

    Args:
        model: callable (normally a torch module) returning one row of class
            scores per tweet.
        x: tweets in the form expected by `get_tweets_tensor`.
        word_vectors: word-vector lookup forwarded to `get_tweets_tensor`.
        vector_length: vector size forwarded to `get_tweets_tensor`.

    Returns:
        1-D numpy array of predicted class indices, one entry per tweet.
    """
    # An unpickled model keeps the mode it was saved in. Switch to evaluation
    # mode so dropout / batch-norm layers are deterministic during inference,
    # and restore the caller's mode afterwards.
    was_training = getattr(model, 'training', False)
    if was_training:
        model.eval()
    try:
        test_output = model(Variable(get_tweets_tensor(x, word_vectors, vector_length)))
    finally:
        if was_training:
            model.train()
    # .cpu() makes the conversion work wherever the output lives; reshape(-1)
    # (instead of squeeze) keeps the result 1-D even for a single tweet.
    return torch.max(test_output, 1)[1].data.cpu().numpy().reshape(-1)

In [13]:
def get_predictions_tfidf(model, x, word_vectors, vector_length, tfidf):
    """Return the predicted class index for every tweet, using TF-IDF inputs.

    Same as the plain prediction helper, except that the input tensor is built
    by `get_tweets_tensor_tfidf`, which additionally receives `tfidf`.

    Args:
        model: callable (normally a torch module) returning one row of class
            scores per tweet.
        x: tweets in the form expected by `get_tweets_tensor_tfidf`.
        word_vectors: word-vector lookup forwarded to `get_tweets_tensor_tfidf`.
        vector_length: vector size forwarded to `get_tweets_tensor_tfidf`.
        tfidf: term -> weight mapping forwarded to `get_tweets_tensor_tfidf`.

    Returns:
        1-D numpy array of predicted class indices, one entry per tweet.
    """
    # An unpickled model keeps the mode it was saved in. Switch to evaluation
    # mode so dropout / batch-norm layers are deterministic during inference,
    # and restore the caller's mode afterwards.
    was_training = getattr(model, 'training', False)
    if was_training:
        model.eval()
    try:
        test_output = model(Variable(get_tweets_tensor_tfidf(x, word_vectors, vector_length, tfidf)))
    finally:
        if was_training:
            model.train()
    # .cpu() makes the conversion work wherever the output lives; reshape(-1)
    # (instead of squeeze) keeps the result 1-D even for a single tweet.
    return torch.max(test_output, 1)[1].data.cpu().numpy().reshape(-1)

## Model 1

In [14]:
# Load the complete Model 1 object (it is called directly for prediction below).
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model1 = torch.load('./ensemble_models/model1.pth')



In [15]:
# Model 1 class predictions for the padded training subsample.
y_train_pred1 = get_predictions(model1, x_text_train_pad, word_vectors, vector_length)

In [16]:
# Share of training-subsample tweets where Model 1's prediction matches the label.
accuracy_score(y_true=y_train_full, y_pred=y_train_pred1)

0.88200000000000001

In [17]:
# Model 1 class predictions for the padded test tweets.
y_test_pred1 = get_predictions(model1, x_text_test_pad, word_vectors, vector_length)

In [18]:
# Drop the reference to Model 1 now that its predictions are stored.
del model1

## Model 2

In [25]:
# NOTE(review): start of a scratch experiment with np.vstack; `x` is not
# referenced by any other visible cell, so these cells look safe to delete.
x = np.empty((0,3))

In [26]:
# NOTE(review): scratch check that np.vstack appends one row per call to `x`;
# unrelated to the Model 2 predictions and a candidate for removal.
y = np.array([1,2,3])
x = np.vstack([x, y])
x = np.vstack([x, y])

In [27]:
x

array([[ 1.,  2.,  3.],
       [ 1.,  2.,  3.]])

In [19]:
# Model 2 consists of ten saved checkpoints (model2.0 ... model2.9). Each one is
# loaded, used to predict the padded train and test tweets, and released again
# before the next one is loaded, so only one checkpoint is referenced at a time.
# NOTE: torch.load unpickles the files, so only load checkpoints you trust.
model2_train_preds = []
model2_test_preds = []
for checkpoint_idx in range(10):
    model2 = torch.load('./ensemble_models/model2.{}.pth'.format(checkpoint_idx))
    model2_train_preds.append(get_predictions(model2, x_text_train_pad, word_vectors, vector_length))
    model2_test_preds.append(get_predictions(model2, x_text_test_pad, word_vectors, vector_length))
    del model2

# Keep the per-checkpoint variable names that the stacking cell refers to.
(y_train_pred2_0, y_train_pred2_1, y_train_pred2_2, y_train_pred2_3, y_train_pred2_4,
 y_train_pred2_5, y_train_pred2_6, y_train_pred2_7, y_train_pred2_8, y_train_pred2_9) = model2_train_preds

(y_test_pred2_0, y_test_pred2_1, y_test_pred2_2, y_test_pred2_3, y_test_pred2_4,
 y_test_pred2_5, y_test_pred2_6, y_test_pred2_7, y_test_pred2_8, y_test_pred2_9) = model2_test_preds

In [54]:
# Stack the ten Model 2 prediction vectors row-wise (one row per checkpoint).
model2_train_rows = [
    y_train_pred2_0, y_train_pred2_1, y_train_pred2_2, y_train_pred2_3, y_train_pred2_4,
    y_train_pred2_5, y_train_pred2_6, y_train_pred2_7, y_train_pred2_8, y_train_pred2_9,
]
y_train_pred2 = np.vstack(model2_train_rows)

# Same layout for the test predictions.
model2_test_rows = [
    y_test_pred2_0, y_test_pred2_1, y_test_pred2_2, y_test_pred2_3, y_test_pred2_4,
    y_test_pred2_5, y_test_pred2_6, y_test_pred2_7, y_test_pred2_8, y_test_pred2_9,
]
y_test_pred2 = np.vstack(model2_test_rows)

In [77]:
# Collapse each stack of per-checkpoint predictions into one value per tweet by
# taking the median over the checkpoint axis. The variable names are reused for
# the result, so only reduce while the array is still 2-D: this makes the cell
# safe to re-run (a second median over the 1-D result would yield a scalar).
# NOTE: with an even number of rows, a tied vote produces 0.5 rather than 0 or 1.
if y_train_pred2.ndim == 2:
    y_train_pred2 = np.median(y_train_pred2, axis=0)
if y_test_pred2.ndim == 2:
    y_test_pred2 = np.median(y_test_pred2, axis=0)

## Model 3

In [29]:
# Load the complete Model 3 object (it is called directly for prediction below).
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model3 = torch.load('./ensemble_models/model3.pth')

In [30]:
# Model 3 class predictions for the padded training subsample.
y_train_pred3 = get_predictions(model3, x_text_train_pad, word_vectors, vector_length)

In [31]:
# Share of training-subsample tweets where Model 3's prediction matches the label.
accuracy_score(y_true=y_train_full, y_pred=y_train_pred3)

0.87575999999999998

In [32]:
# Model 3 class predictions for the padded test tweets.
y_test_pred3 = get_predictions(model3, x_text_test_pad, word_vectors, vector_length)

In [33]:
# Drop the reference to Model 3 now that its predictions are stored.
del model3

## Model 4

In [34]:
# Load the complete Model 4 object (it is called directly for prediction below).
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model4 = torch.load('./ensemble_models/model4.pth')

In [35]:
# Model 4 class predictions for the padded training subsample; this model is fed
# through the TF-IDF variant of the prediction helper.
y_train_pred4 = get_predictions_tfidf(model4, x_text_train_pad, word_vectors, vector_length, tfidf)

In [36]:
# Share of training-subsample tweets where Model 4's prediction matches the label.
accuracy_score(y_true=y_train_full, y_pred=y_train_pred4)

0.83320000000000005

In [37]:
# Model 4 class predictions for the padded test tweets (TF-IDF helper variant).
y_test_pred4 = get_predictions_tfidf(model4, x_text_test_pad, word_vectors, vector_length, tfidf)

In [38]:
# Drop the reference to Model 4 now that its predictions are stored.
del model4

## Model 5

In [39]:
# Load the complete Model 5 object (it is called directly for prediction below).
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model5 = torch.load('./ensemble_models/model5.pth')

In [40]:
# Model 5 class predictions for the padded training subsample; this model is fed
# through the TF-IDF variant of the prediction helper.
y_train_pred5 = get_predictions_tfidf(model5, x_text_train_pad, word_vectors, vector_length, tfidf)

In [41]:
# Share of training-subsample tweets where Model 5's prediction matches the label.
accuracy_score(y_true=y_train_full, y_pred=y_train_pred5)

0.78727999999999998

In [42]:
# Model 5 class predictions for the padded test tweets (TF-IDF helper variant).
y_test_pred5 = get_predictions_tfidf(model5, x_text_test_pad, word_vectors, vector_length, tfidf)

In [43]:
# Drop the reference to Model 5 now that its predictions are stored.
del model5

## Ensemble

In [106]:
# Feature matrix for the meta-classifier: after the transpose there is one row
# per training tweet and one column per base model.
base_train_preds = [
    y_train_pred1,
    y_train_pred2,
    y_train_pred3,
    y_train_pred4,
    y_train_pred5,
]
ensemble_train = np.vstack(base_train_preds).T

In [107]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [108]:
# Random forest that is fitted on the base models' predictions.
# NOTE(review): oob_score=True also applies to every grid-search fit; with only
# a few trees this is likely what produces the repeated
# "Some inputs do not have OOB scores" warnings in the recorded output.
clf = RandomForestClassifier(criterion='gini', oob_score=True, random_state=42)

# 10 x 10 grid: both hyper-parameters sweep 10, 20, ..., 100.
grid_values = list(range(10, 101, 10))
param_grid = {
    "n_estimators": list(grid_values),
    "max_depth":    list(grid_values),
}

# Exhaustive search over the grid with 5-fold cross-validation.
rf_grid = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, verbose=1)

In [109]:
# Run the grid search on the stacked base-model predictions of the training subsample.
rf_grid.fit(ensemble_train, y_train_full)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  1.5min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [110]:
# Keep the best estimator found by the search and report its hyper-parameters.
best_rf = rf_grid.best_estimator_
print('Best hyperparameters:', rf_grid.best_params_)

Best hyperparameters: {'max_depth': 40, 'n_estimators': 20}


In [111]:
# Mean cross-validated score laid out as a grid. The shape is derived from the
# searched grid instead of being hard-coded, so the cell keeps working if the
# grid is resized. Rows should correspond to max_depth and columns to
# n_estimators (parameters are enumerated in sorted-name order with the last
# name varying fastest — confirm against rf_grid.cv_results_['params']).
rf_grid.cv_results_['mean_test_score'].reshape(
    (len(rf_grid.param_grid['max_depth']), len(rf_grid.param_grid['n_estimators']))
)

array([[ 0.89768,  0.89856,  0.89856,  0.89856,  0.89856,  0.89856,
         0.89856,  0.89856,  0.89856,  0.89856],
       [ 0.89856,  0.89856,  0.89856,  0.89856,  0.8976 ,  0.89856,
         0.89856,  0.89856,  0.89856,  0.89856],
       [ 0.89856,  0.89856,  0.8976 ,  0.89856,  0.89856,  0.89856,
         0.89856,  0.89856,  0.89856,  0.89856],
       [ 0.89856,  0.89888,  0.89856,  0.89856,  0.89856,  0.89856,
         0.89856,  0.89856,  0.89856,  0.89856],
       [ 0.89856,  0.89856,  0.89864,  0.89856,  0.89856,  0.89856,
         0.89856,  0.89856,  0.89856,  0.89856],
       [ 0.89856,  0.8976 ,  0.89856,  0.89856,  0.8976 ,  0.89856,
         0.89856,  0.89856,  0.89856,  0.89856],
       [ 0.89856,  0.89856,  0.89856,  0.89856,  0.89856,  0.89856,
         0.89856,  0.89856,  0.89856,  0.89856],
       [ 0.89856,  0.89856,  0.89856,  0.89856,  0.89856,  0.89856,
         0.89856,  0.89856,  0.89856,  0.89856],
       [ 0.89856,  0.89864,  0.89856,  0.89856,  0.89856,  0.898

In [112]:
# Feature matrix for the final prediction: after the transpose there is one row
# per test tweet and one column per base model, in the training column order.
base_test_preds = [
    y_test_pred1,
    y_test_pred2,
    y_test_pred3,
    y_test_pred4,
    y_test_pred5,
]
ensemble_test = np.vstack(base_test_preds).T

In [113]:
# Final ensemble prediction: the selected random forest applied to the stacked
# base-model predictions for the test tweets.
final_preds = best_rf.predict(ensemble_test)

In [114]:
# Recode class 0 as -1 in place; every other prediction is left unchanged.
is_zero_class = final_preds == 0
final_preds[is_zero_class] = -1

In [115]:
final_preds

array([-1, -1, -1, ..., -1,  1, -1])

In [116]:
final_preds.mean()

0.062799999999999995

In [117]:
best_rf.oob_score_

0.89895999999999998

In [None]:
# NOTE(review): unexecuted cell holding only a bare number (presumably a score
# noted from an earlier run); it has no effect and can be removed.
0.89824000000000004

In [118]:
# Write the submission file. Ids start at 1 and are derived from the number of
# predictions instead of a hard-coded 10000, so ids and predictions always have
# the same length.
submission_ids = np.arange(1, len(final_preds) + 1)
create_csv_submission(submission_ids, final_preds, 'kaggle_ensemble.csv')