# Ensemble of all the models

In [2]:
import os 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
import Cython

import torch
#import torch.nn as nn
#import torch.nn.functional as F
from torch.autograd import Variable
#from torch.utils.data import Dataset, DataLoader

from helpers import *
from data import create_csv_submission

In [3]:
DATA_PATH = '../twitter-datasets/'
MODEL_PATH = '../models/'

TRAIN_NEG_PATH = os.path.join(DATA_PATH, 'train_neg_full.txt') # 2'500'000 negative tweets
TRAIN_POS_PATH = os.path.join(DATA_PATH, 'train_pos_full.txt') # 2'500'000 positive tweets
TEST_PATH = os.path.join(DATA_PATH, 'test_data.txt')

In [4]:
x_text_train, y_train_full = load_data_and_labels(TRAIN_POS_PATH, TRAIN_NEG_PATH)
x_text_test = load_test_data(TEST_PATH)

In [5]:
len(x_text_train)

2500000

In [6]:
vector_length = 100

In [8]:
word_vectors = gensim.models.keyedvectors.KeyedVectors.load(MODEL_PATH + 'twitter_word_vectors.bin')

In [9]:
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=1)
matrix = vectorizer.fit_transform([x for x in x_text_train + x_text_test])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

In [10]:
x_text_train, _, y_train_full, _ = train_test_split(x_text_train, y_train_full, test_size=0.995, random_state=42)
del _

In [11]:
len(x_text_train)

12500

In [12]:
sequence_length = 74
print('Maximum sequence length of train and test data:', sequence_length)

x_text_train_pad = pad_sentences(x_text_train, padding_word="<PAD/>", sequence_length=sequence_length)
x_text_test_pad = pad_sentences(x_text_test, padding_word="<PAD/>", sequence_length=sequence_length)

del x_text_train
del x_text_test

Maximum sequence length of train and test data: 74


In [15]:
len(x_text_train_pad)

12500

In [16]:
def get_predictions(model, x, word_vectors, vector_length):
    test_output = model(Variable(get_tweets_tensor(x, word_vectors, vector_length)))
    return torch.max(test_output, 1)[1].data.numpy().squeeze()

In [17]:
def get_predictions_tfidf(model, x, word_vectors, vector_length, tfidf):
    test_output = model(Variable(get_tweets_tensor_tfidf(x, word_vectors, vector_length, tfidf)))
    return torch.max(test_output, 1)[1].data.numpy().squeeze()

## Model 1

In [18]:
model1 = torch.load('./ensemble_models/model1.pth')

In [19]:
y_train_pred1 = get_predictions(model1, x_text_train_pad, word_vectors, vector_length)

In [20]:
accuracy_score(y_train_pred1, y_train_full)

0.88200000000000001

In [21]:
y_test_pred1 = get_predictions(model1, x_text_test_pad, word_vectors, vector_length)

In [22]:
del model1

## Model 2

In [23]:
x = np.empty((0,3))

In [24]:
y = np.array([1,2,3])
x = np.vstack([x, y])
x = np.vstack([x, y])

In [25]:
x

array([[ 1.,  2.,  3.],
       [ 1.,  2.,  3.]])

In [26]:
model2 = torch.load('./ensemble_models/model2.0.pth')
y_train_pred2_0 = get_predictions(model2, x_text_train_pad, word_vectors, vector_length)
y_test_pred2_0 = get_predictions(model2, x_text_test_pad, word_vectors, vector_length)
del model2

In [27]:
model2 = torch.load('./ensemble_models/model2.1.pth')
y_train_pred2_1 = get_predictions(model2, x_text_train_pad, word_vectors, vector_length)
y_test_pred2_1 = get_predictions(model2, x_text_test_pad, word_vectors, vector_length)
del model2

In [28]:
model2 = torch.load('./ensemble_models/model2.2.pth')
y_train_pred2_2 = get_predictions(model2, x_text_train_pad, word_vectors, vector_length)
y_test_pred2_2 = get_predictions(model2, x_text_test_pad, word_vectors, vector_length)
del model2

In [29]:
model2 = torch.load('./ensemble_models/model2.3.pth')
y_train_pred2_3 = get_predictions(model2, x_text_train_pad, word_vectors, vector_length)
y_test_pred2_3 = get_predictions(model2, x_text_test_pad, word_vectors, vector_length)
del model2

In [30]:
model2 = torch.load('./ensemble_models/model2.4.pth')
y_train_pred2_4 = get_predictions(model2, x_text_train_pad, word_vectors, vector_length)
y_test_pred2_4 = get_predictions(model2, x_text_test_pad, word_vectors, vector_length)
del model2

In [31]:
model2 = torch.load('./ensemble_models/model2.5.pth')
y_train_pred2_5 = get_predictions(model2, x_text_train_pad, word_vectors, vector_length)
y_test_pred2_5 = get_predictions(model2, x_text_test_pad, word_vectors, vector_length)
del model2

In [32]:
model2 = torch.load('./ensemble_models/model2.6.pth')
y_train_pred2_6 = get_predictions(model2, x_text_train_pad, word_vectors, vector_length)
y_test_pred2_6 = get_predictions(model2, x_text_test_pad, word_vectors, vector_length)
del model2

In [33]:
model2 = torch.load('./ensemble_models/model2.7.pth')
y_train_pred2_7 = get_predictions(model2, x_text_train_pad, word_vectors, vector_length)
y_test_pred2_7 = get_predictions(model2, x_text_test_pad, word_vectors, vector_length)
del model2

In [34]:
model2 = torch.load('./ensemble_models/model2.8.pth')
y_train_pred2_8 = get_predictions(model2, x_text_train_pad, word_vectors, vector_length)
y_test_pred2_8 = get_predictions(model2, x_text_test_pad, word_vectors, vector_length)
del model2

In [35]:
model2 = torch.load('./ensemble_models/model2.9.pth')
y_train_pred2_9 = get_predictions(model2, x_text_train_pad, word_vectors, vector_length)
y_test_pred2_9 = get_predictions(model2, x_text_test_pad, word_vectors, vector_length)
del model2

In [36]:
y_train_pred2 = np.vstack([y_train_pred2_0, y_train_pred2_1, y_train_pred2_2, \
                           y_train_pred2_3, y_train_pred2_4, y_train_pred2_5, \
                           y_train_pred2_6, y_train_pred2_7, y_train_pred2_8, y_train_pred2_9])

y_test_pred2 = np.vstack([y_test_pred2_0, y_test_pred2_1, y_test_pred2_2, \
                          y_test_pred2_3, y_test_pred2_4, y_test_pred2_5, \
                          y_test_pred2_6, y_test_pred2_7, y_test_pred2_8, y_test_pred2_9])

In [37]:
y_train_pred2 = np.median(y_train_pred2, axis=0)
y_test_pred2 = np.median(y_test_pred2, axis=0)

## Model 3

In [38]:
model3 = torch.load('./ensemble_models/model3.pth')

In [39]:
y_train_pred3 = get_predictions(model3, x_text_train_pad, word_vectors, vector_length)

In [40]:
accuracy_score(y_train_pred3, y_train_full)

0.87575999999999998

In [41]:
y_test_pred3 = get_predictions(model3, x_text_test_pad, word_vectors, vector_length)

In [42]:
del model3

## Model 4

In [43]:
model4 = torch.load('./ensemble_models/model4.pth')

In [44]:
y_train_pred4 = get_predictions_tfidf(model4, x_text_train_pad, word_vectors, vector_length, tfidf)

In [45]:
accuracy_score(y_train_pred4, y_train_full)

0.83320000000000005

In [46]:
y_test_pred4 = get_predictions_tfidf(model4, x_text_test_pad, word_vectors, vector_length, tfidf)

In [47]:
del model4

## Model 5

In [48]:
model5 = torch.load('./ensemble_models/model5.pth')

In [49]:
y_train_pred5 = get_predictions_tfidf(model5, x_text_train_pad, word_vectors, vector_length, tfidf)

In [50]:
accuracy_score(y_train_pred5, y_train_full)

0.78727999999999998

In [51]:
y_test_pred5 = get_predictions_tfidf(model5, x_text_test_pad, word_vectors, vector_length, tfidf)

In [52]:
del model5

In [258]:
model6 = torch.load('./ensemble_models/model6.pth')
y_train_pred6 = get_predictions(model6, x_text_train_pad, word_vectors, vector_length)
accuracy_score(y_train_pred6, y_train_full)
y_test_pred6 = get_predictions(model6, x_text_test_pad, word_vectors, vector_length)
del model6

## Ensemble

In [302]:
ensemble_train = np.vstack([y_train_pred1, y_train_pred2,  y_train_pred3, y_train_pred4, y_train_pred6]).T

In [303]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint as sp_randint

In [304]:
len(x_text_train_pad)

12500

In [305]:
int(np.sqrt(5))

2

In [317]:
clf = RandomForestClassifier(oob_score=True, criterion='entropy')

param_grid = {'n_estimators': sp_randint(10, 100),
#               'criterion': (['gini', 'entropy']),
              'max_depth': sp_randint(10, 90),
              'min_samples_split': sp_randint(100, 1000),
#               'min_samples_leaf': np.random.choice([1, 1000]),
#               'max_features': sp_randint(2, int(np.sqrt(5))),
#               'max_leaf_nodes': np.random.choice([None, 2, 100]),
              'random_state': sp_randint(0, 999),
#               'class_weight': np.random.choice([None, 'balanced'])
             }

rf_grid = RandomizedSearchCV(estimator=clf, param_distributions=param_grid, n_iter=100, cv=5, random_state=42)
rf_grid

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1937db278>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1937dbcc0>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1937db080>, 'random_state': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1937dbc18>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_sc

In [318]:
rf_grid.fit(ensemble_train, y_train_full)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1937db278>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1937dbcc0>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1937db080>, 'random_state': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1937dbc18>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_sc

In [319]:
print('Best hyperparameters:', rf_grid.best_params_)
best_rf = rf_grid.best_estimator_

Best hyperparameters: {'max_depth': 61, 'min_samples_split': 212, 'n_estimators': 71, 'random_state': 496}


In [320]:
# rf_grid.cv_results_['mean_test_score'].reshape((10,10))

In [330]:
ensemble_test = np.vstack([y_test_pred1, y_test_pred2,  y_test_pred3, y_test_pred4, y_test_pred6]).T

In [331]:
final_preds = best_rf.predict(ensemble_test)

In [332]:
final_preds[final_preds == 0] = -1

In [333]:
final_preds

array([-1, -1, -1, ..., -1,  1, -1])

In [337]:
final_preds.mean()

0.025000000000000001

In [338]:
best_rf.feature_importances_

array([ 0.1576496 ,  0.41565219,  0.29853216,  0.02369134,  0.10447471])

In [339]:
best_rf.oob_score_

0.90015999999999996

In [340]:
best_rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=61, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=212,
            min_weight_fraction_leaf=0.0, n_estimators=71, n_jobs=1,
            oob_score=True, random_state=496, verbose=0, warm_start=False)

In [341]:
0.89856000000000003

0.89856

In [342]:
create_csv_submission(np.arange(1,10001), final_preds, 'kaggle_ensemble.csv')