In [33]:
from model import UniSkip, Encoder
from data_loader import DataLoader
from vocab import load_dictionary
from config import *
from torch import nn
from torch.autograd import Variable
import torch
import pandas as pd
import random
from numpy import NaN
import os
import numpy as np
import string
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection
from sklearn import metrics
from sklearn.linear_model import Ridge
from scipy.stats import spearmanr, pearsonr

In [24]:
def RF_Regressor(X, y):
    #kfold
    kf = model_selection.KFold(n_splits=5) 
    kf.get_n_splits(X)
    #print(kf)
    #creating instance of RFRegressor 
    model1 = RandomForestRegressor(n_estimators=500,max_features='sqrt',n_jobs=-1,min_samples_leaf=60)#cross val split
    for train_index, test_index in kf.split(X):
        #print('TRAIN:', train_index, 'TEST:', test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]    
    #training
    model1.fit(X_train, y_train)
    #evaluating
    y_pred = model1.predict(X_test)
    #monotonic relationship as the relation between the variables is non linear
    spearman = spearmanr(y_test, y_pred)
    pearson = pearsonr(y_test, y_pred)
    print(f'Test data Spearman correlation: {spearman[0]:.3}')
    print(f'Test data pearson correlation: {pearson[0]:.3}')
    print('MSE')
    print(metrics.mean_squared_error(y_test, y_pred)) #mean square error
    print('R2')
    print(metrics.r2_score(y_test, y_pred)) #r2 score
    print('MAE')
    print(metrics.mean_absolute_error(y_test, y_pred)) #mae
    print('Variance Score')
    print(metrics.explained_variance_score(y_test, y_pred)) #mape

In [25]:
def ridge_regression(X, y):
    #kfold
    kf = model_selection.KFold(n_splits=5) 
    kf.get_n_splits(X)
    #print(kf)
    #cross val split with score and r2
    for train_index, test_index in kf.split(X):
    #print('TRAIN:', train_index, 'TEST:', test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    
    model2 = Ridge(alpha=0.5)
    # Fit the model
    model2.fit(X_train, y_train)
    y_pred = model2.predict(X_test)
    
    spearman = spearmanr(y_test, y_pred)
    pearson = pearsonr(y_test, y_pred)
    print(f'Test data Spearman correlation: {spearman[0]:.3}')
    print(f'Test data pearson correlation: {pearson[0]:.3}')
    print('MSE')
    print(metrics.mean_squared_error(y_test, y_pred)) #mean square error
    print('R2')
    print(metrics.r2_score(y_test, y_pred)) #r2 score
    print('MAE')
    print(metrics.mean_absolute_error(y_test, y_pred)) #mae
    print('Variance Score')
    print(metrics.explained_variance_score(y_test, y_pred)) 

In [26]:
class UsableEncoder:

    def __init__(self, loc="./saved_models/skip-best"):
        print("Preparing the DataLoader. Loading the word dictionary")
        self.d = DataLoader(sentences=[''], word_dict=load_dictionary('./data/new_data.txt.pkl'))
        self.encoder = None

        print("Loading encoder from the saved model at {}".format(loc))
        model = UniSkip()
        model.load_state_dict(torch.load(loc, map_location=lambda storage, loc: storage))
        self.encoder = model.encoder
        if USE_CUDA:
            self.encoder.cuda(CUDA_DEVICE)

    def encode(self, text):
        def chunks(l, n):
            """Yield successive n-sized chunks from l."""
            for i in range(0, len(l), n):
                yield l[i:i + n]

        ret = []

        for chunk in chunks(text, 100):
            print("encoding chunk of size {}".format(len(chunk)))
            indices = [self.d.convert_sentence_to_indices(sentence) for sentence in chunk]
            indices = torch.stack(indices)
            indices, _ = self.encoder(indices)
            indices = indices.view(-1, self.encoder.thought_size)
            indices = indices.data.cpu().numpy()

            ret.extend(indices)
        ret = np.array(ret)

        return ret

In [27]:
t_data = UsableEncoder()

Preparing the DataLoader. Loading the word dictionary
Making reverse dictionary
Loading encoder from the saved model at ./saved_models/skip-best


In [110]:
cleaned_data = pd.read_csv('/Users/rithika/Documents/247ai/data/Serenata_x.csv', names=['Copy'], header=None)
cleaned_data = cleaned_data['Copy']

In [111]:
X = t_data.encode(cleaned_data)

encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
encoding chunk of size 100
e

In [112]:
result = pd.read_csv('/Users/rithika/Documents/247ai/data/Serenata_y.csv', names=['Clicks'], header=None)
result = result['Clicks']

In [113]:
y = np.array(result)

In [114]:
RF_Regressor(X, y)

Test data Spearman correlation: -0.0366
Test data pearson correlation: 0.0275
MSE
187.27770068995352
R2
-0.06346167286077575
MAE
5.644443405533249
Variance Score
-0.0014169070514087334


In [115]:
ridge_regression(X, y)

Test data Spearman correlation: 0.152
Test data pearson correlation: 0.00301
MSE
342.6185796682807
R2
-0.9455692084260989
MAE
12.130887224986036
Variance Score
-0.8589084411961652
