In [1]:
import pandas as pd
import random
import os
import numpy as np
import string
import re
import pickle
import time
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import Imputer 
from sklearn.ensemble import RandomForestRegressor 
from sklearn import model_selection
from sklearn import metrics
from textblob import TextBlob, Word
from sklearn.externals import joblib
from scipy.stats import spearmanr, pearsonr
from sklearn.manifold import TSNE
%matplotlib inline
from bokeh.io import push_notebook, output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, LabelSet
import gensim
import splitter
from sklearn.linear_model import Ridge

In [2]:
from xgboost import XGBRegressor

In [2]:
import autosklearn.regression

In [3]:
import torch

In [4]:
wordnet_lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')

In [5]:
#loading googlenews vec
model = gensim.models.KeyedVectors.load_word2vec_format('/Users/rithika/Documents/247ai/GoogleNews-vectors-negative300.bin', binary=True)

In [6]:
#cleaning input data to get predictor values and target
def prepare_data(fname):
    filename = os.path.join('/Users/rithika/Documents/247ai/datasets', fname)
    if os.path.isfile(filename):
        dat = pd.read_csv(filename)
    else:
        print("no such file exists at this time")
    dat['Adcopy']=dat['Headline 1']+' '+dat['Headline 2']+' '+dat['Description']
    #dat['Adcopy'] = dat['Description']
    dat = dat[['Adcopy','Clicks']]
    dat = dat[pd.notnull(dat['Adcopy'])]
    dat['Adcopy'] = dat['Adcopy'].replace('http\S+|www.\S+', '', regex=True)
    dat['Adcopy'].replace('[™!®"#\'©()*+,-./:;<=>?@\&[\]^_`{|}~’”“′‘\\\%0123456789£]',' ',inplace=True,regex=True)
    dat['Adcopy'].replace('  ',' ',inplace=True,regex=True)
    dat['Adcopy'].replace('   ',' ',inplace=True,regex=True)
    dat['Adcopy'].replace('  ',' ',inplace=True,regex=True)
    dat['Adcopy'] = dat['Adcopy'].str.lower()
    dat['Adcopy'] = dat['Adcopy'].apply(lambda x: ' '.join([wordnet_lemmatizer.lemmatize(str(word)) for word in x.split()]))
    dat['Copy'] = dat['Adcopy'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    data = dat[['Copy','Clicks']]
    datum = data.groupby('Copy')['Clicks'].mean().reset_index()
    print(len(datum))
    return datum

In [7]:
#prints words from the dataset which is not present in google news vec(simply for reducing the runtime of preprocessing)
def no_present(da, model):
    #loading googlenews vec
    #model = gensim.models.KeyedVectors.load_word2vec_format('/Users/rithika/Documents/247ai/GoogleNews-vectors-negative300.bin', binary=True)
    nokey = list()
    for sentence in da:
        for word in sentence.split(' '): 
            if word not in model:
                if word not in nokey:
                    nokey.append(word)
    #print(nokey)
    #print(len(nokey))
    return nokey

In [8]:
#splits the compound words
def spli(row, nokey):
    if row in nokey:
        y = splitter.split(row) #compound word splitter
        if y != row and y != '':
            return ' '.join(y)
        else:
            wo = Word(row).correct() #spellcorrector
            return wo
    else:
        return row

In [9]:
#furthur cleans the data and returns the input and output values
def clean_dat(data,model):
    nokey = no_present(data['Copy'],model)
    data['Copy'] = data['Copy'].apply(lambda x: ' '.join([spli(str(word),nokey) for word in x.split()]))
    datu = data.groupby('Copy')['Clicks'].mean().reset_index()
    clean_d = datu['Copy'] #predictor values
    res = datu['Clicks'] #target
    print(len(clean_d))
    print(len(res))
    return clean_d, res, datu

In [10]:
# removes rows which have words from foreign language
def remove_lang(clean_d, datu, model):
    key_not = no_present(clean_d, model)
    ind = []
    count = 0
    for sen in clean_d:
        for word in sen.split(' '):
            if word in key_not:
                if word not in stop:
                    if count not in ind:
                        ind.append(count)
        count+=1
    cl_data = datu.drop(datu.index[ind])  
    cl_data['Copy'] = cl_data['Copy'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    #print(ind)
    clean_data = cl_data['Copy']
    result = cl_data['Clicks']
    print(len(clean_data))
    print(len(result))
    return clean_data, result

In [11]:
data = prepare_data('HP_2018-06-24_2018-07-23.csv')

1413


In [14]:
#split and correct
clean_d, res, datu = clean_dat(data,model)

1413
1413


In [15]:
cleaned_data, result = remove_lang(clean_d, datu, model)

1412
1412


In [16]:
from models import InferSent
V = 2
MODEL_PATH = '/Users/rithika/Documents/InferSent/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = '/Users/rithika/Documents/InferSent/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)
infersent.build_vocab(cleaned_data, tokenize=False)
embeddings = infersent.encode(cleaned_data, tokenize=False) #True)
#print('nb sentences encoded : {0}'.format(len(embeddings)))
X = embeddings
y = np.array(result)

Found 1201(/1201) words with w2v vectors
Vocab size : 1201


In [17]:
#y = np.array(result)

In [19]:
#y[np.isnan(y)] = 0

In [21]:
X.shape

(1412, 4096)

In [22]:
y.shape

(1412,)

In [16]:
def RF_Regressor(X, y):
    #kfold
    kf = model_selection.KFold(n_splits=5) 
    kf.get_n_splits(X)
    #print(kf)
    #creating instance of RFRegressor 
    model1 = RandomForestRegressor(n_estimators=500,max_features='sqrt',n_jobs=-1,min_samples_leaf=60)#cross val split
    for train_index, test_index in kf.split(X):
        #print('TRAIN:', train_index, 'TEST:', test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]    
    #training
    model_reg = model1.fit(X_train, y_train)
   
    #evaluating
    y_pred = model1.predict(X_test)
    #monotonic relationship as the relation between the variables is non linear
    spearman = spearmanr(y_test, y_pred)
    pearson = pearsonr(y_test, y_pred)
    print(f'Test data Spearman correlation: {spearman[0]:.3}')
    print(f'Test data pearson correlation: {pearson[0]:.3}')
    print('MSE')
    print(metrics.mean_squared_error(y_test, y_pred)) #mean square error
    print('R2')
    print(metrics.r2_score(y_test, y_pred)) #r2 score
    print('MAE')
    print(metrics.mean_absolute_error(y_test, y_pred)) #mae
    print('Variance Score')
    print(metrics.explained_variance_score(y_test, y_pred)) #mape
    
    return model_reg

In [17]:
model_reg = RF_Regressor(X, y)

Test data Spearman correlation: 0.363
Test data pearson correlation: 0.271
MSE
0.0853056340767
R2
0.064610387821
MAE
0.181031082325
Variance Score
0.0715130935883


In [23]:
kf = model_selection.KFold(n_splits=5) 
kf.get_n_splits(X)
    #print(kf)
    #creating instance of RFRegressor 
model1 = autosklearn.regression.AutoSklearnRegressor(
        time_left_for_this_task=120,
        per_run_time_limit=30,
        tmp_folder='/tmp/autosklearn_regression_example_tmp',
        output_folder='/tmp/autosklearn_regression_example_out', 
    )
#model1 = XGBRegressor(n_estimators=500,max_features='sqrt',n_jobs=-1,min_samples_leaf=60)#cross val split
for train_index, test_index in kf.split(X):
    #print('TRAIN:', train_index, 'TEST:', test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]    
#training
model_reg = model1.fit(X_train, y_train)

#evaluating
y_pred = model1.predict(X_test)
#monotonic relationship as the relation between the variables is non linear
spearman = spearmanr(y_test, y_pred)
pearson = pearsonr(y_test, y_pred)
print(f'Test data Spearman correlation: {spearman[0]:.3}')
print(f'Test data pearson correlation: {pearson[0]:.3}')
print('MSE')
print(metrics.mean_squared_error(y_test, y_pred)) #mean square error
print('R2')
print(metrics.r2_score(y_test, y_pred)) #r2 score
print('MAE')
print(metrics.mean_absolute_error(y_test, y_pred)) #mae
print('Variance Score')
print(metrics.explained_variance_score(y_test, y_pred)) #mape

  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)




  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)




  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)




  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)




  Y_train_pred = np.nanmean(Y_train_pred_full, axis=0)


Test data Spearman correlation: 0.199
Test data pearson correlation: 0.244
MSE
1903586.39665
R2
0.00733944417413
MAE
223.734656935
Variance Score
0.00859159023699


In [25]:
model_reg.cv_results_

{'mean_test_score': array([ 0.        , -0.72221672, -0.60320736,  0.        , -5.33345774,
         0.03341018,  0.        ]),
 'mean_fit_time': array([ 30.00977492,  19.44862795,   1.62761712,  30.01087308,
          1.68943119,   1.946347  ,  19.01108503]),
 'params': [{'categorical_encoding:__choice__': 'one_hot_encoding',
   'imputation:strategy': 'mean',
   'preprocessor:__choice__': 'no_preprocessing',
   'regressor:__choice__': 'random_forest',
   'rescaling:__choice__': 'standardize',
   'categorical_encoding:one_hot_encoding:use_minimum_fraction': 'True',
   'regressor:random_forest:bootstrap': 'True',
   'regressor:random_forest:criterion': 'mse',
   'regressor:random_forest:max_depth': 'None',
   'regressor:random_forest:max_features': 1.0,
   'regressor:random_forest:max_leaf_nodes': 'None',
   'regressor:random_forest:min_impurity_decrease': 0.0,
   'regressor:random_forest:min_samples_leaf': 1,
   'regressor:random_forest:min_samples_split': 2,
   'regressor:random_fores

In [27]:
model_reg.show_models()

"[(1.000000, SimpleRegressionPipeline({'categorical_encoding:__choice__': 'no_encoding', 'imputation:strategy': 'median', 'preprocessor:__choice__': 'fast_ica', 'regressor:__choice__': 'liblinear_svr', 'rescaling:__choice__': 'normalize', 'preprocessor:fast_ica:algorithm': 'deflation', 'preprocessor:fast_ica:fun': 'cube', 'preprocessor:fast_ica:whiten': 'True', 'regressor:liblinear_svr:C': 0.03582622697487416, 'regressor:liblinear_svr:dual': 'False', 'regressor:liblinear_svr:epsilon': 0.0073192289955103865, 'regressor:liblinear_svr:fit_intercept': 'True', 'regressor:liblinear_svr:intercept_scaling': 1, 'regressor:liblinear_svr:loss': 'squared_epsilon_insensitive', 'regressor:liblinear_svr:tol': 0.0013175607858941854, 'preprocessor:fast_ica:n_components': 119},\ndataset_properties={\n  'task': 4,\n  'sparse': False,\n  'multilabel': False,\n  'multiclass': False,\n  'target_type': 'regression',\n  'signed': False})),\n]"

# Unseen data

In [223]:
unseen_data = ['serenata flower uk free flower gift delivery order pm weekday pm weekend free next day delivery day wk']
unseen_data1 = ['free flower gift delivery serenata flower uk order pm weekday pm weekend free next day delivery day wk']


In [224]:
new_v = infersent.encode(unseen_data, tokenize=False)
pred = model_reg.predict(new_v)
print(pred.tolist())
new_v1 = infersent.encode(unseen_data1, tokenize=False)
pred1 = model_reg.predict(new_v1)
print(pred1.tolist())

[0.12315510593722102]
[0.1191602826834383]


In [225]:
unseen_data = ['flower delivery free day week year around order pm free next day delivery uk best reviewed online florist']
unseen_data1 = ['flower delivery costless day week year around order pm free next day delivery uk best reviewed online florist']


In [226]:
new_v = infersent.encode(unseen_data, tokenize=False)
pred = model_reg.predict(new_v)
print(pred.tolist())
new_v1 = infersent.encode(unseen_data1, tokenize=False)
pred1 = model_reg.predict(new_v1)
print(pred1.tolist())

[0.09888561200462491]
[0.10159159526953461]


In [227]:
unseen_data = ['serenata flowers uk top reviewed uk online flower order pm weekday pm weekend free next day delivery']
unseen_data1 = ['top reviewed uk online flower serenata flowers uk order pm weekday pm weekend free next day delivery']


In [228]:
new_v = infersent.encode(unseen_data, tokenize=False)
pred = model_reg.predict(new_v)
print(pred.tolist())
new_v1 = infersent.encode(unseen_data1, tokenize=False)
pred1 = model_reg.predict(new_v1)
print(pred1.tolist())

[0.11310536117895052]
[0.11783531540165926]


In [229]:
unseen_data = ['free next day flower delivery day wk including sunday order pm weekday pm weekend free next day delivery']
unseen_data1 = ['next day flower delivery free day wk including sunday order pm weekday pm weekend free next day delivery']


In [230]:
new_v = infersent.encode(unseen_data, tokenize=False)
pred = model_reg.predict(new_v)
print(pred.tolist())
new_v1 = infersent.encode(unseen_data1, tokenize=False)
pred1 = model_reg.predict(new_v1)
print(pred1.tolist())

[0.11566598563481041]
[0.11614226350752684]


In [231]:
unseen_data = ['serenata flower uk free flower gift delivery order pm weekday pm weekend free next day delivery day wk']
unseen_data1 = ['free flower gift delivery serenata flower uk buy pm weekday pm weekend free next day delivery day wk']


In [232]:
new_v = infersent.encode(unseen_data, tokenize=False)
pred = model_reg.predict(new_v)
print(pred.tolist())
new_v1 = infersent.encode(unseen_data1, tokenize=False)
pred1 = model_reg.predict(new_v1)
print(pred1.tolist())

[0.12315510593722101]
[0.12047514624688242]


In [233]:
unseen_data = ['flower uk free flower delivery day wk order pm free next day delivery uk best reviewed online florist']
unseen_data1 = ['flower united kingdom free flower delivery day wk order pm free next day delivery uk best reviewed online florist']  


In [234]:
new_v = infersent.encode(unseen_data, tokenize=False)
pred = model_reg.predict(new_v)
print(pred.tolist())
new_v1 = infersent.encode(unseen_data1, tokenize=False)
pred1 = model_reg.predict(new_v1)
print(pred1.tolist())

[0.11250085054271129]
[0.1236058772481419]


In [235]:
unseen_data = ['flower delivery free day week week year order pm weekday pm sun pm sat free next day delivery']
unseen_data1 = ['complimentary flower delivery day week week year free next day delivery buy pm weekday pm sun pm sat']


In [236]:
new_v = infersent.encode(unseen_data, tokenize=False)
pred = model_reg.predict(new_v)
print(pred.tolist())
new_v1 = infersent.encode(unseen_data1, tokenize=False)
pred1 = model_reg.predict(new_v1)
print(pred1.tolist())

[0.11881344865407327]
[0.13275670325946537]


In [237]:
unseen_data = ['free next day flower delivery day per week incl sunday order pm weekday pm weekend free next day flower delivery uk'] 
unseen_data1 = ['free next day flower delivery day per week incl weekend order pm weekday pm weekend free next day flower delivery uk']

In [238]:
new_v = infersent.encode(unseen_data, tokenize=False)
pred = model_reg.predict(new_v)
print(pred.tolist())
new_v1 = infersent.encode(unseen_data1, tokenize=False)
pred1 = model_reg.predict(new_v1)
print(pred1.tolist())

[0.13085728985341927]
[0.12997964172876414]
