In [36]:
import pandas as pd
import random
import os
import numpy as np
import string
import re
import pickle
import time
import matplotlib.pyplot as plt
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import Imputer 
from sklearn.ensemble import RandomForestRegressor 
from sklearn import model_selection
from sklearn import metrics
from textblob import TextBlob, Word
from sklearn.externals import joblib
from scipy.stats import spearmanr, pearsonr
from sklearn.manifold import TSNE
%matplotlib inline
from bokeh.io import push_notebook, output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, LabelSet
import gensim
import splitter
from sklearn.linear_model import Ridge

In [2]:
wordnet_lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')

In [3]:
#loading googlenews vec
model = gensim.models.KeyedVectors.load_word2vec_format('/Users/rithika/Documents/247ai/GoogleNews-vectors-negative300.bin', binary=True)

In [4]:
# make sure to use tensor flow 1.4.0 because the latest version(1.8.0) is not compatible with keras 2.1.5
# dont use conda for tf and k installation

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.recurrent import GRU
from keras.layers.embeddings import Embedding
from keras.layers import Input
from keras.layers.wrappers import TimeDistributed
from keras.layers.core import Dense, Activation, Flatten
from keras.utils.np_utils import np as np
from keras.models import Model
from keras import optimizers

Using TensorFlow backend.
  return f(*args, **kwds)


# preprocess data

In [6]:
#cleaning input data to get predictor values and target
def prepare_data(fname):
    filename = os.path.join('/Users/rithika/Documents/247ai/datasets', fname)
    if os.path.isfile(filename):
        dat = pd.read_csv(filename)
    else:
        print("no such file exists at this time")
    dat['Adcopy']=dat['Headline 1']+' '+dat['Headline 2']+' '+dat['Description']
    #dat['Adcopy'] = dat['Description']
    dat = dat[['Adcopy','Clicks']]
    dat = dat[pd.notnull(dat['Adcopy'])]
    dat['Adcopy'] = dat['Adcopy'].replace('http\S+|www.\S+', '', regex=True)
    dat['Adcopy'].replace('[™!®"#\'©()*+,-./:;<=>?@\&[\]^_`{|}~’”“′‘\\\%0123456789£]',' ',inplace=True,regex=True)
    dat['Adcopy'].replace('  ',' ',inplace=True,regex=True)
    dat['Adcopy'].replace('   ',' ',inplace=True,regex=True)
    dat['Adcopy'].replace('  ',' ',inplace=True,regex=True)
    dat['Adcopy'] = dat['Adcopy'].str.lower()
    dat['Adcopy'] = dat['Adcopy'].apply(lambda x: ' '.join([wordnet_lemmatizer.lemmatize(str(word)) for word in x.split()]))
    dat['Copy'] = dat['Adcopy'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    data = dat[['Copy','Clicks']]
    datum = data.groupby('Copy')['Clicks'].mean().reset_index()
    print(len(datum))
    return datum

In [7]:
#prints words from the dataset which is not present in google news vec(simply for reducing the runtime of preprocessing)
def no_present(da, model):
    #loading googlenews vec
    #model = gensim.models.KeyedVectors.load_word2vec_format('/Users/rithika/Documents/247ai/GoogleNews-vectors-negative300.bin', binary=True)
    nokey = list()
    for sentence in da:
        for word in sentence.split(' '): 
            if word not in model:
                if word not in nokey:
                    nokey.append(word)
    #print(nokey)
    #print(len(nokey))
    return nokey

In [8]:
#splits the compound words
def spli(row, nokey):
    if row in nokey:
        y = splitter.split(row) #compound word splitter
        if y != row and y != '':
            return ' '.join(y)
        else:
            wo = Word(row).correct() #spellcorrector
            return wo
    else:
        return row

In [9]:
#furthur cleans the data and returns the input and output values
def clean_dat(data,model):
    nokey = no_present(data['Copy'],model)
    data['Copy'] = data['Copy'].apply(lambda x: ' '.join([spli(str(word),nokey) for word in x.split()]))
    datu = data.groupby('Copy')['Clicks'].mean().reset_index()
    clean_d = datu['Copy'] #predictor values
    res = datu['Clicks'] #target
    print(len(clean_d))
    print(len(res))
    return clean_d, res, datu

In [10]:
# removes rows which have words from foreign language
def remove_lang(clean_d, datu, model):
    key_not = no_present(clean_d, model)
    ind = []
    count = 0
    for sen in clean_d:
        for word in sen.split(' '):
            if word in key_not:
                if word not in stop:
                    if count not in ind:
                        ind.append(count)
        count+=1
    cl_data = datu.drop(datu.index[ind])  
    cl_data['Copy'] = cl_data['Copy'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    #print(ind)
    clean_data = cl_data['Copy']
    result = cl_data['Clicks']
    print(len(clean_data))
    print(len(result))
    return clean_data, result

In [30]:
def RF_Regressor(X, y):
    #kfold
    kf = model_selection.KFold(n_splits=5) 
    kf.get_n_splits(X)
    #print(kf)
    #creating instance of RFRegressor 
    model1 = RandomForestRegressor(n_estimators=500,max_features='sqrt',n_jobs=-1,min_samples_leaf=60)#cross val split
    for train_index, test_index in kf.split(X):
        #print('TRAIN:', train_index, 'TEST:', test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]    
    #training
    model1.fit(X_train, y_train)
    #evaluating
    y_pred = model1.predict(X_test)
    #monotonic relationship as the relation between the variables is non linear
    spearman = spearmanr(y_test, y_pred)
    pearson = pearsonr(y_test, y_pred)
    print(f'Test data Spearman correlation: {spearman[0]:.3}')
    print(f'Test data pearson correlation: {pearson[0]:.3}')
    print('MSE')
    print(metrics.mean_squared_error(y_test, y_pred)) #mean square error
    print('R2')
    print(metrics.r2_score(y_test, y_pred)) #r2 score
    print('MAE')
    print(metrics.mean_absolute_error(y_test, y_pred)) #mae
    print('Variance Score')
    print(metrics.explained_variance_score(y_test, y_pred)) #mape

In [31]:
def ridge_regression(X, y):
    #kfold
    kf = model_selection.KFold(n_splits=5) 
    kf.get_n_splits(X)
    #print(kf)
    #cross val split with score and r2
    for train_index, test_index in kf.split(X):
    #print('TRAIN:', train_index, 'TEST:', test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    
    model2 = Ridge(alpha=0.5)
    # Fit the model
    model2.fit(X_train, y_train)
    y_pred = model2.predict(X_test)
    
    spearman = spearmanr(y_test, y_pred)
    pearson = pearsonr(y_test, y_pred)
    print(f'Test data Spearman correlation: {spearman[0]:.3}')
    print(f'Test data pearson correlation: {pearson[0]:.3}')
    print('MSE')
    print(metrics.mean_squared_error(y_test, y_pred)) #mean square error
    print('R2')
    print(metrics.r2_score(y_test, y_pred)) #r2 score
    print('MAE')
    print(metrics.mean_absolute_error(y_test, y_pred)) #mae
    print('Variance Score')
    print(metrics.explained_variance_score(y_test, y_pred)) 

In [280]:
data = prepare_data('Wargaming_2018-06-25_2018-07-24.csv')

362


In [281]:
key1 = list()
for sentence in data['Copy']:
    for word in sentence.split(' '): 
        if word not in model:
            if word not in key1:
                key1.append(word)
print(len(key1))
print(key1)

392
['batailles', 'historiques', 'obtenez', 'téléchargez', 'combattez', 'büyük', 'savaşları', 'burada', 'indir', 'gerçek', 'gücü', 'tadın', 'savaşında', 'şeref', 'kazanın', 'hemen', 'savaşmaya', 'başlayın', 'légendaires', 'jouez', 'blindés', 'améliorés', 'engins', 'revisitées', 'réalistes', 'superbe', 'dizaines', 'maîtriser', 'choisissez', 'vraies', 'superbes', 'graphismes', 'devenez', 'centaines', 'véhicules', 'quête', 'ekte', 'utfordring', 'ferdigheter', 'før', 'seier', 'episke', 'tankskamper', 'utrolig', 'grafikk', 'episk', 'lydspor', 'fuelled', 'wwii', 'samle', 'tankskrigføring', 'aldri', 'kamper', 'hundrevis', 'spille', 'krigføring', 'taktikken', 'ditt', 'gratisspillet', 'trenger', 'forbedrede', 'krigsmaskiner', 'gjenskapte', 'oppdatering', 'savaşı', 'edin', 'hiç', 'görmediğiniz', 'şekilde', 'yüzlerce', 'tankla', 'kalitesinde', 'savaşın', 'dusinvis', 'mestre', 'tankmen', 'gratuits', 'luttez', 'gratuitement', 'prenez', 'perdez', 'fantasyspill', 'begynn', 'kampen', 'historiske', 'op

In [264]:
cleaned_data = data['Copy']
result = data['Clicks']

In [282]:
#split and correct
clean_d, res, datu = clean_dat(data,model)

362
362


In [283]:
key2 = list()
for sentence in clean_d:
    for word in sentence.split(' '): 
        if word not in model:
            if word not in key2:
                key2.append(word)
print(len(key2))
print(key2)

302
['arshin', 'voyna', 'gelişmiş', 'araçları', 'güncellemede', 'téléchargez', 'büyük', 'savaşları', 'a', 'gücü', 'savaşında', 'savaşmaya', 'başlayın', 'légendaires', 'améliorés', 'revisitées', 'réalistes', 'maîtriser', 'choisissez', 'devriez', 'utfordring', 'sevier', 'fiske', 'utrolig', 'grafikk', 'lydspor', 'tankskrigføring', 'kammer', 'krigsmaskiner', 'gjenskapte', 'krigføring', 'taktikken', 'görmediğiniz', 'şekilde', 'yüzlerce', 'savaşın', 'gratuitement', 'of', 'самая', 'онлайн', 'игра', 'время', 'играть', 'более', 'танков', 'современная', 'графика', 'реалистичные', 'карты', 'бой', 'to', 'historiebøkene', 'хардкору', 'сражайся', 'играй', 'картах', 'новой', 'графикой', 'под', 'новый', 'саундтрек', 'krigføringsspill', 'nyoppdatering', 'utilised', 'piquet', 'новому', 'оцени', 'новую', 'графику', 'музыку', 'окунись', 'танковые', 'сражения', 'получи', 'яркие', 'впечатления', 'бесплатная', 'танковая', 'миллионов', 'игроков', 'уже', 'играют', 'присоединяйся', 'к', 'боевому', 'братству', '

In [186]:
cleaned_data = clean_d
result = res

In [284]:
cleaned_data, result = remove_lang(clean_d, datu, model)

266
266


In [285]:
key3 = list()
for sentence in cleaned_data:
    for word in sentence.split(' '): 
        if word not in model:
            if word not in key3:
                key3.append(word)
print(len(key3))
print(key3)

0
[]


In [286]:
len(cleaned_data)

266

In [287]:
len(result)

266

In [288]:
cleaned_data = cleaned_data.tolist() #predictor values

# preprocess for keras

In [289]:
num_words=2000
maxlen=20
tokenizer = Tokenizer(num_words = num_words, split=' ')
tokenizer.fit_on_texts(cleaned_data)
seqs = tokenizer.texts_to_sequences(cleaned_data)
pad_seqs = []
for i in seqs:
    if len(i)>4:
        pad_seqs.append(i)
pad_seqs = pad_sequences(pad_seqs,maxlen)

In [290]:
pad_seqs

array([[  0,   0,  45, ...,  20,   9,   8],
       [  0,   0,  45, ...,  13,  26,  29],
       [  0,  45,   2, ...,  20,   9,   8],
       ..., 
       [  0,   0,   0, ...,  10,  11,  33],
       [  0,   0,   0, ...,  10,   9,   8],
       [  0,   0,   0, ..., 225,   1,  41]], dtype=int32)

In [291]:
pad_seqs.shape

(266, 20)

In [292]:
x_skip = []
y_before = []
y_after = []
for i in tqdm(range(1,len(seqs)-1)):
    if len(seqs[i])>4:
        x_skip.append(pad_seqs[i].tolist())
        y_before.append(pad_seqs[i-1].tolist())
        y_after.append(pad_seqs[i+1].tolist())
x_before = np.matrix([[0]+i[:-1] for i in y_before])
x_after = np.matrix([[0]+i[:-1] for i in y_after])
x_skip = np.matrix(x_skip)
y_before = np.matrix(y_before)
y_after = np.matrix(y_after)

100%|██████████| 264/264 [00:00<00:00, 90568.97it/s]


In [293]:
x_skip.shape  

(264, 20)

In [294]:
y_before.shape 

(264, 20)

In [295]:
y_after.shape 

(264, 20)

# seq2seq architecture to extract sentence embeddings

In [296]:
embed_dim = 150
latent_dim = 128
batch_size = 64
# Encoder Model 
encoder_inputs = Input(shape=(maxlen,), name='Enc-In')
emb_layer = Embedding(num_words, embed_dim,input_length = maxlen, name='Embedding', mask_zero=False)
x = emb_layer(encoder_inputs)
_, state_h = GRU(latent_dim, return_state=True, name='Enc-Last-GRU')(x)
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Enc-Model')
seq2seq_encoder_out = encoder_model(encoder_inputs)
# Decoder Model 
decoder_inputs_before = Input(shape=(None,), name='Dec-In-before')  # for teacher forcing
dec_emb_before = emb_layer(decoder_inputs_before)
decoder_gru_before = GRU(latent_dim, return_state=True, return_sequences=True, name='Dec-GRU-before')
decoder_gru_output_before, _ = decoder_gru_before(dec_emb_before, initial_state=seq2seq_encoder_out)
decoder_dense_before = Dense(num_words, activation='softmax', name='Final-Out-Dense-before')
decoder_outputs_before = decoder_dense_before(decoder_gru_output_before)
decoder_inputs_after = Input(shape=(None,), name='Dec-In-after')  # for teacher forcing
dec_emb_after = emb_layer(decoder_inputs_after)
decoder_gru_after = GRU(latent_dim, return_state=True, return_sequences=True, name='Dec-GRU-after')
decoder_gru_output_after, _ = decoder_gru_after(dec_emb_after, initial_state=seq2seq_encoder_out)
decoder_dense_after = Dense(num_words, activation='softmax', name='Final-Out-Dense-after')
decoder_outputs_after = decoder_dense_after(decoder_gru_output_after)
# Seq2Seq Model 
seq2seq_Model = Model([encoder_inputs, decoder_inputs_before,decoder_inputs_after], [decoder_outputs_before,decoder_outputs_after])
seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')
seq2seq_Model.summary()
history = seq2seq_Model.fit([x_skip,x_before, x_after], [np.expand_dims(y_before, -1),np.expand_dims(y_after, -1)],
          batch_size=batch_size,
          epochs=10,
          validation_split=0.12)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Dec-In-before (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Enc-In (InputLayer)             (None, 20)           0                                            
__________________________________________________________________________________________________
Dec-In-after (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
Embedding (Embedding)           (None, 20, 150)      300000      Dec-In-before[0][0]              
                                                                 Dec-In-after[0][0]               
__________

# Feature extraction

In [297]:
sen_rep = tokenizer.texts_to_sequences(cleaned_data)
sen_rep = pad_sequences(sen_rep,maxlen=maxlen)
x = encoder_model.predict(sen_rep)

In [298]:
y = np.array(result)

# evaluation

In [299]:
RF_Regressor(x, y)

Test data Spearman correlation: -0.0293
Test data pearson correlation: 0.00497
MSE
188.082230858
R2
-0.0225713641569
MAE
8.27915081639
Variance Score
-0.00789840212195


In [300]:
ridge_regression(x, y)

Test data Spearman correlation: -0.13
Test data pearson correlation: -0.0162
MSE
186.976289043
R2
-0.0165585450529
MAE
8.30819606191
Variance Score
-0.00211053468485


In [28]:
#cross val score
#scores = model_selection.cross_val_score(model1, x, y, cv = 5, scoring ='neg_mean_squared_error')
#print(scores)

[-0.66155065 -1.26285795 -0.76557002 -0.33289663 -0.77377529]


# results on unseen data

In [29]:
unseen_data = ['get instant decision']
unseen_data1 = ['decision get instant']

In [36]:
#extract feature 
sen_rep1 = tokenizer.texts_to_sequences(unseen_data)
sen_rep1 = pad_sequences(sen_rep1,maxlen=maxlen)
new_v = encoder_model.predict(sen_rep1)

In [37]:
pred = model1.predict(new_v)

In [38]:
pred.tolist()

[12.13641402192663]

In [39]:
#extract feature 
sen_rep2 = tokenizer.texts_to_sequences(unseen_data1)
sen_rep2 = pad_sequences(sen_rep2,maxlen=maxlen)
new_v1 = encoder_model.predict(sen_rep2)

In [40]:
pred1 = model1.predict(new_v1)

In [41]:
pred1.tolist()

[2.699305266855843]