Enable dynamic GPU memory growth

In [1]:
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
# Build a TF1-compat session configuration and turn on allow_growth so GPU
# memory is requested as needed rather than reserved up front.
config = ConfigProto()
config.gpu_options.allow_growth = True
# Open an interactive session with that configuration; `session` is kept at
# module level so the session object stays referenced.
session = InteractiveSession(config=config)

Mount Drive

In [2]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/gdrive.
drive.mount('/content/gdrive')
import os
# Work from the project folder so the relative data and embedding file names
# used later in the notebook resolve against it.
os.chdir('/content/gdrive/My Drive/mads_thesis')
# Echo the working directory to confirm the chdir took effect.
!pwd

Mounted at /content/gdrive
/content/gdrive/My Drive/mads_thesis


Import libraries

In [3]:
import numpy as np
import sys
from keras import Model, layers
from keras.utils import pad_sequences
from gensim.models.keyedvectors import load_word2vec_format
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf

Check for a GPU device and disable NumPy print truncation

In [4]:
# Ask TensorFlow which GPU device it can see and print the answer explicitly:
# a notebook cell only auto-displays its last expression, so a bare call on the
# first line would show nothing. An empty string means no GPU was found.
gpu_device_name = tf.test.gpu_device_name()
print(f'GPU device: {gpu_device_name!r}')
# Raise the summarisation threshold so NumPy prints arrays in full instead of
# abbreviating them with "...".
np.set_printoptions(threshold=sys.maxsize)

Create CNN model

In [5]:
def cnn_model(embeddings, model_args, train_x, train_control, train_y, dev_x, dev_control, dev_y):
    """Build, compile and train a text CNN with extra control features.

    Token ids pass through a frozen pretrained embedding layer, dropout, one
    Conv1D layer and global max pooling. The pooled vector is concatenated with
    the control features and fed through a small dense head that ends in a
    single sigmoid unit.

    Args:
        embeddings: Pretrained embeddings object; its ``vectors`` attribute is
            used as the (vocabulary, dimension) embedding weight matrix.
        model_args: Dict with the keys 'num_filters', 'kernel_size', 'epochs'
            and 'batch_size'.
        train_x: 2-D array (samples, sequence_length) of token indices into
            the embedding matrix.
        train_control: 2-D array (samples, n_controls) of control features.
        train_y: Binary training targets.
        dev_x, dev_control, dev_y: Validation counterparts of the training
            inputs, passed to ``fit`` as ``validation_data``.

    Returns:
        The trained Keras ``Model``.

    Side effects:
        Prints the model summary and writes the architecture diagram to
        'cnn_model.png' in the working directory.
    """
    embedding_matrix = embeddings.vectors
    # The token input length is the padded sequence length of the data, not
    # the embedding dimension; the two only coincide by accident.
    sequence_length = train_x.shape[1]
    control_shape = train_control.shape[1]

    cnn_input = layers.Input(shape=(sequence_length,), name='cnn_input')
    control_input = layers.Input(shape=(control_shape,), name='control_input')

    # Pretrained vectors are loaded as fixed weights (trainable=False).
    embedding = layers.Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        input_length=sequence_length,
        weights=[embedding_matrix],
        trainable=False)(cnn_input)
    x = layers.Dropout(0.5)(embedding)
    x = layers.Conv1D(
        model_args['num_filters'], model_args['kernel_size'], activation='relu')(x)
    x = layers.GlobalMaxPooling1D()(x)
    # Join the pooled text features with the control features per sample.
    z = layers.Concatenate(axis=1)([x, control_input])
    z = layers.Dense(10, activation='relu')(z)
    z = layers.Dropout(0.5)(z)
    z = layers.Dense(1, activation='sigmoid')(z)

    model = Model(inputs=[cnn_input, control_input], outputs=[z])
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['binary_accuracy'])
    # summary() prints the table itself and returns None, so it is not wrapped
    # in print().
    model.summary()

    model.fit(x=[train_x, train_control], y=train_y,
              epochs=model_args['epochs'],
              verbose=True,
              validation_data=([dev_x, dev_control], dev_y),
              batch_size=model_args['batch_size'])
    tf.keras.utils.plot_model(model, to_file='cnn_model.png', show_shapes=True)
    return model

Load pretrained GoogleNews word2vec embeddings

In [6]:
# Load the pretrained GoogleNews word2vec vectors from the gzipped binary file
# in the working directory; the result is passed to cnn_model as `embeddings`.
google_embeddings = load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

Load and format data

In [9]:
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source (e.g. this project's own preprocessing step).
train_data = pd.read_pickle('Video_Games_final_train.pkl.gz')
dev_data = pd.read_pickle('Video_Games_final_dev.pkl.gz')
test_data = pd.read_pickle('Video_Games_final_test.pkl.gz')
op_data = pd.read_pickle('Office_Products_final_half.pkl.gz')
tg_data = pd.read_pickle('Toys_&_Games_final_half.pkl.gz')

# Mapping from the textual helpfulness label to the binary target.
labels = {'Unhelpful': 0, 'Helpful': 1}
# Columns fed to the model's control input, in this order.
control_columns = ['review_time_scl', 'rating_scl', 'richness_scl']


def to_model_arrays(data):
    """Split one review DataFrame into the arrays the model consumes.

    Args:
        data: DataFrame with the columns 'vectorized_text', 'helpfulness' and
            every column listed in ``control_columns``.

    Returns:
        Tuple ``(x, control, y)``: the stacked 'vectorized_text' entries, the
        control columns as an array, and the targets mapped through ``labels``.

    Raises:
        KeyError: If a 'helpfulness' value is not a key of ``labels``.
    """
    x = np.array(data['vectorized_text'].tolist())
    control = np.array(data[control_columns])
    y = np.array([labels[i] for i in data['helpfulness']])
    return x, control, y


train_x, train_control, train_y = to_model_arrays(train_data)
dev_x, dev_control, dev_y = to_model_arrays(dev_data)
test_x, test_control, test_y = to_model_arrays(test_data)
op_x, op_control, op_y = to_model_arrays(op_data)
tg_x, tg_control, tg_y = to_model_arrays(tg_data)

Model Parameters

In [10]:
# Hyperparameters consumed by cnn_model: 'epochs' and 'batch_size' are passed
# to model.fit, 'num_filters' and 'kernel_size' configure the Conv1D layer.
model_args = dict(
    epochs=20,
    batch_size=256,
    num_filters=128,
    kernel_size=7,
)

Train Model

In [11]:
# Pin model construction and training to whatever GPU device TensorFlow reports.
gpu_device = tf.test.gpu_device_name()
with tf.device(gpu_device):
    model = cnn_model(
        embeddings=google_embeddings,
        model_args=model_args,
        train_x=train_x,
        train_control=train_control,
        train_y=train_y,
        dev_x=dev_x,
        dev_control=dev_control,
        dev_y=dev_y,
    )

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 cnn_input (InputLayer)         [(None, 300)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 300, 300)     900000000   ['cnn_input[0][0]']              
                                                                                                  
 dropout (Dropout)              (None, 300, 300)     0           ['embedding[0][0]']              
                                                                                                  
 conv1d (Conv1D)                (None, 294, 128)     268928      ['dropout[0][0]']                
                                                                                              

Define the model evaluation function

In [12]:
def evaluate_model(model, test_x, test_control,test_y):
    """Print classification, top-decile-lift and Gini metrics for a model.

    Args:
        model: Trained model whose ``predict`` accepts ``[test_x, test_control]``
            and returns one score per sample in column 0.
        test_x: Token-index inputs for the evaluation set.
        test_control: Control-feature inputs for the evaluation set.
        test_y: Binary ground-truth targets, one per sample.

    Returns:
        None. The scikit-learn classification report, the TDL value and the
        Gini coefficient are printed.
    """
    predict_y = model.predict([test_x, test_control])
    # Rounding the scores gives the hard 0/1 predictions for the report.
    bin_predict_y = np.round(predict_y)
    print(classification_report(test_y, bin_predict_y))

    # One row per sample; built column-wise and positionally (no index alignment).
    deciles = pd.DataFrame({
        'Predicted': np.asarray(predict_y, dtype=float)[:, 0],
        'Observed': np.asarray(test_y),
    })
    # duplicates='drop' keeps qcut from raising when heavily tied scores make
    # several quantile edges identical; fewer than ten bins result in that case.
    deciles['Decile'] = pd.qcut(
        deciles['Predicted'], 10, labels=False, duplicates='drop')
    mean_actual = deciles['Observed'].mean()

    # Highest-scoring bin first, so row 0 is the top decile.
    deciles_mean = deciles.groupby(['Decile']).agg(
        Predicted=('Predicted', 'mean'),
        Observed=('Observed', 'mean')).sort_values('Decile', ascending=False)

    # Lift of a bin = its observed positive rate relative to the overall rate.
    deciles_mean['Lift'] = deciles_mean['Observed'] / mean_actual

    tdl = deciles_mean['Lift'].iloc[0] / deciles_mean['Lift'].mean()
    print(f'\nTDL is {tdl}\n')

    # Gini coefficient derived from the ROC AUC: 2 * AUC - 1.
    gini_auc = roc_auc_score(test_y, predict_y)
    gini = gini_auc*2-1
    print(f'\nGINI is {gini}\n')

Evaluate Model

In [14]:
# Evaluate on the in-domain dev/test splits first, then on the two
# out-of-domain product categories; each entry is (banner, x, control, y).
evaluation_sets = [
    ('#### DEV EVALUATION ####', dev_x, dev_control, dev_y),
    ('#### TEST EVALUATION ####', test_x, test_control, test_y),
    ('#### ROBUST EVALUATION: Office Products ####', op_x, op_control, op_y),
    ('#### ROBUST EVALUATION: Toys & Games ####', tg_x, tg_control, tg_y),
]
for banner, eval_x, eval_control, eval_y in evaluation_sets:
    print(banner)
    evaluate_model(model, eval_x, eval_control, eval_y)

#### DEV EVALUATION ####
              precision    recall  f1-score   support

           0       0.67      0.84      0.75      1266
           1       0.83      0.67      0.74      1542

    accuracy                           0.74      2808
   macro avg       0.75      0.75      0.74      2808
weighted avg       0.76      0.74      0.74      2808


TDL is 1.7302992149311214


GINI is 0.689809094690427

#### TEST EVALUATION ####
              precision    recall  f1-score   support

           0       0.66      0.84      0.74      3164
           1       0.83      0.65      0.73      3855

    accuracy                           0.73      7019
   macro avg       0.74      0.74      0.73      7019
weighted avg       0.75      0.73      0.73      7019


TDL is 1.7455056576785366


GINI is 0.6733178543963299

#### ROBUST EVALUATION: Office Products ####
              precision    recall  f1-score   support

           0       0.31      0.74      0.44      1071
           1       0.94     