<a href="https://colab.research.google.com/github/stazam/DTSE-project/blob/main/DTSE_steam_data_reviews_model_building.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Model building**

We will try various models for predicting the category of a user_review. The types of models we will consider:

1. Bidirectional LSTM with embedding layer 
2. Bidirectional LSTM + CNN layer with embedding layer 
3. BERT - transformer for text classification with pretrained embeddings. 

We also try these models with and without stop-word removal and lemmatization.


In [1]:
import tensorflow as tf
import keras
import sys
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import metrics

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

from keras import applications
from keras.models import Sequential
from keras.layers import Dense, Embedding, Bidirectional
from keras.layers import Convolution2D, MaxPooling2D,BatchNormalization,GlobalAveragePooling1D, Flatten, Dropout
from keras import optimizers

from google.colab import drive
# Mount Google Drive so the data and helper files under /content/drive are readable.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Make the helper module stored on Google Drive importable.
sys.path.append('/content/drive/MyDrive/DTSE-data/Python-files')
# NOTE(review): the wildcard import hides which helper names this notebook relies on;
# consider importing the needed names explicitly.
from help_functions import *

In [3]:
# Load the train and test frames from pickles on Drive.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files you produced yourself.
df_train_merged = pd.read_pickle("/content/drive/MyDrive/DTSE-data/data-files/df_train_merged.pkl")
df_test_merged = pd.read_pickle("/content/drive/MyDrive/DTSE-data/data-files/df_test_merged.pkl")

In [5]:
# Preview the first rows of the training frame.
df_train_merged.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion,developer,publisher,tags,overview
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...


# **Bidirectional LSTM**

Text preprocessing: removing stop words and lemmatization.



In [42]:
# Trial concatenation of every textual column into one string per review.
# Exactly one binary `+` joins each pair of columns; the tag string has its
# first and last character (the surrounding brackets) stripped.
skuska = (
    df_train_merged.title
    + df_train_merged.developer
    + df_train_merged.publisher
    + df_train_merged.tags.apply(lambda x: x[1:-1])
    + df_train_merged.overview
    + df_train_merged.user_review
)

In [24]:
import nltk
# Download the NLTK resources used for stop-word filtering, tokenizing and lemmatizing.
# Each call returns a status flag; the last one is what the cell displays.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [33]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
 

def process_test(x):
  """Expand two contractions, drop English stop words and lemmatize the rest.

  Args:
    x: Raw text as a single string.

  Returns:
    The surviving, lemmatized tokens joined by single spaces.
  """
  lemmatizer = WordNetLemmatizer()

  # Expand contractions before tokenizing. The leading space keeps the
  # expansion a separate word ("you're" -> "you are"), so it can then be
  # removed as a stop word instead of fusing into a bogus token ("youare").
  # NOTE(review): "'s" is also a possessive marker; it is expanded to " is" too.
  x = x.replace("'re", ' are')
  x = x.replace("'s", ' is')

  stop_words = set(stopwords.words('english'))
  word_tokens = word_tokenize(x)

  # Stop-word matching is case-insensitive; the kept tokens retain their case.
  filtered_sentence = [lemmatizer.lemmatize(w) for w in word_tokens if w.lower() not in stop_words]

  return " ".join(filtered_sentence)

In [43]:
# Display the concatenated text Series.
skuska

0        Spooky's Jump Scare MansionLag Studios Lag Stu...
1        Spooky's Jump Scare MansionLag Studios Lag Stu...
2        Spooky's Jump Scare MansionLag Studios Lag Stu...
3        Spooky's Jump Scare MansionLag Studios Lag Stu...
4        Spooky's Jump Scare MansionLag Studios Lag Stu...
                               ...                        
17489    EverQuest IIDaybreak Game Company Daybreak Gam...
17490    EverQuest IIDaybreak Game Company Daybreak Gam...
17491    EverQuest IIDaybreak Game Company Daybreak Gam...
17492    EverQuest IIDaybreak Game Company Daybreak Gam...
17493    EverQuest IIDaybreak Game Company Daybreak Gam...
Length: 17494, dtype: object

In [None]:
# Build the model input text from the tag list (first and last character, i.e.
# the brackets, removed), the game overview and the user's review. Fields are
# joined with a single space so the last word of one field and the first word
# of the next stay separate tokens.
df_train_merged['text'] = (
    df_train_merged.tags.apply(lambda x: x[1:-1])
    + ' ' + df_train_merged.overview
    + ' ' + df_train_merged.user_review
)

# X is the full frame (the text column plus metadata used later); y is the binary target.
X = df_train_merged
y = X.user_suggestion.values
X.head()

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [None]:
# Pull the text column of each split out as a plain array for the tokenizer.
X_train_NN, X_test_NN = X_train["text"].values, X_test["text"].values

In [None]:
# Measure length in whitespace-separated words (not characters), so the numbers
# are comparable with the token-based padding length chosen later.
train_word_counts = [len(text.split()) for text in X_train_NN]

print("The longest sequence is {} words long.".format(max(train_word_counts)))
pd.DataFrame(train_word_counts).describe()

In [None]:
# Tokenizer settings.
vocab_size = 10000
oov_tok = "<OOV>"

# Padding / truncation settings used when sequences are brought to a common length.
max_length = 5000
padding_type = 'post'
trunc_type = 'post'

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(X_train_NN)

vocabulary_message = 'Vocabulary is {} words large'.format(len(tokenizer.word_index))
print(vocabulary_message)

In [None]:
# Order (word, count) pairs by ascending count and show everything from position 47000 on.
# NOTE(review): 47000 is a hard-coded offset; the slice is empty for smaller vocabularies.
word_counts_ascending = sorted(tokenizer.word_counts.items(), key=lambda pair: pair[1])
word_counts_ascending[47000:]

In [None]:
def _texts_to_padded(texts):
    """Convert raw texts to integer sequences and to a fixed-width padded array.

    Args:
        texts: Iterable of raw text strings.

    Returns:
        A pair ``(sequences, padded)``: the list of integer sequences produced
        by the fitted tokenizer, and the array returned by ``pad_sequences``
        with every row padded/truncated to ``max_length``.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    return sequences, padded


# Read the raw texts straight from the split frames rather than from
# X_train_NN / X_test_NN, which this cell overwrites with integer sequences;
# this keeps the cell safe to re-run.
X_train_NN, X_train_padded_NN = _texts_to_padded(X_train.text.values)
X_test_NN, X_test_padded_NN = _texts_to_padded(X_test.text.values)

In [None]:
# Metrics reported during training and validation.
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

# Embedding -> two stacked bidirectional LSTMs -> flatten -> dropout -> sigmoid output.
model = tf.keras.Sequential()

model.add(Embedding(vocab_size, 240, input_length=max_length))
# Both LSTMs return the full sequence, so Flatten receives one vector per time step.
model.add(Bidirectional(keras.layers.LSTM(64, return_sequences=True)))
model.add(Bidirectional(keras.layers.LSTM(32, return_sequences=True)))
model.add(Flatten())
# Dropout must be added to the model to take effect; a bare Dropout(...) expression is discarded.
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)

model.summary()

In [None]:
# Train for a single epoch, reporting metrics on the held-out split after the epoch.
history = model.fit(
    X_train_padded_NN,
    y_train,
    validation_data=(X_test_padded_NN, y_test),
    epochs=1,
    verbose=1,
)

In [None]:
# Model outputs for the training split first, then the held-out split.
X_train_pred, X_test_pred = (
    model.predict(padded_inputs)
    for padded_inputs in (X_train_padded_NN, X_test_padded_NN)
)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Turn the sigmoid outputs into hard 0/1 labels with an explicit 0.5 threshold.
y_test_pred_labels = (np.asarray(X_test_pred).ravel() > 0.5).astype(int)

# Rows are the actual classes and columns the predicted classes, ordered 0 then 1.
# `labels` pins the 2x2 shape even if one class is never predicted.
results = confusion_matrix(y_test, y_test_pred_labels, labels=[0, 1])
print(results)

# fmt='d' shows the cell counts as plain integers.
ax = sns.heatmap(results, annot=True, fmt='d', cmap='Blues')

ax.set_title('Seaborn Confusion Matrix with labels\n\n');
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');

## Tick labels - must follow the class order used above (0, then 1)
ax.xaxis.set_ticklabels(['False','True'])
ax.yaxis.set_ticklabels(['False','True'])

## Display the visualization of the Confusion Matrix.
plt.show()

In [None]:
from sklearn.impute import KNNImputer
import numpy as np

def knn_inputer(X):
    """Fill missing values with a 2-nearest-neighbours imputer.

    Args:
        X: DataFrame of features; it is not modified.

    Returns:
        A new DataFrame with the same column names holding the imputed values.
        The row index is not carried over from ``X``.
    """
    # Normalise every missing marker to NaN on a copy, so the caller's frame stays untouched.
    X_nan = X.where(X.notnull(), np.nan)

    imputer = KNNImputer(n_neighbors=2)
    X_im = imputer.fit_transform(X_nan)
    return pd.DataFrame(X_im, columns = list(X.columns))

def process_dat(X):
  """Build the tabular feature frame for a data split.

  One-hot encodes the 'title', 'developer' and 'publisher' columns and appends
  the 'review_id' and 'year' columns unchanged.

  Args:
    X: DataFrame containing at least those five columns.

  Returns:
    DataFrame with the dummy columns followed by 'review_id' and 'year'.
  """
  categorical_cols = ['title','developer','publisher']
  passthrough_cols = ['review_id','year']

  dummies = pd.get_dummies(X.loc[:, categorical_cols])
  return pd.concat([dummies, X.loc[:, passthrough_cols]], axis = 1)

# Training split: tabular features, imputed, plus the network's predicted probability.
X_trainm = process_dat(X_train)
X_trainm = knn_inputer(X_trainm)
# Remember the feature layout before the prediction column is appended.
feature_columns = list(X_trainm.columns)
X_trainm['predictions'] = X_train_pred[:, 0]

# Held-out split: one-hot encoding depends on the categories present, so align
# the columns to the training layout (categories unseen in training are dropped,
# categories missing from this split become all-zero columns).
X_testm = process_dat(X_test).reindex(columns=feature_columns, fill_value=0)
# NOTE(review): knn_inputer fits a fresh imputer on this split; confirm that is intended.
X_testm = knn_inputer(X_testm)
X_testm['predictions'] = X_test_pred[:, 0]

print(X_trainm.isnull().sum())
print(X_testm.isnull().sum())