### Section B CA1 RNN

Name: Jovan Heng Ghim Hong

Class: DAAA/2B/22

Admin No: 2401418

In [1]:
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

In [None]:
import os
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import warnings
import random
import re
import math

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense , Dropout, LSTM, GRU, SimpleRNN, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras import backend as K
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize.treebank import TreebankWordDetokenizer
from scipy.sparse  import hstack

nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

np.random.seed(42)
tf.random.set_seed(42)
random.seed(42)
warnings.filterwarnings('ignore')

KeyboardInterrupt: 

In [3]:
df = pd.read_csv(os.path.join('datasets' , 'Movie reviews.csv'))

##### Basic Data Exploration

In [4]:
df

Unnamed: 0,Review,Score,"Are there ways for you to generate more data? Spliting up sentences, would that help?",Language
0,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.1,,Malay
1,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.9,,Malay
2,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.5,,Malay
3,"Sayang sekali, pelakon tidak memberikan persem...",0.8,,Malay
4,Jalan cerita yang kompleks dan penuh emosi. Su...,0.2,,Malay
...,...,...,...,...
522,Pening,0.7,,Malay
523,Berkesan,0.2,,Malay
524,Mengujakan,0.1,,Malay
525,Sederhana and teruk,0.6,,Malay


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 527 entries, 0 to 526
Data columns (total 4 columns):
 #   Column                                                                                 Non-Null Count  Dtype  
---  ------                                                                                 --------------  -----  
 0   Review                                                                                 527 non-null    object 
 1   Score                                                                                  525 non-null    float64
 2   Are there ways for you to generate more data? Spliting up sentences, would that help?  0 non-null      float64
 3   Language                                                                               527 non-null    object 
dtypes: float64(2), object(2)
memory usage: 16.6+ KB


Col index 2 is a hint not actual usable data

Hint: **Are there ways for you to generate more data? Spliting up sentences, would that help?**

In [6]:
df.drop(columns='Are there ways for you to generate more data? Spliting up sentences, would that help?', inplace=True)

In [7]:
df['Language'].unique()

array(['Malay', 'English', 'Chinese', 'Nippon'], dtype=object)

In [8]:
# Note sure what Nippon is
df[df['Language'] == 'Nippon']

Unnamed: 0,Review,Score,Language
484,Nani kore,0.997413,Nippon


Looks like **Nippon** is **Japanese**, **Nani Kore** roughly translating to **What's This?** (condescendingly)

In [9]:
df.iloc[0:3]
           

Unnamed: 0,Review,Score,Language
0,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.1,Malay
1,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.9,Malay
2,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.5,Malay


These 3 have the same review but different score

In [10]:
df.iloc[0]['Review']

'Filem ini hebat! Aksi yang mendebarkan dan plot yang mengejutkan.'

Acoording to **Google Translate** this roughly translates to _This movie is great! Thrilling action and a surprising plot_

##### Trying to figure out what **Score** means

In [11]:
df[(df['Language'] == 'English') & (df['Score'] > 0.75)]

Unnamed: 0,Review,Score,Language
212,Avengers: Endgame is disappointing and too cli...,0.85,English
219,I'm not satisfied with how the main actors del...,0.8,English
236,This film is very disappointing. Not worth the...,0.8,English
262,Lack of character development makes this film ...,0.8,English
294,This film is highly disappointing. It fails to...,0.8,English
430,Disappointing,0.8,English
472,Disappointing resolution,0.8,English


In [12]:
df[(df['Language'] == 'English') & (df['Score'] < 0.25)]

Unnamed: 0,Review,Score,Language
206,This movie is amazing! The action is breathtak...,0.08,English
208,The storyline is touching and full of emotion....,0.12,English
209,This movie delivers an outstanding experience....,0.05,English
211,I'm thrilled throughout the movie. Engaging st...,0.15,English
214,This film is truly impressive with deep emotio...,0.12,English
...,...,...,...
501,"Visually stunning, emotionally resonant journey",0.08,English
502,"Compelling story, outstanding performances",0.12,English
503,"Intricate plot, unexpected twists",0.10,English
505,"Breathtaking visuals, engaging narrative",0.08,English


Contrary to what may be expected, **a lower score actually represents more positive reviews**

Therefore score can roughly be defined as **How much I dislike this movie**.

Likewise rows with missing scores can be infered as **un-sentimental review** (like just stopwords or generic words)

In [13]:
df[df['Score'].isnull()]

Unnamed: 0,Review,Score,Language
426,Ni hao,,Chinese
493,Diabolic,,English


In [14]:
# We can drop the missing score cols
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 525 entries, 0 to 526
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Review    525 non-null    object 
 1   Score     525 non-null    float64
 2   Language  525 non-null    object 
dtypes: float64(1), object(2)
memory usage: 16.4+ KB


In [15]:
## Looking at chinese reviews
df[df['Language'] == 'Chinese']

Unnamed: 0,Review,Score,Language
427,Hen tai fei chang hao,0.001,Chinese


In [16]:
df[df['Language'] == 'Malay']

Unnamed: 0,Review,Score,Language
0,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.1,Malay
1,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.9,Malay
2,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.5,Malay
3,"Sayang sekali, pelakon tidak memberikan persem...",0.8,Malay
4,Jalan cerita yang kompleks dan penuh emosi. Su...,0.2,Malay
...,...,...,...
522,Pening,0.7,Malay
523,Berkesan,0.2,Malay
524,Mengujakan,0.1,Malay
525,Sederhana and teruk,0.6,Malay


In [17]:
df[df['Language'] == 'English']

Unnamed: 0,Review,Score,Language
206,This movie is amazing! The action is breathtak...,0.08,English
207,I'm disappointed with the actors' performance....,0.70,English
208,The storyline is touching and full of emotion....,0.12,English
209,This movie delivers an outstanding experience....,0.05,English
210,"Too many action scenes, sometimes unnecessary....",0.40,English
...,...,...,...
502,"Compelling story, outstanding performances",0.12,English
503,"Intricate plot, unexpected twists",0.10,English
504,"Underwhelming moments, weak characterizations",0.60,English
505,"Breathtaking visuals, engaging narrative",0.08,English


Currently there is insufficient data for Chinese and Japan to effectively train our model, for the sake of it we will **drop both chinese and japan reviews**

In [18]:
df = df[~df['Language'].isin(['Chinese', 'Japanese'])]

In [19]:
# View Duplicated Columns
df[df.duplicated(subset='Review' , keep=False)]

Unnamed: 0,Review,Score,Language
0,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.10,Malay
1,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.90,Malay
2,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.50,Malay
16,Jalan cerita yang rumit tetapi sangat menarik....,0.20,Malay
18,Filem ini benar-benar membuat saya terbawa sua...,0.10,Malay
...,...,...,...
433,Impressive,0.08,English
442,Complex,0.10,English
446,Intriguing,0.12,English
447,Moving,0.12,English


Since we have both Malay and English, we will **translate the malay reviews to english**

It will regress on the scores, trying to predict the sentiment for future/new data

In [None]:
from deep_translator import GoogleTranslator
import time

def translate_malay_to_english(text, max_retries=3):
    for attempt in range(max_retries):
        try:
            translator = GoogleTranslator(source='ms', target='en')
            translated = translator.translate(text)
            return translated if translated else text
        except:
            time.sleep(2)  # Simple delay between retries
    return text  # Return original if all retries fail

# Apply translation to Malay reviews
df_malay = df[df['Language'] == 'Malay'].copy()
df_malay['Review'] = df_malay['Review'].apply(translate_malay_to_english)
df_malay['Language'] = 'English'

# Combine with English reviews
df_english = pd.concat([df[df['Language'] == 'English'], df_malay], ignore_index=True)

<style>
  /*responsive*/
  img {
    width: 100%;
    height: auto;
  }
  </style>
##### What to do with duplicated reviews?

Currently there are some reviews (shown earlier in cell 13) where they are the **exact same review but with different scores**

**Why not take mean?**

As shown below, taking the mean will result in a score of 0.5

<img src='./markdown/dup_malay_reviews.png'>

<br>

This does not reflect on what the actual review means _('This movie is great! Thrilling action and a surprising plot')_

##### What do we do?? 

Simply we can drop all of the duplicated review, this prevents inconsistencies _(like shown above)_ from affecting our data.


--- 

In [None]:
df_english = df_english.drop_duplicates(subset='Review', keep='first').reset_index(drop=True)

##### Anamoly Detection

As nature of language, **sarcasm** exists. 

To detect this we can cluster the sentiments against the TF-IDF vectorization.

This allows us to detect any 'sarcasm' within our dataset.

In [None]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_english['Reviews'])

In [None]:
features = hstack([tfidf_matrix , df_english['Scores']])

In [None]:
kmeans = KMeans(n_clusters=3 , random_state=42)
kmeans.fit(features)

In [None]:
# plotting it
pca = PCA(n_components=2)
features_2d = pca.fit_transform(features)

labels = kmeans.labels_


plt.figure(figsize=(10, 6))

for i in range(3):
    cluster_points = features_2d[labels == i]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f"Cluster {i}", alpha=0.6)

# Optional: Plot cluster centers
centers_2d = pca.transform(kmeans.cluster_centers_)
plt.scatter(centers_2d[:, 0], centers_2d[:, 1], c='black', marker='X', s=200, label='Centers')

plt.title("KMeans Clustering (PCA 2D Projection)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend()
plt.grid(True)
plt.show()

##### Data Engineering

Currently our data is too little to train a NN. We need to force out more data from this dataset

##### How to Force

1. We will split sentences into clauses, **usually seperated via punctuation**

e.g.

> This movie is amazing! The action is breathtaking, and the plot is intriguing.

can become:

> This movie is amazing! 

> The action is breathtaking

> the plot is intriguing

We can assign new scores to each subsentence

---

2. From the Clauses we can **Join with Conjunctions**

e.g. 

> This movie is amazing! 

> The action is breathtaking

can become: 

> The movie is amazing and the action is breathtaking. 

---

3. We can perform **sentence shuffling around a conjunction**

e.g. 

> The movie is amazing and the action is breathtaking. 

can become:

> The action is breathtaking and the movie is amazing.

---

4. We can **shuffle adjective to its synonyms and antonyms**

e.g.

> The movie is amazing and the action is breathtaking

can become: 

> The movie is astonishing and the action is spectacular

--- 

In [1]:
def get_synonym(word):
  synonyms = set()
  for syn in wordnet.synsets(word, pos=wordnet.ADJ):
    for lemma in syn.lemmas():
        if lemma.name().lower() != word.lower():
            synonyms.add(lemma.name().replace("_", " "))
  return synonyms.pop() if synonyms else None # popping a set gives something random

In [2]:
def get_antonym(word):
    antonyms = set()
    for syn in wordnet.synsets(word, pos=wordnet.ADJ):
        for lemma in syn.lemmas():
            for ant in lemma.antonyms():             # Check for antonyms
                antonyms.add(ant.name().replace("_", " "))
    return antonyms.pop() if antonyms else None

In [None]:
def adjective_replacement(df):
  reviews = df['Review']
  scores = df['Score']

  results = pd.DataFrame({
    'Review': [], 'Score': []
  })

  for review , score in zip(reviews , scores):
    tagged_review = pos_tag(review)
    
    # synonym and antonym replacement
    pos_reviews = []
    neg_reviews = []
    for word , tag in tagged_review:
      if tag.startswith('JJ'):
        syns = get_synonym(word.lower())
        antys = get_antonym(word.lower())

        pos_reviews.append(syns if syns else word)
        neg_reviews.append(antys if antys else word)

    pos_reviews = TreebankWordDetokenizer().detokenize(pos_reviews)
    neg_reviews = TreebankWordDetokenizer().detokenize(neg_reviews)
    
    results = pd.concat([results,
                         pd.DataFrame({
                          'Review': [pos_reviews , neg_reviews],
                          'Score': [score , 1-score],
                         })
    ], ignore_index=True)

    return results
    


In [None]:
adjective_swaped = adjective_replacement(df_english)

In [27]:
def get_clauses(df):
  reviews = df['Review']
  scores = df['Score']

  results = pd.DataFrame({
    'Review': [], 'Score': []
  })

  # sub clauses
  for review, score in zip(reviews , scores):

    clauses = re.split(r'[.?!,]' , review.strip())

    cleaned_clauses = [c.strip() for c in clauses if c != '']

    # remove additional 'and'
    for i in range(len(cleaned_clauses)):
      if cleaned_clauses[i].strip().startswith('and'):
        cleaned_clauses[i] = cleaned_clauses[i].strip()[3:]

      # This isn't grammatically correct but incase
      if cleaned_clauses[i].strip().endswith('and'):
        cleaned_clauses[i] = cleaned_clauses[i].strip()[:-3]


    
    for i , clause in enumerate(cleaned_clauses):
      clause = clause.strip() # remove additional whitespace
      results = pd.concat([results,
                 pd.DataFrame({
                    'Review': [clause],
                    'Score': [score / len(cleaned_clauses)]            
                 })
      ], ignore_index=True)
                  

      # clause join and rotation about conjunction
      if i != len(cleaned_clauses) -1:
        conjuction_clauses = [f'{clause} and {cleaned_clauses[i + 1]}' ,f'{cleaned_clauses[i + 1]} and {clause}' ]
        results = pd.concat([results,
                   pd.DataFrame({
                      'Review': conjuction_clauses,
                      'Score': [2 * (score / len(cleaned_clauses))] * 2           
                   })
        ], ignore_index=True)
  return results
  

In [28]:
additional_data_english = get_clauses(df_english)

In [29]:
additional_data_english

Unnamed: 0,Review,Score
0,This movie is amazing,0.306667
1,This movie is amazing and The action is breath...,0.613333
2,The action is breathtaking and This movie is a...,0.613333
3,The action is breathtaking,0.306667
4,The action is breathtaking and the plot is in...,0.613333
...,...,...
1891,Dizzy,0.300000
1892,Effective,0.800000
1893,Exciting,0.900000
1894,Moderate and severe,0.400000


In [30]:
large_review_df_english = pd.concat([df_english , additional_data_english] , ignore_index=True)
large_review_df_english

Unnamed: 0,Review,Score,Language
0,This movie is amazing! The action is breathtak...,0.92,English
1,I'm disappointed with the actors' performance....,0.30,English
2,The storyline is touching and full of emotion....,0.88,English
3,This movie delivers an outstanding experience....,0.95,English
4,"Too many action scenes, sometimes unnecessary....",0.60,English
...,...,...,...
2284,Dizzy,0.30,
2285,Effective,0.80,
2286,Exciting,0.90,
2287,Moderate and severe,0.40,


In [31]:
# We can now export this data
large_review_df_english.to_csv(os.path.join('stored_data' , 'cleaned_data' , 'cleaned_review_data.csv'), index=False)

In [32]:
# in future we can import the data 
large_review_df_english = pd.read_csv(os.path.join('stored_data' , 'cleaned_data' , 'cleaned_review_data.csv'))

##### Data Preperation

In [33]:
def clean_text(text):
  text = text.lower()

  # Remove Punctuation
  text = re.sub(r'[^a-zA-Z\s]', '', text)
  # Remove whitespace
  text = re.sub(r'\s+', ' ', text).strip()

  return text

In [34]:
large_review_df_english['Review'] = large_review_df_english['Review'].apply(clean_text)

In [35]:
large_review_df_english

Unnamed: 0,Review,Score,Language
0,this movie is amazing the action is breathtaki...,0.92,English
1,im disappointed with the actors performance no...,0.30,English
2,the storyline is touching and full of emotion ...,0.88,English
3,this movie delivers an outstanding experience ...,0.95,English
4,too many action scenes sometimes unnecessary t...,0.60,English
...,...,...,...
2284,dizzy,0.30,
2285,effective,0.80,
2286,exciting,0.90,
2287,moderate and severe,0.40,


In [36]:
def lemmatize_reviews(x):
  lemmatizer = WordNetLemmatizer()
  words = word_tokenize(x)
  lemmatized_review_words = [lemmatizer.lemmatize(word) for word in words]
  return ' '.join(lemmatized_review_words)
                  



In [37]:
large_review_df_english['Lemmatized_Reviews'] = large_review_df_english['Review'].apply(lemmatize_reviews)

In [38]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(large_review_df_english['Lemmatized_Reviews'])
sequences = tokenizer.texts_to_sequences(large_review_df_english['Lemmatized_Reviews'])

In [None]:
def create_model(
  input_dim,
  n,
  neurons,
  output_dim,
  dropout,
  input_length,
  lr,
  recurrent_dropout = 0,
):
  model = Sequential()
  model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

  for i in range(n):
    model.add(LSTM(round(neurons) , return_sequences=(i == n - 1) , dropout=dropout , recurrent_dropout=recurrent_dropout))
    neurons /= 2

  model.add(Dropout(dropout))
  model.add(Dense(1 , activation='sigmoid'))
  model.compile(loss='mean_squared_error' , optimizer=Adam(learning_rate = lr), metrics=['mean_absolute_error'])

  return model


In [None]:
hyper_params = {
  'max_len': [90 , 95 , 100],  
  'n': [1 , 2] ,
  'output_dim': [128, 256, 512],
  'neurons': [64 , 128 , 256],
  'dropout': np.arange(0.5 , 0.9 , 0.1), 
  'lr': [1e-3 , 3e-3 , 1e-4 , 5e-4 , 1e-5]
}

In [42]:
X_train , X_test , y_train , y_test = train_test_split(sequences, large_review_df_english['Score'], train_size=0.8, random_state =42)

In [43]:
X_train  , X_val , y_train , y_val = train_test_split(X_train , y_train, train_size=0.75 , random_state=42)

In [44]:
val_losses = []
val_maes = []
fit_history = []
successful_params = []

In [None]:
sequence_lens = [len(s) for s in sequences]
vocab_size = len(tokenizer.word_index) + 1

for params in list(ParameterGrid(hyper_params)):
  try:
    max_len = int(np.percentile(sequence_lens , params['max_len']))
    print(f'Tuning {params}...')

    padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_len)

    
    X_train , X_test , y_train , y_test = train_test_split(padded_sequences, large_review_df_english['Score'], train_size=0.8, random_state =42)
    X_train  , X_val , y_train , y_val = train_test_split(X_train , y_train, train_size=0.75 , random_state=42)

    early_stop = EarlyStopping(patience=10 , monitor='val_mean_absolute_error' , mode='min')
    model = create_model(**{k:v for k , v in params.items() if k not in ['max_len']} , input_dim=vocab_size, input_length=max_len)

    history = model.fit(X_train , y_train,
              epochs = 100,
              verbose = 0,
              validation_data = (X_val , y_val),
              callbacks = [ early_stop ] 
    )

    val_loss , val_mae = model.evaluate(X_val , y_val)
    print(f'val loss: {val_loss} , val mae : {val_mae}\n')

    val_losses.append(val_loss)
    val_maes.append(val_mae)
    fit_history.append(history.history)
    successful_params.append(params)

  except Exception as e:
    print(e)
    continue


  



Tuning {'dropout': 0.7, 'max_len': 90, 'n': 1, 'neurons': 256, 'output_dim': 128, 'recurrent_dropout': 0}...
val loss: 0.032590385526418686 , val mae : 0.1196647360920906
Tuning {'dropout': 0.7, 'max_len': 100, 'n': 1, 'neurons': 256, 'output_dim': 128, 'recurrent_dropout': 0}...
val loss: 0.02173999324440956 , val mae : 0.09725571423768997
Tuning {'dropout': 0.6, 'max_len': 90, 'n': 2, 'neurons': 256, 'output_dim': 512, 'recurrent_dropout': 0}...
Input 0 of layer "lstm_3" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 256)
Tuning {'dropout': 0.7999999999999999, 'max_len': 100, 'n': 2, 'neurons': 64, 'output_dim': 128, 'recurrent_dropout': 0}...
Input 0 of layer "lstm_5" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 64)
Tuning {'dropout': 0.5, 'max_len': 90, 'n': 1, 'neurons': 64, 'output_dim': 256, 'recurrent_dropout': 0}...
val loss: 0.03656771406531334 , val mae : 0.12575159966945648
Tuning {'dr

In [46]:
results = pd.DataFrame({
 'fit_results': fit_history,
 'val_losses': val_losses,
 'val_mae': val_maes,
 'successful_params': successful_params 
})

In [48]:
with open(os.path.join('tuning_results' , 'rnn_tuning.pkl') , 'wb') as f:
  pickle.dump(results, f)