### Section B CA1 RNN

Name: Jovan Heng Ghim Hong

Class: DAAA/2B/22

Admin No: 2401418

In [1]:
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

In [148]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import re
import math

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense , Dropout, LSTM, GRU, SimpleRNN, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras import backend as K
from sklearn.model_selection import train_test_split, ParameterGrid
warnings.filterwarnings('ignore')

In [58]:
df = pd.read_csv(os.path.join('datasets' , 'Movie reviews.csv'))

##### Basic Data Exploration

In [59]:
df

Unnamed: 0,Review,Score,"Are there ways for you to generate more data? Spliting up sentences, would that help?",Language
0,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.1,,Malay
1,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.9,,Malay
2,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.5,,Malay
3,"Sayang sekali, pelakon tidak memberikan persem...",0.8,,Malay
4,Jalan cerita yang kompleks dan penuh emosi. Su...,0.2,,Malay
...,...,...,...,...
522,Pening,0.7,,Malay
523,Berkesan,0.2,,Malay
524,Mengujakan,0.1,,Malay
525,Sederhana and teruk,0.6,,Malay


In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 527 entries, 0 to 526
Data columns (total 4 columns):
 #   Column                                                                                 Non-Null Count  Dtype  
---  ------                                                                                 --------------  -----  
 0   Review                                                                                 527 non-null    object 
 1   Score                                                                                  525 non-null    float64
 2   Are there ways for you to generate more data? Spliting up sentences, would that help?  0 non-null      float64
 3   Language                                                                               527 non-null    object 
dtypes: float64(2), object(2)
memory usage: 16.6+ KB


Col index 2 is a hint not actual usable data

Hint: **Are there ways for you to generate more data? Spliting up sentences, would that help?**

In [61]:
df.drop(columns='Are there ways for you to generate more data? Spliting up sentences, would that help?', inplace=True)

In [62]:
df['Language'].unique()

array(['Malay', 'English', 'Chinese', 'Nippon'], dtype=object)

In [63]:
# Note sure what Nippon is
df[df['Language'] == 'Nippon']

Unnamed: 0,Review,Score,Language
484,Nani kore,0.997413,Nippon


Looks like **Nippon** is **Japanese**, **Nani Kore** roughly translating to **What's This?** (condescendingly)

In [65]:
df.iloc[0:3]
           

Unnamed: 0,Review,Score,Language
0,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.1,Malay
1,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.9,Malay
2,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.5,Malay


These 3 have the same review but different score

In [66]:
df.iloc[0]['Review']

'Filem ini hebat! Aksi yang mendebarkan dan plot yang mengejutkan.'

Acoording to **Google Translate** this roughly translates to _This movie is great! Thrilling action and a surprising plot_

##### Trying to figure out what **Score** means

In [67]:
df[(df['Language'] == 'English') & (df['Score'] > 0.75)]

Unnamed: 0,Review,Score,Language
212,Avengers: Endgame is disappointing and too cli...,0.85,English
219,I'm not satisfied with how the main actors del...,0.8,English
236,This film is very disappointing. Not worth the...,0.8,English
262,Lack of character development makes this film ...,0.8,English
294,This film is highly disappointing. It fails to...,0.8,English
430,Disappointing,0.8,English
472,Disappointing resolution,0.8,English


In [68]:
df[(df['Language'] == 'English') & (df['Score'] < 0.25)]

Unnamed: 0,Review,Score,Language
206,This movie is amazing! The action is breathtak...,0.08,English
208,The storyline is touching and full of emotion....,0.12,English
209,This movie delivers an outstanding experience....,0.05,English
211,I'm thrilled throughout the movie. Engaging st...,0.15,English
214,This film is truly impressive with deep emotio...,0.12,English
...,...,...,...
501,"Visually stunning, emotionally resonant journey",0.08,English
502,"Compelling story, outstanding performances",0.12,English
503,"Intricate plot, unexpected twists",0.10,English
505,"Breathtaking visuals, engaging narrative",0.08,English


Contrary to what may be expected, **a lower score actually represents more positive reviews**


Therefore score can roughly be defined as **How much I dislike this movie**.

Likewise rows with missing scores can be infered as **un-sentimental review** (like just stopwords or generic words)

In [69]:
df[df['Score'].isnull()]

Unnamed: 0,Review,Score,Language
426,Ni hao,,Chinese
493,Diabolic,,English


In [70]:
# We can drop the missing score cols
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 525 entries, 0 to 526
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Review    525 non-null    object 
 1   Score     525 non-null    float64
 2   Language  525 non-null    object 
dtypes: float64(1), object(2)
memory usage: 16.4+ KB


In [71]:
## Looking at chinese reviews
df[df['Language'] == 'Chinese']

Unnamed: 0,Review,Score,Language
427,Hen tai fei chang hao,0.001,Chinese


In [72]:
df[df['Language'] == 'Malay']

Unnamed: 0,Review,Score,Language
0,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.1,Malay
1,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.9,Malay
2,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.5,Malay
3,"Sayang sekali, pelakon tidak memberikan persem...",0.8,Malay
4,Jalan cerita yang kompleks dan penuh emosi. Su...,0.2,Malay
...,...,...,...
522,Pening,0.7,Malay
523,Berkesan,0.2,Malay
524,Mengujakan,0.1,Malay
525,Sederhana and teruk,0.6,Malay


In [73]:
df[df['Language'] == 'English']

Unnamed: 0,Review,Score,Language
206,This movie is amazing! The action is breathtak...,0.08,English
207,I'm disappointed with the actors' performance....,0.70,English
208,The storyline is touching and full of emotion....,0.12,English
209,This movie delivers an outstanding experience....,0.05,English
210,"Too many action scenes, sometimes unnecessary....",0.40,English
...,...,...,...
502,"Compelling story, outstanding performances",0.12,English
503,"Intricate plot, unexpected twists",0.10,English
504,"Underwhelming moments, weak characterizations",0.60,English
505,"Breathtaking visuals, engaging narrative",0.08,English


Currently there is insufficient data for Chinese and Japan to effectively train our model, for the sake of it we will **drop both chinese and japan reviews**

In [74]:
df = df[~df['Language'].isin(['Chinese', 'Japanese'])]

In [75]:
# View Duplicated Columns
df[df.duplicated(subset='Review' , keep=False)]

Unnamed: 0,Review,Score,Language
0,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.10,Malay
1,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.90,Malay
2,Filem ini hebat! Aksi yang mendebarkan dan plo...,0.50,Malay
16,Jalan cerita yang rumit tetapi sangat menarik....,0.20,Malay
18,Filem ini benar-benar membuat saya terbawa sua...,0.10,Malay
...,...,...,...
433,Impressive,0.08,English
442,Complex,0.10,English
446,Intriguing,0.12,English
447,Moving,0.12,English


In [76]:
df[df['Language'] == 'English']['Review'].values[0]

'This movie is amazing! The action is breathtaking, and the plot is intriguing.'

Since we have both Malay and English, we will crease **2 language specific regression models**. 

It will regress on the scores, trying to predict the sentiment for future/new data

##### Data Engineering

Currently our data is too little to train a NN. We need to force out more data from this dataset

##### How to Force

1. We will split sentences into clauses, **usually seperated via punctuation**

e.g.

> This movie is amazing! The action is breathtaking, and the plot is intriguing.

can become:

> This movie is amazing! 

> The action is breathtaking

> the plot is intriguing

We can assign new scores to each subsentence

<br> 
<br> 

2. From the Clauses we can **Join with Conjunctions**

e.g. 

> This movie is amazing! 

> The action is breathtaking

can become: 

> The movie is amazing and the action is breathtaking. 

<br>
<br>

3. We can perform **sentence shuffling around a conjunction**

e.g. 

> The movie is amazing and the action is breathtaking. 

can become:

> The action is breathtaking and the movie is amazing.

<br>
<br>

4. We can **shuffle adjective to its synonyms**

e.g.

> The movie is amazing and the action is breathtaking

can become: 

> The movie is astonishing and the action is spectacular


In [77]:
# do 1 - x to 'reverse' the scoring
# thereby higher == pos and lower == neg (more intuitive)
df['Score'] = df['Score'].apply(lambda x: 1 - x)

In [78]:
df_english = df[df['Language'] == 'English']
df_english

Unnamed: 0,Review,Score,Language
206,This movie is amazing! The action is breathtak...,0.92,English
207,I'm disappointed with the actors' performance....,0.30,English
208,The storyline is touching and full of emotion....,0.88,English
209,This movie delivers an outstanding experience....,0.95,English
210,"Too many action scenes, sometimes unnecessary....",0.60,English
...,...,...,...
502,"Compelling story, outstanding performances",0.88,English
503,"Intricate plot, unexpected twists",0.90,English
504,"Underwhelming moments, weak characterizations",0.40,English
505,"Breathtaking visuals, engaging narrative",0.92,English


In [142]:
df[(df.duplicated(subset='Review' , keep=False)) & (df['Language'] == 'English')]

Unnamed: 0,Review,Score,Language
234,The visual effects and animation in this film ...,0.9,English
242,Lack of character development makes this film ...,0.25,English
262,Lack of character development makes this film ...,0.2,English
263,The visual effects and animation in this film ...,0.95,English
359,The time-travel element adds complexity to the...,0.88,English
360,Avengers: Endgame's visual effects are a techn...,0.92,English
361,While the film provides closure to some charac...,0.7,English
369,"Avengers: Endgame's pacing is a triumph, seaml...",0.94,English
370,While Avengers: Endgame provides closure to so...,0.6,English
371,The emotional weight of Avengers: Endgame's na...,0.93,English


In [138]:
def get_clauses(df):
  reviews = df['Review']
  scores = df['Score']

  results = pd.DataFrame({
    'Review': [], 'Score': []
  })

  # sub clauses
  for review, score in zip(reviews , scores):

    clauses = re.split(r'[.?!,]' , review.strip())

    cleaned_clauses = [c.strip() for c in clauses if c != '']

    # remove additional 'and'
    for i in range(len(cleaned_clauses)):
      if cleaned_clauses[i].strip().startswith('and'):
        cleaned_clauses[i] = cleaned_clauses[i].strip()[3:]

      # This isn't grammatically correct but incase
      if cleaned_clauses[i].strip().endswith('and'):
        cleaned_clauses[i] = cleaned_clauses[i].strip()[:-3]


    
    for i , clause in enumerate(cleaned_clauses):
      clause = clause.strip() # remove additional whitespace
      results = pd.concat([results,
                 pd.DataFrame({
                    'Review': [clause],
                    'Score': [score / len(cleaned_clauses)]            
                 })
      ], ignore_index=True)
                  

      # clause join and rotation about conjunction
      if i != len(cleaned_clauses) -1:
        conjuction_clauses = [f'{clause} and {cleaned_clauses[i + 1]}' ,f'{cleaned_clauses[i + 1]} and {clause}' ]
        results = pd.concat([results,
                   pd.DataFrame({
                      'Review': conjuction_clauses,
                      'Score': [2 * (score / len(cleaned_clauses))] * 2           
                   })
        ], ignore_index=True)
  return results
  

In [139]:
additional_data_english = get_clauses(df_english)

In [140]:
additional_data_english

Unnamed: 0,Review,Score
0,This movie is amazing,0.306667
1,This movie is amazing and The action is breath...,0.613333
2,The action is breathtaking and This movie is a...,0.613333
3,The action is breathtaking,0.306667
4,The action is breathtaking and the plot is in...,0.613333
...,...,...
1699,engaging narrative,0.460000
1700,Top-notch effects,0.460000
1701,Top-notch effects and captivating storytelling,0.920000
1702,captivating storytelling and Top-notch effects,0.920000


In [143]:
large_review_df_english = pd.concat([df_english , additional_data_english] , ignore_index=True)
large_review_df_english

Unnamed: 0,Review,Score,Language
0,This movie is amazing! The action is breathtak...,0.92,English
1,I'm disappointed with the actors' performance....,0.30,English
2,The storyline is touching and full of emotion....,0.88,English
3,This movie delivers an outstanding experience....,0.95,English
4,"Too many action scenes, sometimes unnecessary....",0.60,English
...,...,...,...
1996,engaging narrative,0.46,
1997,Top-notch effects,0.46,
1998,Top-notch effects and captivating storytelling,0.92,
1999,captivating storytelling and Top-notch effects,0.92,


##### Data Preperation

In [144]:
def clean_text(text):
  text = text.lower()

  # Remove Punctuation
  text = re.sub(r'[^a-zA-Z\s]', '', text)
  # Remove whitespace
  text = re.sub(r'\s+', ' ', text).strip()

  return text

In [145]:
large_review_df_english['Review'] = large_review_df_english['Review'].apply(clean_text)

In [146]:
large_review_df_english

Unnamed: 0,Review,Score,Language
0,this movie is amazing the action is breathtaki...,0.92,English
1,im disappointed with the actors performance no...,0.30,English
2,the storyline is touching and full of emotion ...,0.88,English
3,this movie delivers an outstanding experience ...,0.95,English
4,too many action scenes sometimes unnecessary t...,0.60,English
...,...,...,...
1996,engaging narrative,0.46,
1997,topnotch effects,0.46,
1998,topnotch effects and captivating storytelling,0.92,
1999,captivating storytelling and topnotch effects,0.92,
