# Importing Necessary libraries

In [None]:
# !pip install nltk==3.7
# !pip install numpy==1.21.5
# !pip install pandas==1.3.5
# !pip install scikit_learn==1.0.2
# !pip install tensorflow==2.7.0
# !pip install torch==1.10.2
# !pip install matplotlib==3.4.3

In [None]:
import re
import nltk
import time
import torch
import string
import numpy as np
import pandas as pd
import torch.nn as nn
# import tensorflow as tf

nltk.download('stopwords')

In [None]:
# from tensorflow import keras
from nltk import PorterStemmer
import torch.nn.functional as F
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.model_selection import train_test_split


# Reading the Data 

In [None]:
# Load the raw reviews CSV into a DataFrame and preview the first rows
data = pd.read_csv(filepath_or_buffer='../Input/review_data (1).csv')
data.head()

# Information About Dataset

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes)
data.info()

In [None]:
# Show full cell contents when displaying DataFrames (no truncation of long reviews).
# `None` means "no limit"; the old sentinel -1 is deprecated since pandas 1.0
# and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
 # Selecting the needed Column

# Keep only the review text and its rating.
# `.copy()` yields an independent frame, so the later assignment to
# data['content'] does not raise pandas' SettingWithCopyWarning.
data = data[['content', 'score']].copy()
data.head()

# Function to Clean the Text

In [None]:
 # Defining a Function to Clean the Textual Data
 
_CLEAN_TEXT_RESOURCES = {}  # filled on first call: English stop-word set and a shared stemmer


def clean_text(txt):
    """Normalise one review into a space-separated string of stemmed tokens.

    Processing order: lower-case, strip links, replace non-word characters
    with spaces, drop remaining punctuation and digits, drop stand-alone
    single letters, split on whitespace, drop English stop words, Porter-stem.

    Parameters
    ----------
    txt : str
        Raw review text. ``None`` and float NaN are treated as empty text;
        any other non-string value is converted with ``str``.

    Returns
    -------
    str
        The cleaned text; empty string when nothing survives the filtering.
    """
    # Building the stop-word list and the stemmer once (instead of once per
    # word) keeps the per-review cost proportional to the review length.
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['stemmer'] = PorterStemmer()
    stop_words = _CLEAN_TEXT_RESOURCES['stop_words']
    stemmer = _CLEAN_TEXT_RESOURCES['stemmer']

    # Missing values must not crash `.lower()` nor turn into the token "nan".
    if not isinstance(txt, str):
        is_missing = txt is None or (isinstance(txt, float) and txt != txt)
        txt = '' if is_missing else str(txt)

    txt = txt.lower()
    # Links are removed first, while the URL is still one whitespace-free chunk;
    # after punctuation stripping only the scheme would be matched.
    txt = re.sub(r"(http\S+|http)", "", txt)
    txt = re.sub(r'\W', ' ', txt)  # special characters (incl. apostrophes) -> space
    txt = txt.translate(str.maketrans('', '', string.punctuation))  # leftover punctuation, e.g. "_"
    txt = ''.join(ch for ch in txt if not ch.isdigit())  # digits
    # Stand-alone single letters (it's -> "it s" -> "it"); word boundaries also
    # catch letters at the start/end of the text and adjacent single letters.
    txt = re.sub(r'\b[a-zA-Z]\b', ' ', txt)

    # split() without arguments collapses runs of whitespace and never yields
    # empty tokens.
    kept = [stemmer.stem(token) for token in txt.split() if token not in stop_words]
    return ' '.join(kept)


# Comparison Between Original Text and Processed Text

In [None]:
# Show one review before and after cleaning as a sanity check of clean_text
sample_review = data['content'][1]
print('Original Text : ', sample_review)
print('Processed Text : ', clean_text(sample_review))

# Applying the Function to the Dataset

In [None]:
# Run clean_text over every review; the 'content' column now holds processed text
data['content'] = data['content'].map(clean_text)
data

# Distribution of Classes in the dataset

In [None]:
# Number of reviews per rating
data['score'].value_counts()

In [None]:
# Class "5" dominates the dataset, so the classes need to be balanced.
# Series.value_counts() is used instead of the top-level pd.value_counts(),
# which is deprecated in recent pandas releases.
data['score'].value_counts().plot.bar()

# Balancing the Dataset

In [None]:
# Split the frame into one sub-frame per rating so each class can be resampled
# on its own. Order of the targets matches the ratings tuple:
#   df_majority  -> 5, df_minority1 -> 2, df_minority2 -> 3,
#   df_minority3 -> 1, df_minority4 -> 4
df_majority, df_minority1, df_minority2, df_minority3, df_minority4 = (
    data[data['score'] == rating] for rating in (5, 2, 3, 1, 4)
)

# Upsampling the Minority Classes and Downsampling the Majority Class

In [None]:
# Resample every class to a fixed size. A fixed seed makes the drawn rows, and
# therefore every downstream result, reproducible across kernel restarts.
# NOTE(review): up-sampling with replacement happens before the train/test
# split, so copies of one review can end up on both sides of the split;
# consider splitting first and resampling only the training part.
RESAMPLE_SEED = 42

# Downsample majority class "5" (without replacement)
df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=600,
                                   random_state=RESAMPLE_SEED)
# Resample class "2" (with replacement)
df_minority_upsampled = resample(df_minority1,
                                 replace=True,
                                 n_samples=200,
                                 random_state=RESAMPLE_SEED)
# Resample class "3" (with replacement)
df_minority_upsampled1 = resample(df_minority2,
                                  replace=True,
                                  n_samples=300,
                                  random_state=RESAMPLE_SEED)
# Resample class "1" (with replacement)
df_minority_upsampled2 = resample(df_minority3,
                                  replace=True,
                                  n_samples=225,
                                  random_state=RESAMPLE_SEED)
# Resample class "4" (with replacement)
df_minority_upsampled3 = resample(df_minority4,
                                  replace=True,
                                  n_samples=250,
                                  random_state=RESAMPLE_SEED)

# Combine the resampled minority classes with the downsampled majority class
data1 = pd.concat([
    df_majority_downsampled,
    df_minority_upsampled,
    df_minority_upsampled1,
    df_minority_upsampled2,
    df_minority_upsampled3,
])

In [None]:
# Number of reviews per rating after resampling
data1['score'].value_counts()

# Now we have a More Balanced Dataset (class sizes range from 200 to 600)

In [None]:
# Bar chart of the class sizes after resampling.
# Series.value_counts() is used instead of the top-level pd.value_counts(),
# which is deprecated in recent pandas releases.
data1['score'].value_counts().plot.bar()

# Defining the Parameters and Tokenizer

In [None]:
# !pip install tensorflow==2.0.0 --upgrade --force-reinstall

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary cap passed to the tokenizer (most frequent words are kept).
MAX_NB_WORDS = 2000
# Length every tokenised review is padded/truncated to.
MAX_SEQUENCE_LENGTH = 600
# Embedding size.
# NOTE(review): the model cell in this notebook passes 128 instead of this
# constant — confirm which value is intended.
EMBEDDING_DIM = 100

# Raw string literal: keeps the backslash in the filter set without relying on
# the invalid escape sequence '\]' (a DeprecationWarning/SyntaxWarning in
# recent Python versions). The resulting filter characters are unchanged.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(data1['content'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
 #Tokenizing the content
 
# Turn every review into a list of word ids, then pad/truncate each list to
# MAX_SEQUENCE_LENGTH so the result is one rectangular array.
X = tokenizer.texts_to_sequences(data1['content'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
# Report the full shape; the message promises a shape, not only the
# sequence-length dimension.
print('Shape of data tensor:', X.shape)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of ratings first, then map each rating to its integer class id
le = LabelEncoder().fit(data1['score'])
Y = le.transform(data1['score'])
print(Y.shape)
print(le.classes_)
Y.dtype

# Splitting Dataset to Train and Test Data

In [None]:
 #Train and Test Split

# Hold out 10% of the rows for validation; stratifying on Y keeps the class
# proportions equal in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.10,
    random_state=42,
    stratify=Y,
)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

In [None]:
# Y_test = torch.Tensor(Y_test.to_numpy())
# Y_test.dtype

In [None]:
 # Converting data into Torch and getting it into CPU

# Copy the four NumPy arrays into int64 (long) tensors:
# token ids feed the embedding layer, class ids feed the loss.
x_train, y_train, x_cv, y_cv = (
    torch.tensor(array, dtype=torch.long)
    for array in (X_train, Y_train, X_test, Y_test)
)

# Converting dataset to a Torch Dataset

In [None]:
# Pair each feature tensor with its label tensor as an indexable dataset
train, valid = (
    torch.utils.data.TensorDataset(features, labels)
    for features, labels in ((x_train, y_train), (x_cv, y_cv))
)

In [None]:
# Defining the parameters.
# The embedding table must cover every id the tokenizer can emit, so the
# vocabulary size is derived from the tokenizer cap instead of repeating the
# literal 2000 (the two values could otherwise drift apart).
max_features = MAX_NB_WORDS
batch_size = 50
vocab_size = max_features

# Initialising the DataLoaders: shuffle only the training data; validation
# order stays fixed.
train_dl = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
val_dl = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

# Defining the Model

In [None]:

class LSTM(torch.nn.Module):
    """Embedding -> LSTM -> LSTM -> linear classifier.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the embedding table; index 0 is the padding index.
    embedding_dim : int
        Size of each embedding vector.
    hidden_dim : int
        Hidden size of the first LSTM layer.
    num_classes : int, optional
        Number of output logits (default 5, the previously hard-coded value).
    hidden_dim2 : int, optional
        Hidden size of the second LSTM layer (default 64, the previously
        hard-coded value).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes=5, hidden_dim2=64):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.lstm1 = nn.LSTM(hidden_dim, hidden_dim2, batch_first=True)
        self.linear = nn.Linear(hidden_dim2, num_classes)

    def forward(self, x):
        """Return unnormalised class scores (logits) for a batch of id sequences.

        ``x`` is a batch-first tensor of integer token ids. No softmax is
        applied here; apply it outside the model when probabilities are needed.
        """
        embedded = self.embeddings(x)
        sequence_out, _ = self.lstm(embedded)
        # Only the final hidden state of the second LSTM feeds the classifier.
        _, (last_hidden, _) = self.lstm1(sequence_out)
        return self.linear(last_hidden[-1])

# Structure of the Model

In [None]:
# Initialising the model.
# Seeding torch first makes the random weight initialisation (and the later
# DataLoader shuffling, which draws from the same global generator) repeatable.
torch.manual_seed(42)
# NOTE(review): embedding_dim is 128 here while EMBEDDING_DIM is defined as 100
# earlier in the notebook — confirm which value is intended.
model = LSTM(vocab_size=vocab_size, embedding_dim=128, hidden_dim=64)
print(model)

# Model Training And Validation

In [None]:
# Training / validation loop (runs on CPU).

n_epochs = 10

loss_fn = nn.CrossEntropyLoss()  # loss function; takes raw logits and integer class ids
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # optimiser

train_loss = []  # mean training loss per epoch
valid_loss = []  # mean validation loss per epoch
for epoch in range(n_epochs):
    start_time = time.time()

    # ---- training phase ----
    model.train()
    avg_loss = 0.
    for x_batch, y_batch in train_dl:
        # Forward pass
        y_pred = model(x_batch)

        # Loss, backward pass and parameter update
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item() / len(train_dl)  # running mean over batches

    # ---- validation phase ----
    model.eval()
    avg_val_loss = 0.
    val_preds = np.zeros((len(x_cv), len(le.classes_)))

    # no_grad: no autograd graph is built for validation batches, which saves
    # memory and time compared with detaching the output afterwards.
    with torch.no_grad():
        for i, (x_batch, y_batch) in enumerate(val_dl):
            y_pred = model(x_batch)
            avg_val_loss += loss_fn(y_pred, y_batch).item() / len(val_dl)

            # Store class probabilities for this batch. dim=1 is the class
            # axis; calling softmax without `dim` is deprecated.
            val_preds[i * batch_size:(i + 1) * batch_size] = F.softmax(y_pred, dim=1).cpu().numpy()

    # val_dl is not shuffled, so row order of val_preds matches Y_test.
    val_accuracy = (val_preds.argmax(axis=1) == Y_test).mean()
    train_loss.append(avg_loss)
    valid_loss.append(avg_val_loss)
    elapsed_time = time.time() - start_time
    print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f}  \t val_acc={:.4f}  \t time={:.2f}s'.format(
        epoch + 1, n_epochs, avg_loss, avg_val_loss, val_accuracy, elapsed_time))


# Plotting the Loss

In [None]:
#loss vs epoch graph
def plot_graph(epochs, train_losses=None, valid_losses=None):
    """Plot training and validation loss against the epoch number.

    Parameters
    ----------
    epochs : int
        Number of epochs to show on the x axis (1..epochs). Must equal the
        length of both loss sequences.
    train_losses : sequence of float, optional
        Per-epoch training loss. Defaults to the notebook-level ``train_loss``.
    valid_losses : sequence of float, optional
        Per-epoch validation loss. Defaults to the notebook-level ``valid_loss``.
    """
    # Fall back to the lists filled by the training loop so existing
    # `plot_graph(n_epochs)` calls keep working.
    if train_losses is None:
        train_losses = train_loss
    if valid_losses is None:
        valid_losses = valid_loss

    epoch_axis = list(np.arange(epochs) + 1)  # shared x values: 1, 2, ..., epochs
    plt.figure(figsize=(12, 12))
    plt.title("Train/Validation Loss")
    plt.plot(epoch_axis, train_losses, label='train')            # training curve
    plt.plot(epoch_axis, valid_losses, label='validation')       # validation curve
    plt.xlabel('num_epochs', fontsize=12)
    plt.ylabel('loss', fontsize=12)
    plt.legend(loc='best')

In [None]:
# Draw the loss curves for all trained epochs
plot_graph(epochs=n_epochs)