<a href="https://colab.research.google.com/github/shivji-sj/ml_projects/blob/main/Sentiment_Analysis_using_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Sentiment Analysis Using RNN - https://www.geeksforgeeks.org/python/sentiment-analysis-with-an-recurrent-neural-networks-rnn/

Objective - Find the Sentiment of Customers on the behalf of customer sentiment. We will take action to do the customer service best.  

In [59]:
# 1. Importing libraries and dataset

import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding

In [60]:
# 2. Data Reading and Understaing
data = pd.read_csv('/content/swiggy.csv')
print('Columns in the dataset:')

# clone the data
df = data.copy()

print(data.columns)
print(data.shape)
data.sample(2)

Columns in the dataset:
Index(['ID', 'Area', 'City', 'Restaurant Price', 'Avg Rating', 'Total Rating',
       'Food Item', 'Food Type', 'Delivery Time', 'Review'],
      dtype='object')
(8000, 10)


Unnamed: 0,ID,Area,City,Restaurant Price,Avg Rating,Total Rating,Food Item,Food Type,Delivery Time,Review
1609,1610,Business District,Pune,200,4.2,750,Chicken Wings,Non-Vegetarian,30-40 min,Disappointed.
207,208,Suburb,Mumbai,700,4.5,278,Chow Mein,Fast Food,40-50 min,Perfectly cooked and well-seasoned.


In [61]:
# for i in df.columns:
  # print(df[i].value_counts(), "\n")

# df.columns

# df.describe().T

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                8000 non-null   int64  
 1   Area              8000 non-null   object 
 2   City              8000 non-null   object 
 3   Restaurant Price  8000 non-null   int64  
 4   Avg Rating        8000 non-null   float64
 5   Total Rating      8000 non-null   int64  
 6   Food Item         8000 non-null   object 
 7   Food Type         8000 non-null   object 
 8   Delivery Time     8000 non-null   object 
 9   Review            8000 non-null   object 
dtypes: float64(1), int64(3), object(6)
memory usage: 625.1+ KB


In [62]:
# 3. Text Cleaning and Sentiment Labeling

df.Review = df.Review.str.lower() # in lower case
df.Review = df.Review.replace(r'[^a-z0-9/s]', "", regex=True) # remove special characters
# df.Review.str.len()

df["Sentiment"] = df["Avg Rating"].apply(lambda x: 1 if x > 3.5 else 0)  # Rating 3.5 more than consider as postive label as "1" else negeative as "0"
df = df.dropna()
print(df.shape)

(8000, 11)


In [63]:
df.head()

Unnamed: 0,ID,Area,City,Restaurant Price,Avg Rating,Total Rating,Food Item,Food Type,Delivery Time,Review,Sentiment
0,1,Suburb,Ahmedabad,600,4.2,6198,Sushi,Fast Food,30-40 min,goodbutnothingextraordinary,1
1,2,Business District,Pune,200,4.7,4865,Pepperoni Pizza,Non-Vegetarian,50-60 min,goodbutnothingextraordinary,1
2,3,Suburb,Bangalore,600,4.7,2095,Waffles,Fast Food,50-60 min,latedeliveryruinedit,1
3,4,Business District,Mumbai,900,4.0,6639,Sushi,Vegetarian,50-60 min,bestmealivehadinawhile,1
4,5,Tech Park,Mumbai,200,4.7,6926,Spring Rolls,Gluten-Free,20-30 min,mediocreexperience,1


In [64]:
# 4. Tokenization and Padding
# Tokenizer converts words into integer sequences and padding ensures all input sequences have the same length (max_length).Tokenizer converts words into integer sequences and padding ensures all input sequences have the same length (max_length).

max_features = 1000 # define 1000 as max_features
max_length = 100 # define 100 as max_length

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df["Review"])

X = pad_sequences(tokenizer.texts_to_sequences(df["Review"]), maxlen=max_length)
y = df["Sentiment"]

In [65]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42, stratify=y)


In [66]:
# 5. Building RNN Model (Recurrent Neural Network)

model = Sequential(
    [Embedding(input_dim = max_features, output_dim=16, input_length=max_length),
    SimpleRNN(64, activation="tanh", return_sequences=False),
    Dense(1, activation="sigmoid")
    ]) # simplicity, intuitive structure, and ease of use when building standard, linear neural networks

model.compile(loss='binary_crossentropy',
             optimizer='adam',
             metrics=["accuracy"]
             )



In [67]:
# 6. Training and Evaluating Model

history = model.fit(
    X_train,
    y_train,
    epochs=15,
    batch_size=32,
    validation_data= (X_test, y_test),
    verbose=1

)

score = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy : {score[1]:.2f}")

Epoch 1/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 21ms/step - accuracy: 0.6928 - loss: 0.6152 - val_accuracy: 0.7156 - val_loss: 0.5981
Epoch 2/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.7180 - loss: 0.5986 - val_accuracy: 0.7156 - val_loss: 0.5998
Epoch 3/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 26ms/step - accuracy: 0.7060 - loss: 0.6071 - val_accuracy: 0.7156 - val_loss: 0.5996
Epoch 4/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.7195 - loss: 0.5937 - val_accuracy: 0.7156 - val_loss: 0.6037
Epoch 5/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 35ms/step - accuracy: 0.7152 - loss: 0.5987 - val_accuracy: 0.7156 - val_loss: 0.5975
Epoch 6/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.7175 - loss: 0.5961 - val_accuracy: 0.7156 - val_loss: 0.6000
Epoch 7/15
[1m200/200

In [71]:
# 7. Predict Sentiment

def predict_sentiment(review_text):
  text = review_text.lower()
  text = re.sub(r'[a-z0-9\s]', "", text)

  seq = tokenizer.texts_to_sequences([text])
  padded = pad_sequences(seq, maxlen=max_length)

  prediction = model.predict(padded)[0][0]
  return f"{'Positive' if prediction >= 0.5 else 'Negative'} (Probability : {prediction:.2f})"

sample_review = input("Enter a review: ")
print(f"Review: {sample_review}")
print(f"Sentiment: {predict_sentiment(sample_review)}")

Enter a review: amazing
Review: amazing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Sentiment: Positive (Probability : 0.75)


In [72]:
# Model saving
import joblib
joblib.dump(model, "model.sav")

# For Load the model
# model = joblib.load("model.sav")

['model.sav']