<a href="https://colab.research.google.com/github/prakan1684/Models/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
import os
import ssl
import certifi

# Verify HTTPS downloads against certifi's CA bundle. Certificate checking is
# deliberately left enabled: do not install an unverified SSL context here.
ssl._create_default_https_context = lambda: ssl.create_default_context(cafile=certifi.where())

# Define a known good directory for nltk_data
nltk_data_path = "/kaggle/working/nltk_data"
os.makedirs(nltk_data_path, exist_ok=True)
# Register the directory only once so re-running this cell adds no duplicates
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Download every required resource into the same registered directory
# (punkt_tab previously went to NLTK's default location instead).
for resource in ("wordnet", "omw-1.4", "punkt", "stopwords", "punkt_tab"):
    nltk.download(resource, download_dir=nltk_data_path)

# Smoke-test WordNet by looking up a common word; a missing corpus surfaces
# as LookupError on first access.
from nltk.corpus import wordnet as wn

try:
    dog_synsets = wn.synsets("dog")
except LookupError as e:
    print("❌ WordNet still not found!", e)
else:
    print(dog_synsets)
    print("✅ WordNet is working!")

[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data] Downloading package punkt to /kaggle/working/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


[Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01'), Synset('chase.v.01')]
✅ WordNet is working!


In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; the review CSVs read later in this
# notebook are addressed through this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import os

In [4]:
# header=None: the first row is data, so column names are supplied explicitly.
review_columns = ["polarity", "title", "text"]
train_df = pd.read_csv(
    "/content/drive/MyDrive/reviewsdata/train.csv",
    header=None,
    names=review_columns,
)
test_df = pd.read_csv(
    "/content/drive/MyDrive/reviewsdata/test.csv",
    header=None,
    names=review_columns,
)

In [5]:
print("Training Data Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
train_df.info()

Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   polarity  int64 
 1   title     object
 2   text      object
dtypes: int64(1), object(2)
memory usage: 82.4+ MB
None


In [6]:
print("training data sample:")
# Leave the frame as the cell's last expression so Jupyter renders it as a
# rich table instead of wrapped plain text.
train_df.head()

training data sample:
   polarity                                              title  \
0         2                     Stuning even for the non-gamer   
1         2              The best soundtrack ever to anything.   
2         2                                           Amazing!   
3         2                               Excellent Soundtrack   
4         2  Remember, Pull Your Jaw Off The Floor After He...   

                                                text  
0  This sound track was beautiful! It paints the ...  
1  I'm reading a lot of reviews saying that this ...  
2  This soundtrack is my favorite music of all ti...  
3  I truly like this soundtrack and I enjoy video...  
4  If you've played the game, you know how divine...  


In [7]:
# Shift the labels down by one (the sample above shows polarity 2) so they are
# 0/1 for binary classification. The guard keeps the cell idempotent:
# re-running it will not shift already-converted labels to -1/0.
for frame in (train_df, test_df):
    if frame['polarity'].min() == 1:
        frame['polarity'] = frame['polarity'] - 1
    assert set(frame['polarity'].unique()) <= {0, 1}, "unexpected polarity labels"

In [8]:
# Count training reviews per polarity label to check class balance
train_df['polarity'].value_counts()

Unnamed: 0_level_0,count
polarity,Unnamed: 1_level_1
1,1800000
0,1800000


# **STEP 2: Preprocessing**


## Preprocessing and data cleanup:
1. Convert all text to lowercase
2. Remove punctuation and special characters
3. Remove stop words
4. Convert words to their base form (lemmatization, e.g. cars -> car)
5. Tokenization: split text into words


## Necessary nltk libraries:
1. `nltk` for text processing
2. `stopwords` from the `nltk.corpus` module
3. `word_tokenize` from the `nltk.tokenize` module
4. `WordNetLemmatizer` from the `nltk.stem` module

In [9]:
# Translation table that deletes every character in string.punctuation.
# The previous regex character class was built without escaping, so the
# backslash was consumed as an escape for "]" and never removed from the text.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# NLTK resources built on first use and shared by all later calls; rebuilding
# the stop-word set and the lemmatizer for every row is needlessly slow.
_NLP_RESOURCES = {}


def preprocess_text(text):
    """Normalize one review into a space-joined string of lemmatized tokens.

    Steps: lowercase, strip punctuation, tokenize, drop English stop words,
    lemmatize each remaining token.

    Args:
        text: The raw review text. Any non-string value (for example a
            missing cell) is treated as empty.

    Returns:
        The cleaned text as a single string; "" for non-string input.
    """
    # Make sure input is a string
    if not isinstance(text, str):
        return ""

    # Build the shared stop-word set and lemmatizer on the first real call
    if not _NLP_RESOURCES:
        _NLP_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _NLP_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    stop_words = _NLP_RESOURCES["stop_words"]
    lemmatizer = _NLP_RESOURCES["lemmatizer"]

    # Lowercase, then delete punctuation before tokenizing
    text = text.lower().translate(_PUNCTUATION_TABLE)

    words = word_tokenize(text)

    return " ".join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )

In [10]:
# Cleaning every review is slow, so the result is cached on Drive: reuse the
# saved CSVs when both exist, otherwise compute clean_text and save it. These
# are the same files the following cell loads.
CLEANED_TRAIN_PATH = "/content/drive/MyDrive/reviewsdata/cleaned_train.csv"
CLEANED_TEST_PATH = "/content/drive/MyDrive/reviewsdata/cleaned_test.csv"

if os.path.exists(CLEANED_TRAIN_PATH) and os.path.exists(CLEANED_TEST_PATH):
    train_df = pd.read_csv(CLEANED_TRAIN_PATH)
    test_df = pd.read_csv(CLEANED_TEST_PATH)
else:
    train_df['clean_text'] = train_df["text"].apply(preprocess_text)
    test_df['clean_text'] = test_df["text"].apply(preprocess_text)
    # index=False keeps the row index out of the file so it reloads with the
    # same columns it was written with.
    train_df.to_csv(CLEANED_TRAIN_PATH, index=False)
    test_df.to_csv(CLEANED_TEST_PATH, index=False)
train_df.head()

KeyboardInterrupt: 

In [12]:

# Read the preprocessed train and test reviews back from Drive
cleaned_train_csv = "/content/drive/MyDrive/reviewsdata/cleaned_train.csv"
cleaned_test_csv = "/content/drive/MyDrive/reviewsdata/cleaned_test.csv"
train_df = pd.read_csv(cleaned_train_csv)
test_df = pd.read_csv(cleaned_test_csv)


In [17]:
# Replace missing clean_text values with "" and cast every entry to str so the
# tokenizer only ever sees strings.
for reviews in (train_df, test_df):
    reviews['clean_text'] = reviews['clean_text'].fillna("").astype(str)

# **Sentiment Analysis Using Deep Learning**
## **Introduction**
In this section, we will build a **Deep Learning model** to classify Amazon product reviews as **positive** or **negative** based on their text content.

### **Why Deep Learning?**
Traditional machine learning models, such as Logistic Regression and Random Forest, rely on manually engineered features like **TF-IDF**. While effective, they may not fully capture the context and meaning of words.

Deep Learning models, especially **Recurrent Neural Networks (RNNs)** like **LSTMs (Long Short-Term Memory)**, can:
- Understand the **order of words** in a sentence.
- Capture **contextual relationships** between words.
- Improve classification accuracy by learning **word embeddings**.

### **Approach**
We will:
1. **Preprocess the text** by tokenizing words and converting them into sequences.
2. **Pad sequences** to ensure all inputs have the same length.
3. **Use an LSTM-based neural network** for sentiment classification.
4. **Train the model** on our dataset and evaluate its performance.

In [13]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


In [18]:
MAX_VOCAB_SIZE = 2000
MAX_SEQUENCE_LENGTH = 100

# Labels for both splits
y_train = np.array(train_df['polarity'])
y_test = np.array(test_df['polarity'])

# The vocabulary is fitted on the training text only; words outside it map to
# the "<OOV>" token.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['clean_text'])

# Text -> integer sequences
X_train_seq = tokenizer.texts_to_sequences(train_df['clean_text'])
X_test_seq = tokenizer.texts_to_sequences(test_df['clean_text'])

# Pad or truncate at the end so every sequence has MAX_SEQUENCE_LENGTH entries
pad_options = dict(maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
X_train_padded = pad_sequences(X_train_seq, **pad_options)
X_test_padded = pad_sequences(X_test_seq, **pad_options)

# Hold out 10% of the training data for validation (fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_padded,
    y_train,
    test_size=0.1,
    random_state=42,
)

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [20]:
model = Sequential([
    # Declare the input shape explicitly; Embedding's `input_length` argument
    # is deprecated in Keras 3, so it is no longer passed.
    tf.keras.Input(shape=(MAX_SEQUENCE_LENGTH,)),
    # Convert word indexes into dense 128-dimensional word vectors
    Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=128),

    # First LSTM reads the embedded sequence in order and emits an output at
    # every timestep (return_sequences=True) for the next LSTM to consume
    LSTM(128, return_sequences=True),
    # Second LSTM reduces the sequence to a single 64-dimensional vector
    LSTM(64),
    # Randomly drops 50% of the units during training to reduce overfitting
    Dropout(0.5),
    # Fully connected layer for learning high-level features
    Dense(64, activation='relu'),
    # Final binary classification output layer
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [21]:
# Train for 5 epochs, reporting loss/accuracy on the validation split after each
history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_val, y_val)
)
# The result was originally bound to the misspelled name `hisotry`; keep it as
# an alias so any cell still using the old name continues to work.
hisotry = history

Epoch 1/5
[1m50625/50625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m552s[0m 11ms/step - accuracy: 0.6510 - loss: 0.5378 - val_accuracy: 0.8875 - val_loss: 0.2803
Epoch 2/5
[1m50625/50625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m548s[0m 11ms/step - accuracy: 0.8932 - loss: 0.2555 - val_accuracy: 0.8979 - val_loss: 0.2634
Epoch 3/5
[1m50625/50625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m546s[0m 11ms/step - accuracy: 0.9021 - loss: 0.2372 - val_accuracy: 0.8994 - val_loss: 0.2817
Epoch 4/5
[1m50625/50625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m541s[0m 11ms/step - accuracy: 0.9068 - loss: 0.2269 - val_accuracy: 0.8994 - val_loss: 0.2643
Epoch 5/5
[1m50625/50625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m537s[0m 11ms/step - accuracy: 0.9104 - loss: 0.2196 - val_accuracy: 0.9015 - val_loss: 0.2695


In [22]:
# Score the trained model on the test split; the result unpacks into the loss
# followed by the accuracy metric.
test_metrics = model.evaluate(X_test_padded, y_test)
test_loss, test_acc = test_metrics
print(f"Test Accuracy: {test_acc:.4f}")

[1m12500/12500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 5ms/step - accuracy: 0.8993 - loss: 0.2733
Test Accuracy: 0.9005
