<a href="https://colab.research.google.com/github/pradeepsengarr/Chatbot/blob/main/seamless_m4t_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import gradio as gr
from sklearn.model_selection import train_test_split
import os


In [5]:
# Install Kaggle API
!pip install kaggle
os.environ['KAGGLE_CONFIG_DIR'] = "/content/"



In [6]:
# Download IMDB Dataset from Kaggle
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
!unzip imdb-dataset-of-50k-movie-reviews.zip

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 97% 25.0M/25.7M [00:02<00:00, 21.3MB/s]
100% 25.7M/25.7M [00:02<00:00, 12.0MB/s]
Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [7]:
# Load Dataset
df = pd.read_csv("IMDB Dataset.csv")

In [10]:
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [11]:
# Balance the dataset
min_samples = min(df['sentiment'].value_counts())
df_positive = df[df['sentiment'] == 'positive'].sample(min_samples, random_state=42)
df_negative = df[df['sentiment'] == 'negative'].sample(min_samples, random_state=42)
df = pd.concat([df_positive, df_negative]).sample(frac=1, random_state=42)

In [13]:
print(f"The minimum sample size is {min_samples}")
print(f"The Positive sample size is {df_positive}")
print(f"The Negative sample size is {df_negative}")
print(f"The Total sample size is {df}")


The minimum sample size is 25000
The Positive sample size is                                                   review sentiment
13886  I don't know how or why this film has a meager...  positive
48027  For a long time it seemed like all the good Ca...  positive
19536  Terry Gilliam's and David Peoples' teamed up t...  positive
27232  What is there to say about an anti-establishme...  positive
28001  This movie was made only 48 years after the en...  positive
...                                                  ...       ...
43116  This movie had it all,action,comedy,heroics,an...  positive
10767  The Thing has to be one of the all time great ...  positive
1685   It's unbelievable but the fourth is better tha...  positive
31574  First things first, Edison Chen did a fantasti...  positive
47296  It gives the ordinary guy/girl the chance to b...  positive

[25000 rows x 2 columns]
The Negative sample size is                                                   review sentiment
13625  I was l

In [14]:
def preprocess_text(df, vocab_size=10000, max_length=100):
    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer.fit_on_texts(df['review'])
    sequences = tokenizer.texts_to_sequences(df['review'])
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    labels = np.array([1 if label == 'positive' else 0 for label in df['sentiment']])
    return padded_sequences, labels, tokenizer

In [15]:
X, y, tokenizer = preprocess_text(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train,X_test)

[[   2    1    7 ...   15   10  116]
 [ 263   69   11 ...   37    1   15]
 [8085    2 7567 ...  790    1  766]
 ...
 [   4  382 8263 ...  903    2  356]
 [  11   87  288 ...   99  847  184]
 [  11  211    2 ...    2  165    5]] [[  87    4  121 ...  311  287   11]
 [ 106  178   24 ...    2  819  113]
 [  12   18   14 ...    5    2 1228]
 ...
 [  12   18   14 ...   57 1578    3]
 [  11  190    6 ...    2 8633   15]
 [ 695 4494  231 ...    2   56    3]]


In [16]:
if not os.path.exists("glove"):
    os.makedirs("glove")

In [17]:
# Download GloVe embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip -O glove/glove.6B.zip

--2025-02-19 15:44:50--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-02-19 15:44:50--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-02-19 15:44:51--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove/glove.6B.zip’

In [18]:
# Extract GloVe embeddings
!unzip -o glove/glove.6B.zip -d glove/


Archive:  glove/glove.6B.zip
  inflating: glove/glove.6B.50d.txt  
  inflating: glove/glove.6B.100d.txt  
  inflating: glove/glove.6B.200d.txt  
  inflating: glove/glove.6B.300d.txt  


In [20]:
# Verify the file exists
if os.path.exists("glove/glove.6B.100d.txt"):
    print("Yo Prada GloVe embeddings downloaded successfully!")
else:
    print("No Prada GloVe embeddings file missing!")

Yo Prada GloVe embeddings downloaded successfully!


In [21]:
# Load Pretrained GloVe Embeddings
def load_glove_embeddings():
    embeddings_index = {}
    glove_path = "glove/glove.6B.100d.txt"
    with open(glove_path, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

embeddings_index = load_glove_embeddings()
embedding_matrix = np.zeros((10000, 100))
for word, i in tokenizer.word_index.items():
    if i < 10000:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


In [23]:
# Build LSTM Model
model = Sequential([
    Embedding(10000, 100, weights=[embedding_matrix], input_length=100, trainable=False),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])




In [24]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train Model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=64)


Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 15ms/step - accuracy: 0.6019 - loss: 0.6572 - val_accuracy: 0.7475 - val_loss: 0.5370
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 13ms/step - accuracy: 0.7615 - loss: 0.5067 - val_accuracy: 0.7979 - val_loss: 0.4530
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.7956 - loss: 0.4427 - val_accuracy: 0.8067 - val_loss: 0.4210
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.8183 - loss: 0.4032 - val_accuracy: 0.8151 - val_loss: 0.4043
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.8278 - loss: 0.3857 - val_accuracy: 0.8262 - val_loss: 0.3856


<keras.src.callbacks.history.History at 0x7fe4d63e3b10>

In [25]:
# Prediction Function
def predict_sentiment(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=100, padding='post', truncating='post')
    prediction = model.predict(padded)[0][0]
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    return sentiment, float(prediction)

In [26]:
# Gradio UI
demo = gr.Interface(fn=predict_sentiment, inputs="text", outputs=["text", "number"])

demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c827bf1bd419fe9602.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


