In [10]:
import os
import boto3
from dotenv import find_dotenv, load_dotenv

# Find the project's .env file and load the variables it defines.
path_ = find_dotenv()
load_dotenv(path_)

# AWS settings are read from environment variables.
KEY_ID, ACCESS_KEY, REGION = (
    os.getenv(var_name) for var_name in ("KEY_ID", "ACCESS_KEY", "REGION")
)

# boto3 S3 resource used by later cells to fetch the dataset.
S3 = boto3.resource(
    's3',
    region_name=REGION,
    aws_access_key_id=KEY_ID,
    aws_secret_access_key=ACCESS_KEY,
)



In [11]:
import os 
# S3 bucket / object key of the dataset and the local folder it is cached in.
bckt_nme = "imdb-dataset-11.8"
file_name = "IMDB Dataset.csv"
# A raw string keeps backslashes verbatim, so they must be written once each;
# the previous r"M:\\..." form put doubled separators into the path.
PATH = r"M:\MACHINE LEARNING\github\sentiment analysis"
local_csv_path = os.path.join(PATH, file_name)

os.makedirs(PATH, exist_ok=True)

# Only hit S3 when there is no local copy yet.
if os.path.exists(local_csv_path):
    print("File already exists")
else:
    S3.Bucket(bckt_nme).download_file(file_name, local_csv_path)


File already exists


In [None]:
import pandas as pd 
# Load the raw reviews and report how many rows are exact duplicates.
df = pd.read_csv(os.path.join(PATH, file_name))
print(f"total sample : {df.shape[0]}")
print(f"duplicated : {df.duplicated().sum()}")

# Reassign rather than mutate in place so re-running the cell is predictable.
df = df.drop_duplicates()
# The previous label ("after adding total duplicates") described the opposite
# of what is measured: the number of duplicates left after dropping them.
print(f"duplicates remaining after drop : {df.duplicated().sum()}")

shape: 50000
duplicated : 418
after adding total duplicates : 0


In [15]:
from sklearn.preprocessing import LabelEncoder
# Encode the sentiment column as integer class ids.
encoder = LabelEncoder()
encoder.fit(df['sentiment'])
df['sentiment'] = encoder.transform(df['sentiment'])
df['sentiment'].shape

(49582,)

In [17]:
def make_lower(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
from lxml import html

def remove_tags(text):
    """Strip HTML markup from ``text`` and return only its text content.

    Empty or whitespace-only input is returned as ``""`` without invoking the
    parser, because ``lxml.html.fromstring`` raises ``ParserError``
    ("Document is empty") for such input.
    """
    if not text.strip():
        return ""
    doc = html.fromstring(text)  # parse the fragment into an element tree
    return doc.text_content()  # text of the tree with the tags dropped
import re

# Compiled once instead of on every call. The dot after "www" is escaped so
# that only a literal "www." starts a match; unescaped it matched any
# character and mangled ordinary words such as "awwwsome".
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')


def remove_url(text):
    """Replace every URL in ``text`` with a single space.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character.
    """
    return _URL_PATTERN.sub(" ", text)
import string
puns = string.punctuation
# Translation table that deletes every character in ``puns``; built once here
# rather than on each call.
_PUNCT_TABLE = str.maketrans('', '', puns)


def remove_puns(text):
    """Return ``text`` with every ``string.punctuation`` character removed."""
    return text.translate(_PUNCT_TABLE)

from nltk.corpus import stopwords
import nltk
# Download the stopword corpus only when it is not already installed, the same
# find-then-download pattern the notebook uses for the punkt resources.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# A set makes the per-word membership test in remove_stopwords cheap.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop English stopwords from ``text``.

    Non-string input and empty or whitespace-only strings yield ``""``.
    Words are compared against ``stop_words`` in lower case; the words that
    are kept retain their casing and are joined with single spaces.
    """
    # Anything that is not a non-blank string has nothing to filter.
    if not (isinstance(text, str) and text.strip()):
        return ""
    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

[nltk_data] Downloading package stopwords to C:\Users\Hp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
import nltk
def _ensure_tokenizer_resource(name):
    """Download the NLTK tokenizer resource ``name`` unless it is already installed."""
    try:
        nltk.data.find(f'tokenizers/{name}')
    except LookupError:
        nltk.download(name)


# Tokenizer data, presumably needed by the word_tokenize call used later.
for pkg in ['punkt', 'punkt_tab']:
    _ensure_tokenizer_resource(pkg)


In [19]:
import pandas as pd 
import spacy
from nltk.tokenize import sent_tokenize,word_tokenize



def preprocessor(text):
    """Clean one review and split it into tokens.

    The cleaning steps run in this order: lower-casing, HTML tag removal,
    URL removal, punctuation removal, stopword removal. The cleaned string is
    then passed to ``word_tokenize`` and its result is returned.
    """
    cleaning_steps = (
        make_lower,
        remove_tags,
        remove_url,
        remove_puns,
        remove_stopwords,
    )
    cleaned = text
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return word_tokenize(cleaned)

# Run the cleaning pipeline on every review; each clean_text entry is whatever
# preprocessor returns for that review.
df['clean_text']=df['review'].apply(preprocessor)

In [48]:
import gensim
from gensim.utils import simple_preprocess
from gensim.models  import Word2Vec

# Nested list of cleaned tokens, one inner list per review,
# e.g. [['i', 'love', 'this', 'movie'], ['too', 'bad'], ...]
words = df['clean_text'].tolist()

# One path for the existence check, the load and the save. Previously the
# check and load used this absolute path while the save wrote to the current
# working directory, so a model trained from another directory was never
# found again. The raw string also avoids the invalid "\M", "\g", "\s", "\w"
# escapes of the old non-raw load path.
W2V_MODEL_PATH = r"M:\MACHINE LEARNING\github\sentiment analysis\word2vec_model.model"

if os.path.exists(W2V_MODEL_PATH):
    word2vec_model = Word2Vec.load(W2V_MODEL_PATH)
    print("model loaded")
else:
    # Word2Vec learns word relationships and maps each word to a 100-d vector.
    word2vec_model = Word2Vec(
        vector_size=100,
        window=5,
        min_count=2,
        sg=1,  # 1 selects the skip-gram training algorithm
    )
    word2vec_model.build_vocab(words)

    word2vec_model.train(
        words,
        total_examples=len(words),  # one example per review
        epochs=2,
    )
    word2vec_model.save(W2V_MODEL_PATH)



model loaded


In [51]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Word-level tokenizer limited to 50,000 entries, with "<OOV>" as the
# out-of-vocabulary token.
# Per the Keras defaults it lower-cases text, splits on spaces, strips
# punctuation and counts word frequencies while fitting.
tokenizer = Tokenizer(num_words=50000, oov_token="<OOV>")

# The tokenizer is fitted on one space-joined string per review.
review_strings = [" ".join(tokens) for tokens in words]
tokenizer.fit_on_texts(review_strings)

# Persist the fitted tokenizer as JSON so it can be reloaded later.
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(tokenizer_json)

# Replace every token of every review with its integer id.
seqnce = tokenizer.texts_to_sequences(words)

In [28]:
# Every review is brought to the same length because the model needs
# fixed-size inputs.
#   maxlen=200        -> length of every output sequence
#   padding='pre'     -> shorter reviews get zeros added at the front
#   truncating='post' -> longer reviews lose tokens from the end
x = pad_sequences(seqnce, maxlen=200, padding='pre', truncating='post')

# Target labels from the sentiment column.
y = df['sentiment']

In [29]:
import tensorflow as tf 
from keras.layers import *
from keras.models import Sequential

import numpy as np
# Number of rows in the embedding matrix. The +1 is there because Keras
# Tokenizer indices start at 1 (0 is the padding id); capped at 50,000.
vocab_size = min(50000, len(tokenizer.word_index) + 1)
embedding_dim = 100  # length of each word vector

# Row i holds the Word2Vec vector of the word whose tokenizer index is i.
# Words without a Word2Vec vector keep an all-zero row. embedding_dim is used
# everywhere instead of repeating the literal 100, so the matrix and the layer
# cannot drift apart.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:      # skip indices outside the embedding matrix
        continue
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=200,
    trainable=False  # freeze if you don't want to fine-tune
)



In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


def create_model(embedding_layer):
    """Build, compile and summarise the sentiment classifier.

    The network is ``embedding_layer`` followed by two 128-unit LSTM layers
    (the first returning full sequences), dropout of 0.4, a 64-unit ReLU
    dense layer, dropout of 0.3 and a single sigmoid output unit. It is
    compiled with binary cross-entropy loss, the Adam optimizer and an
    accuracy metric. The model summary is printed before returning.

    Args:
        embedding_layer: layer placed first in the network.

    Returns:
        The compiled ``Sequential`` model.
    """
    sentiment_model = Sequential()
    for layer in (
        embedding_layer,
        LSTM(128, return_sequences=True),
        LSTM(128, return_sequences=False),
        Dropout(0.4),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid'),
    ):
        sentiment_model.add(layer)

    sentiment_model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    sentiment_model.summary()
    return sentiment_model


# Keep a reference to the compiled model. The return value used to be
# discarded, which left `sentiment_model` (the name the training code calls
# .fit() and .save() on) undefined.
sentiment_model = create_model(embedding_layer)


<Sequential name=sequential_1, built=False>

In [64]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)


In [32]:
import tensorflow as tf
import os

# Project folder; checkpoints live in a "checkpoints" sub-folder of it.
PATH = r"M:\MACHINE LEARNING\github\sentiment analysis"

# Derived from PATH instead of repeating the literal, and created up front
# (this also creates PATH itself) so saving never depends on the callback
# creating the folder.
checkpoint_dir = os.path.join(PATH, "checkpoints")
os.makedirs(checkpoint_dir, exist_ok=True)

# Filepath must end with .weights.h5 when save_weights_only=True
checkpoint_path = os.path.join(checkpoint_dir, "cp-{epoch:04d}.weights.h5")

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
    save_freq='epoch'  # Save after every epoch
)

In [None]:
# sentiment_model.fit(x_train, 
#                     y_train, 
#                     epochs=10, 
#                     batch_size=128, 
#                     validation_split=0.2,
#                     callbacks=[cp_callback]
#                     )
# sentiment_model.save("new_model.h5")

In [None]:
from tensorflow import keras
import h5py
# Check whether the saved model file at this path is an HDF5 file.
h5py.is_hdf5(r"M:\MACHINE LEARNING\github\sentiment analysis\new_model.h5")

In [None]:
# Load the saved Keras model from disk; later cells evaluate it and use it for prediction.
test_model = keras.models.load_model(r"M:\MACHINE LEARNING\github\sentiment analysis\new_model.h5")

In [None]:
# Evaluate on test set

loss, acc = test_model.evaluate(x_test, y_test)
# A loss value is not a ratio, so it is printed as-is; only the accuracy is
# scaled to a percentage.
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {acc * 100:.2f}%")

[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 127ms/step - accuracy: 0.8894 - loss: 0.2822
Test Loss: 28.22%
Test Accuracy: 88.94%


In [55]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import json
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Sample review used to try out the prediction helper
text = "great movie"

def test_case(text):
    """Predict the sentiment score of one raw review string.

    The review goes through the same steps as the training data: the
    ``preprocessor`` cleaning pipeline, the tokenizer saved in
    ``tokenizer.json``, and padding/truncation to 200 ids.

    Args:
        text: the raw review.

    Returns:
        Whatever ``test_model.predict`` returns for the single padded review.
    """
    # Reload the tokenizer that was fitted on the training reviews.
    with open('tokenizer.json', 'r', encoding='utf-8') as f:
        loaded_tokenizer = tokenizer_from_json(f.read())

    # The training sequences were built from preprocessor output (lower-cased,
    # tags/URLs/punctuation/stopwords removed), so the same cleaning is applied
    # here. Blank input skips the pipeline and becomes an empty token list.
    tokens = preprocessor(text) if text.strip() else []
    seq = loaded_tokenizer.texts_to_sequences([tokens])

    # Same padding settings as the training data; truncating='post' is stated
    # explicitly because the pad_sequences default is 'pre'.
    padded = pad_sequences(seq, maxlen=200, padding='pre', truncating='post')

    # Predict
    pred = test_model.predict(padded)
    return pred

# Capture the prediction. The result used to be discarded while the check
# below read a module-level `pred` that this cell never defined, so it failed
# on a fresh kernel or silently used a stale value.
pred = test_case(text)

# Scores above 0.5 are reported as positive.
if pred > 0.5:
    print("Positive Review ")
else:
    print("Negative Review ")





[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238ms/step
Negative Review 


In [58]:
# Write the installed package versions reported by pip to requirements.txt.
!pip freeze > requirements.txt