In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, GRU
from tensorflow.keras.models import save_model
import warnings
warnings.filterwarnings('ignore')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df1 = pd.read_csv('/home/jupyter/shivam/out_work/Train.csv')
df2 = pd.read_csv('/home/jupyter/shivam/out_work/Test.csv')

In [4]:
df = pd.concat([df1,df2],axis=0)

In [5]:
df.shape

(45000, 2)

In [6]:
df.head(2)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0


In [7]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [8]:
def clean_text(text):
    # Lowercase text
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back to string
    cleaned_text = ' '.join(tokens)
    return cleaned_text

In [9]:
df['cleaned_text'] = df['text'].apply(clean_text)

In [10]:
df['cleaned_text'][0]

0    grew b 1965 watching loving thunderbirds mates...
0    always wrote series complete stinkfest jim bel...
Name: cleaned_text, dtype: object

In [11]:
texts = df['cleaned_text'].values
labels = df['label'].values

In [12]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.1, random_state=42)

In [13]:
# Tokenization
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

In [14]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [15]:
X_train_pad = pad_sequences(X_train_seq, maxlen=100)
X_test_pad = pad_sequences(X_test_seq, maxlen=100)

In [16]:
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=100))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

In [17]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [18]:
model.fit(X_train_pad, y_train, epochs=3, batch_size=64, validation_split=0.2)

Epoch 1/3
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 155ms/step - accuracy: 0.7532 - loss: 0.4769 - val_accuracy: 0.8790 - val_loss: 0.2937
Epoch 2/3
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 149ms/step - accuracy: 0.9213 - loss: 0.2051 - val_accuracy: 0.8744 - val_loss: 0.3005
Epoch 3/3
[1m507/507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 151ms/step - accuracy: 0.9518 - loss: 0.1320 - val_accuracy: 0.8632 - val_loss: 0.3453


<keras.src.callbacks.history.History at 0x7f1aa03ef370>

In [19]:
def analyze_sentiments(texts):
    # Clean texts
    cleaned_texts = [clean_text(text) for text in texts]
    
    # Preprocess texts
    text_seqs = tokenizer.texts_to_sequences(cleaned_texts)
    text_pads = pad_sequences(text_seqs, maxlen=100)
    
    # Predict sentiment for all texts
    predictions = model.predict(text_pads)
    
    # Interpret predictions
    sentiments = ['positive' if pred[0] > 0.5 else 'negative' for pred in predictions]
    scores = [pred[0] for pred in predictions]
    
    return sentiments, scores


In [22]:
# Apply batch processing to the DataFrame
texts = df_val['text'].tolist()

In [23]:
sentiments, scores = analyze_sentiments(texts)
dft1 = pd.DataFrame({'sentiment': sentiments, 'score': scores})

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 39ms/step


In [28]:
df_val.head(2)

Unnamed: 0,text,label,sentiment,score,prediction
0,It's been about 14 years since Sharon Stone aw...,0,negative,0.002972,0
1,someone needed to make a car payment... this i...,0,negative,0.001288,0


In [27]:
df_val['prediction'] = df_val.sentiment.map({'positive':1,'negative':0})

In [29]:
from sklearn.metrics import classification_report

In [30]:
# Generate the classification report
report = classification_report(df_val['label'], df_val['prediction'])

In [31]:
print(report)

              precision    recall  f1-score   support

           0       0.89      0.85      0.87      2486
           1       0.86      0.89      0.88      2514

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000



In [21]:
df_val = pd.read_csv('/home/jupyter/shivam/out_work/Valid.csv')

In [43]:
dft.tail(3)

Unnamed: 0,text,label
4997,"You can take the crook out of the joint, but i...",1
4998,FUTZ is the only show preserved from the exper...,1
4999,"""The Mother"" tells of a recently widowed mid-6...",1


In [35]:
df_val = dft.tail(3)

In [38]:
df_val

Unnamed: 0,text,label,sentiment,score
4997,"You can take the crook out of the joint, but i...",1,positive,0.999078
4998,FUTZ is the only show preserved from the exper...,1,positive,0.997816
4999,"""The Mother"" tells of a recently widowed mid-6...",1,positive,0.999702


In [25]:
model.save('/home/jupyter/shivam/out_work/sentiment_model.h5')

