# Libraries

In [None]:
!pip install nlpaug
!pip install tqdm
!pip install time

from tqdm import tqdm
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk import word_tokenize
from nltk.corpus import stopwords

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11
[31mERROR: Could not find a version that satisfies the requirement time (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for time[0m[31m
[0m

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# Loading Dataset

In [None]:
# Import the IMDB dataset containing 50K reviews.
# NOTE(review): the absolute path looks like a Colab Google Drive mount —
# confirm the Drive is mounted (or make the path configurable) before running.

imdb_data=pd.read_csv('/content/drive/MyDrive/IMDB Dataset.csv')
print(imdb_data.shape)
imdb_data.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Summarize the column dtypes and non-null counts of the raw dataset.
imdb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
# Descriptive statistics (count / unique / top / freq) for the dataset columns.
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [None]:
# Count the missing values in each column.
imdb_data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [None]:
# Class distribution of the sentiment labels in the raw dataset.
imdb_data['sentiment'].value_counts()


sentiment
positive    25000
negative    25000
Name: count, dtype: int64

# Creating class imbalance to replicate real-world scenarios

In [None]:
# Partition the full dataset by sentiment label (the labels are the strings
# 'positive' / 'negative', not 1 / 0)
is_positive = imdb_data['sentiment'] == 'positive'
is_negative = imdb_data['sentiment'] == 'negative'
positive_reviews = imdb_data[is_positive]
negative_reviews = imdb_data[is_negative]

# Abort early when either class is too small for the requested sample sizes
enough_positive = len(positive_reviews) >= 10000
enough_negative = len(negative_reviews) >= 20000
if not (enough_positive and enough_negative):
    raise ValueError("Not enough positive or negative reviews in the dataset.")

# Training set: 10K positive and 20K negative reviews drawn at random
train_positive_reviews = positive_reviews.sample(n=10000, random_state=42)
train_negative_reviews = negative_reviews.sample(n=20000, random_state=42)
train_data = pd.concat([train_positive_reviews, train_negative_reviews])

# Test set: every review that was not sampled for training
remaining_positive_reviews = positive_reviews.drop(train_positive_reviews.index)
remaining_negative_reviews = negative_reviews.drop(train_negative_reviews.index)
test_data = pd.concat([remaining_positive_reviews, remaining_negative_reviews])

# Shuffle both splits, renumber their rows, and persist them as CSV files
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
test_data = test_data.sample(frac=1, random_state=42).reset_index(drop=True)
train_data.to_csv('imdb_reviews_train.csv', index=False)
test_data.to_csv('imdb_reviews_test.csv', index=False)

In [None]:
# Sanity check: number of rows in the imbalanced training set.
print(train_data['sentiment'].shape)


(30000,)


In [None]:
# Sanity check: sizes of the sampled positive and negative training subsets.
print(train_positive_reviews.shape)
print(train_negative_reviews.shape)

(10000, 2)
(20000, 2)


# Pre Processing

In [None]:
from tqdm import tqdm
import time
import re,string

# Reload the training split from the CSV file written when the split was created.
train_data = pd.read_csv('imdb_reviews_train.csv')
from nltk.stem import WordNetLemmatizer

# Define a function for text preprocessing
# Lazily-built resources shared by every preprocess_text call. Building the
# stop-word set and the lemmatizer once avoids reloading the stop-word list
# for every token and re-creating the lemmatizer for every review.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean a raw review and return it as a space-separated string of tokens.

    Steps: strip HTML tags, drop every character that is not an ASCII letter
    or whitespace, tokenize, lower-case, remove English stop words, lemmatize.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup such as ``<br />``.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces (empty string
        when nothing survives the filtering).
    """
    stop_words, lemmatizer = _get_preprocess_resources()
    # Replace HTML tags with a space (not the empty string) so that words
    # separated only by a tag, e.g. "end.<br /><br />The", are not fused
    # into a single bogus token.
    text = re.sub(r'<.*?>', ' ', text)
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize text and convert to lower case
    tokens = [word.lower() for word in word_tokenize(text)]
    # Remove stopwords (set membership test) and lemmatize the survivors
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Apply preprocessing to the review column
# Clean the review text of both splits with the same preprocessing function
for split_frame in (train_data, test_data):
    split_frame['review'] = split_frame['review'].apply(preprocess_text)

# Text augmentation (balancing the training set)

In [None]:
import pandas as pd
import nlpaug.augmenter.word as naw

# The preprocessed training frame is reused from memory; report its size
total_loaded = len(train_data)
print(f"Total reviews loaded: {total_loaded}")

# Partition the training data by sentiment label
sentiment_column = train_data['sentiment']
positive_reviews = train_data[sentiment_column == 'positive']
negative_reviews = train_data[sentiment_column == 'negative']

# Report the class sizes before balancing
positive_count = len(positive_reviews)
negative_count = len(negative_reviews)
print(f"Positive reviews: {positive_count}")
print(f"Negative reviews: {negative_count}")

# Word-level synonym augmenter backed by WordNet
aug = naw.SynonymAug(aug_src='wordnet')

# Function to augment text
def augment_text(text, augmenter, num_aug=1):
    """Generate augmented variants of ``text`` as a flat list of strings.

    Parameters
    ----------
    text : str
        The text to augment.
    augmenter : object
        Any object exposing ``augment(text)``. The call may return either a
        single string or a list/tuple of strings.
    num_aug : int, default 1
        Number of times ``augmenter.augment`` is invoked.

    Returns
    -------
    list
        Every augmented string produced, in call order. List results are
        flattened so that callers never receive nested lists (which would end
        up as list-valued cells in a DataFrame and as list reprs in a CSV).
    """
    augmented_texts = []
    for _ in range(num_aug):
        result = augmenter.augment(text)
        if isinstance(result, (list, tuple)):
            # Augmenter returned a batch of strings: flatten it into the output
            augmented_texts.extend(result)
        else:
            augmented_texts.append(result)
    return augmented_texts

# Number of positive reviews needed to match the number of negative reviews
# Size of the gap between the majority (negative) and minority (positive) class
num_positive_reviews_needed = len(negative_reviews) - len(positive_reviews)
print(f"Number of positive reviews needed: {num_positive_reviews_needed}")

# Walk the positive reviews, generating one augmented copy of each, and stop
# as soon as the gap has been filled
augmented_reviews = []
for _, positive_row in positive_reviews.iterrows():
    if len(augmented_reviews) >= num_positive_reviews_needed:
        break
    augmented_reviews.extend(
        {'review': variant, 'sentiment': 'positive'}
        for variant in augment_text(positive_row['review'], aug, num_aug=1)
    )

# Tabulate the synthetic reviews and report how many were produced
augmented_reviews_df = pd.DataFrame(augmented_reviews)
print(f"Number of augmented reviews: {len(augmented_reviews_df)}")

# Original positives followed by their augmented copies
positive_reviews_augmented = pd.concat([positive_reviews, augmented_reviews_df])
print(f"Total positive reviews after augmentation: {len(positive_reviews_augmented)}")

# Add the negatives back to obtain the balanced training set
train_data_balanced = pd.concat([positive_reviews_augmented, negative_reviews])
print(f"Total reviews in the balanced dataset: {len(train_data_balanced)}")

# Shuffle so the classes are mixed, renumber the rows, then persist to CSV
train_data_balanced = train_data_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
train_data_balanced.to_csv('imdb_reviews_train_balanced.csv', index=False)

print("Training dataset balanced and saved as 'imdb_reviews_train_balanced.csv'")


Total reviews loaded: 30000
Positive reviews: 10000
Negative reviews: 20000
Number of positive reviews needed: 10000


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Number of augmented reviews: 10000
Total positive reviews after augmentation: 20000
Total reviews in the balanced dataset: 40000
Training dataset balanced and saved as 'imdb_reviews_train_balanced.csv'


In [None]:
# NOTE(review): this bare `.shape` expression is not the last line of the
# cell, so its value is never displayed — wrap it in print() if it is wanted.
train_data_balanced.shape
# Confirm which sentiment labels are present in the balanced training set.
unique_labels_train = np.unique(train_data_balanced['sentiment'])
print("Unique labels in augmented_table:", unique_labels_train)

Unique labels in augmented_table: ['negative' 'positive']


# Model training

**TF-IDF** converts a collection of text documents into a matrix of TF-IDF features.

In [None]:
# Flag the rows whose review is a Python list rather than a string
is_list_review = train_data_balanced['review'].apply(type) == list
list_reviews = train_data_balanced[is_list_review]
print(f"Number of reviews that are lists: {len(list_reviews)}")

# Normalise the sentiment labels to plain strings ahead of label encoding
train_data_balanced['sentiment'] = train_data_balanced['sentiment'].astype(str)


def _join_if_list(value):
    """Join list-valued reviews with spaces; return any other value unchanged."""
    if type(value) == list:
        return ' '.join(value)
    return value


# Flatten list-valued reviews into strings, but only when at least one exists
if len(list_reviews) > 0:
    train_data_balanced['review'] = train_data_balanced['review'].apply(_join_if_list)


from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Features (review text) and targets (sentiment strings) for both splits
reviews, labels = train_data_balanced['review'], train_data_balanced['sentiment']
reviews_test, labels_test = test_data['review'], test_data['sentiment']

# TF-IDF vectorizer capped at 5000 features (adjust max_features as needed).
# It is fitted on the training reviews only; the test reviews are transformed
# with the already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(reviews)
X_test_tfidf = tfidf_vectorizer.transform(reviews_test)

# Report the shapes of the two TF-IDF matrices
print(f"Training data shape: {X_train_tfidf.shape}")
print(f"Test data shape: {X_test_tfidf.shape}")

# Encode the string labels as integers; the encoder is fitted on the training
# labels and then reused for the test labels
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)
labels_test_encoded = label_encoder.transform(labels_test.astype(str))

# Fit a logistic-regression classifier on the TF-IDF features
model = LogisticRegression(solver='liblinear').fit(X_train_tfidf, labels_encoded)

# Evaluate the model on the test set
predictions = model.predict(X_test_tfidf)
accuracy = accuracy_score(labels_test_encoded, predictions)
print(f"Test accuracy: {accuracy}")








Number of reviews that are lists: 10000
Training data shape: (40000, 5000)
Test data shape: (20000, 5000)
Test accuracy: 0.8729


In [None]:
import numpy as np

# Assuming predictions and labels_test are defined as numpy arrays or pandas series

# Distinct values found in the predictions, the raw test labels and the
# encoded training labels (computed in that order)
unique_labels_predictions, unique_labels_test, unique_labels_train = (
    np.unique(values) for values in (predictions, labels_test, labels_encoded)
)

# Print them: encoded training labels first, then predictions, then test labels
print("Unique labels in unique_labels_train:", unique_labels_train)
print("Unique labels in predictions:", unique_labels_predictions)
print("Unique labels in test data:", unique_labels_test)

Unique labels in unique_labels_train: [0 1]
Unique labels in predictions: [0 1]
Unique labels in test data: ['negative' 'positive']


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay,classification_report
import matplotlib.pyplot as plt

# classification_report assigns target_names to the labels in sorted order
# (0, 1). LabelEncoder sorts its classes alphabetically, so 0 is 'negative'
# and 1 is 'positive'; a hard-coded ['Positive', 'Negative'] list would swap
# the row names. Deriving the names from the fitted encoder keeps each row
# attached to the class it actually describes.
report_target_names = [str(name).capitalize() for name in label_encoder.classes_]
lr_tfidf_report=classification_report(labels_test_encoded, predictions,target_names=report_target_names)
print(lr_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.69      0.89      0.78      5000
    Negative       0.96      0.87      0.91     15000

    accuracy                           0.87     20000
   macro avg       0.83      0.88      0.84     20000
weighted avg       0.89      0.87      0.88     20000

