In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [11]:
# Load the IMDB dataset (replace 'IMDB Dataset.csv' with the actual path)
data = pd.read_csv('IMDB Dataset.csv')

In [12]:
data


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
495,"""American Nightmare"" is officially tied, in my...",negative
496,"First off, I have to say that I loved the book...",negative
497,This movie was extremely boring. I only laughe...,negative
498,I was disgusted by this movie. No it wasn't be...,negative


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     500 non-null    object
 1   sentiment  500 non-null    object
dtypes: object(2)
memory usage: 7.9+ KB


In [14]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [15]:
data.tail()

Unnamed: 0,review,sentiment
495,"""American Nightmare"" is officially tied, in my...",negative
496,"First off, I have to say that I loved the book...",negative
497,This movie was extremely boring. I only laughe...,negative
498,I was disgusted by this movie. No it wasn't be...,negative
499,Such a joyous world has been created for us in...,positive


In [16]:
data.columns

Index(['review', 'sentiment'], dtype='object')

In [17]:
data['sentiment'].value_counts()

negative    263
positive    237
Name: sentiment, dtype: int64

In [18]:
# Preprocessing the data (removing the stopwords)
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
  tokens = word_tokenize(text)
  filtered_text = [word for word in tokens if word.lower() not in stop_words]
  return ' '.join(filtered_text)

data['review'] = data['review'].apply(remove_stopwords)

In [19]:
# Preprocess the text data 
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    filtered_text = [word for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(filtered_text)

data['review'] = data['review'].apply(preprocess_text)

In [20]:
# Creating a list of positive reviews
positive_reviews = data[data['sentiment'] == 'positive']['review'].tolist()

# Joining all the positive reviews into a single string
all_positive_text = ' '.join(positive_reviews)

# Splitting the string into a list of words
words_in_positive_reviews = all_positive_text.split()

# Printing the list of words
print(words_in_positive_reviews)



In [21]:
word_counts = Counter(words_in_positive_reviews)

In [22]:
# Creating a list of negative reviews
negative_reviews = data[data['sentiment'] == 'negative']

# Combine all negative reviews into a single string
all_negative_reviews_text = ' '.join(negative_reviews['review'])

# Split the string into a list of words
words_in_negative_reviews = all_negative_reviews_text.split()

# Print the list of words
print(words_in_negative_reviews)



In [23]:
word_counts = Counter(words_in_negative_reviews)

In [24]:
# Convert text data to numerical features using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['review'])

In [28]:
import matplotlib
matplotlib.use('TkAgg')  # or another interactive backend like 'Qt5Agg'
import matplotlib.pyplot as plt

# Determine the optimal number of clusters using the elbow method
inertia = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=0)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)

# Plot the elbow method graph
plt.plot(range(1, 11), inertia, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.show()

In [29]:
# Apply K-Means clustering with 2 clusters (positive and negative)
num_clusters = 2
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
data['predicted_cluster'] = kmeans.fit_predict(X)

In [30]:
# Map clusters to sentiment labels (assuming cluster 0 is negative and cluster 1 is positive)
mapping = {0: 'negative', 1: 'positive'}
data['predicted_sentiment'] = data['predicted_cluster'].map(mapping)

In [31]:
# Evaluate the model using accuracy score
accuracy = accuracy_score(data['sentiment'], data['predicted_sentiment'])
print(f"Accuracy Score: {accuracy}")

Accuracy Score: 0.58


In [32]:
# Analyze the clusters (optional)
for i in range(num_clusters):
    cluster_words = vectorizer.get_feature_names_out()[data[data['predicted_cluster'] == i].index]
    print(f"Top words in cluster {i}: {', '.join(cluster_words[:10])}")

Top words in cluster 0: 12, 16, 177, 1800s, 18th, 19, 1908, 1914, 1920, 1931
Top words in cluster 1: 007, 10, 100, 1000, 101, 102, 103, 105, 11, 13


In [33]:
# Function to predict sentiment of new reviews
def predict_sentiment(new_reviews):
    # Preprocess the new reviews
    preprocessed_reviews = [preprocess_text(review) for review in new_reviews]

    # Transform the preprocessed reviews using the trained vectorizer
    new_X = vectorizer.transform(preprocessed_reviews)

    # Predict the clusters for the new reviews
    predicted_clusters = kmeans.predict(new_X)

    # Map the predicted clusters to sentiment labels
    predicted_sentiments = [mapping[cluster] for cluster in predicted_clusters]

    return predicted_sentiments

# Example usage:
new_reviews = [
    "This movie was absolutely terrible. The acting was awful, the plot was nonsensical, and I wasted two hours of my life.",
    "I loved this movie! It was so funny and heartwarming. The characters were relatable, and the story kept me engaged from beginning to end."
]

predicted_sentiments = predict_sentiment(new_reviews)
for review, sentiment in zip(new_reviews, predicted_sentiments):
    print(f"Review: {review}\nPredicted Sentiment: {sentiment}\n")

# Evaluate the model using accuracy score
accuracy = accuracy_score(data['sentiment'], data['predicted_sentiment'])
print(f"Accuracy Score: {accuracy}")


Review: This movie was absolutely terrible. The acting was awful, the plot was nonsensical, and I wasted two hours of my life.
Predicted Sentiment: negative

Review: I loved this movie! It was so funny and heartwarming. The characters were relatable, and the story kept me engaged from beginning to end.
Predicted Sentiment: negative

Accuracy Score: 0.58
