In [1]:
import pandas as pd

# Load the cleaned review dataset from its CSV file.
# NOTE(review): this is an absolute, machine-specific path — consider deriving
# it from a configurable data directory so the notebook runs elsewhere.
cleaned_csv_path = "C:/Users/a2z/OneDrive/Desktop/sentiment-analysis-project/movie_sentiment-analysis/data/processed/cleaned_data.csv"
df = pd.read_csv(cleaned_csv_path)

# Print the first rows, then the number of reviews per sentiment label.
for summary in (df.head(), df['sentiment'].value_counts()):
    print(summary)


                                              review sentiment
0  one of the other reviewers has mentioned that ...  positive
1  a wonderful little production the filming tech...  positive
2  i thought this was a wonderful way to spend ti...  positive
3  basically there s a family where a little boy ...  negative
4  petter mattei s love in the time of money is a...  positive
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [2]:
# Encode the string sentiment labels as integers: negative -> 0, positive -> 1.
label_to_id = {
    'negative': 0,
    'positive': 1
}

# Re-running this cell on an already-encoded column would map 0/1 through the
# string-keyed dict and turn every label into NaN (which would then be written
# to the CSV below). Only apply the mapping while the column is still unencoded.
already_encoded = df['sentiment'].isin(list(label_to_id.values())).all()
if not already_encoded:
    encoded = df['sentiment'].map(label_to_id)
    # Series.map yields NaN for any value missing from the dict; fail loudly
    # instead of silently carrying NaN labels into the split/training steps.
    if encoded.isna().any():
        unexpected = df.loc[encoded.isna(), 'sentiment'].unique().tolist()
        raise ValueError(f"Unexpected sentiment labels: {unexpected}")
    df['sentiment'] = encoded

# Show the distinct encoded values as a quick sanity check.
print(df['sentiment'].unique())

# Persist the encoded frame next to the cleaned data (row index is not written).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df.to_csv("C:/Users/a2z/OneDrive/Desktop/sentiment-analysis-project/movie_sentiment-analysis/data/processed/encoded_data.csv", index=False)
print("✅ Encoded data saved")

[1 0]
✅ Encoded data saved


In [3]:
from sklearn.model_selection import train_test_split

# Features are the review texts; targets are the sentiment column.
X, y = df['review'], df['sentiment']

# Hold out 20% of the rows for testing. The split is stratified on the labels
# and uses a fixed random_state so repeated runs give the same partition.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the shape of each feature partition.
for caption, part in (("Train size:", X_train), ("Test size:", X_test)):
    print(caption, part.shape)


Train size: (40000,)
Test size: (10000,)


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: at most 5000 features, unigrams and bigrams, and the
# built-in 'english' stop-word list.
# NOTE(review): the vocabulary check later in this notebook reports that
# "not good" is not a feature. This may be caused by the 'english' stop list
# dropping negation words such as "not" — confirm, and consider a custom stop
# list if negated bigrams matter for sentiment.
vectorizer_params = {
    "max_features": 5000,
    "ngram_range": (1, 2),
    "stop_words": 'english',
}
tfidf = TfidfVectorizer(**vectorizer_params)

# Fit on the training reviews only; the test reviews are transformed with the
# already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Print the shape of each resulting matrix.
for matrix in (X_train_tfidf, X_test_tfidf):
    print(matrix.shape)


(40000, 5000)
(10000, 5000)


In [5]:
import pickle
import os

# Make sure the output folder exists (no error if it is already there).
# NOTE(review): "models" is relative to the current working directory, unlike
# the absolute data paths used when loading/saving the CSVs — confirm intended.
os.makedirs("models", exist_ok=True)

# Serialize the fitted vectorizer to disk with pickle.
with open("models/tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

print("TF-IDF Vectorizer saved successfully!")


TF-IDF Vectorizer saved successfully!


In [6]:
# Peek at the first 20 feature names exposed by the fitted vectorizer.
feature_preview = tfidf.get_feature_names_out()[:20]
print(feature_preview)


['abandoned' 'abc' 'abilities' 'ability' 'able' 'absence' 'absolute'
 'absolutely' 'absurd' 'abuse' 'abysmal' 'academy' 'academy award'
 'accent' 'accents' 'accept' 'acceptable' 'accepted' 'accident'
 'accidentally']


In [7]:
# Check whether a few specific unigrams/bigrams are in the fitted vocabulary.

features = tfidf.get_feature_names_out()

# One True/False line is printed per probe term, in this order.
for probe in ("acting", "excellent", "bad movie", "not good"):
    print(probe in features)


True
True
True
False


In [8]:
import numpy as np

# Mean TF-IDF weight of every feature across the training matrix, flattened
# to a 1-D array.
column_means = X_train_tfidf.mean(axis=0)
avg_tfidf = np.asarray(column_means).ravel()

# Indices of the 20 largest means, highest first: sort ascending, reverse the
# whole order, then keep the leading 20.
top_indices = avg_tfidf.argsort()[::-1][:20]

# Translate the indices back to feature names.
# `features` is the name array obtained from tfidf.get_feature_names_out() in
# the previous cell.
top_words = [features[idx] for idx in top_indices]
print(top_words)


['movie', 'film', 'like', 'just', 'good', 'really', 'story', 'time', 'bad', 'great', 'people', 'don', 'movies', 'watch', 'think', 'make', 'seen', 'characters', 'way', 'acting']
