In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('test.csv')

In [3]:
df.head()

Unnamed: 0,text,sentiment
0,"My daughter liked it but I was aghast, that a ...",neg
1,I... No words. No words can describe this. I w...,neg
2,this film is basically a poor take on the old ...,neg
3,"This is a terrible movie, and I'm not even sur...",neg
4,First of all this movie is a piece of reality ...,pos


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       25000 non-null  object
 1   sentiment  25000 non-null  object
dtypes: object(2)
memory usage: 390.8+ KB


In [5]:
df.shape

(25000, 2)

In [6]:
df.isnull().sum()

Unnamed: 0,0
text,0
sentiment,0


In [7]:
df['sentiment'].unique()

array(['neg', 'pos'], dtype=object)

In [8]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
neg,12500
pos,12500


### Data Preprocessing

In [9]:
# prompt: apply label encoding on sentiment colunm

from sklearn.preprocessing import LabelEncoder
import pickle

le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])
df['sentiment'].value_counts()



with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)



In [10]:
df['text'] = df['text'].str.lower()

In [11]:
# prompt: do data splitting code

from sklearn.model_selection import train_test_split

X = df['text']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [12]:

from sklearn.preprocessing import LabelEncoder
import re
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Removing special characters, URLs, HTML tags, and extra spaces from both train and test sets
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
    text = re.sub(r'(http|https|ftp)://[a-zA-Z0-9./]+', '', text)
    text = BeautifulSoup(text, 'lxml').get_text()
    text = " ".join(text.split())
    return text

X_train = X_train.apply(preprocess_text)
X_test = X_test.apply(preprocess_text)

# Remove stopwords from both train and test sets AFTER splitting
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])

X_train = X_train.apply(remove_stopwords)
X_test = X_test.apply(remove_stopwords)


print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


X_train shape: (20000,)
y_train shape: (20000,)
X_test shape: (5000,)
y_test shape: (5000,)


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=3000, stop_words='english',ngram_range=(1, 2)) # Limiting features for demonstration

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data (using the vocabulary learned from the training data)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"X_train_tfidf shape: {X_train_tfidf.shape}")
print(f"X_test_tfidf shape: {X_test_tfidf.shape}")


# Save vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

X_train_tfidf shape: (20000, 3000)
X_test_tfidf shape: (5000, 3000)


In [19]:

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

models = {
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression()
}

for model_name, model in models.items():
    model.fit(X_train_tfidf, y_train)

    y_train_pred = model.predict(X_train_tfidf)
    y_test_pred = model.predict(X_test_tfidf)

    # Training Set Performance
    acc_score = accuracy_score(y_train, y_train_pred)
    f1 = f1_score(y_train, y_train_pred, average='weighted')
    pr = precision_score(y_train, y_train_pred, average='weighted')
    rs = recall_score(y_train, y_train_pred, average='weighted')
    ras = roc_auc_score(y_train, y_train_pred, average='weighted')

    # Test Set Performance
    acc_score1 = accuracy_score(y_test, y_test_pred)
    f11 = f1_score(y_test, y_test_pred, average='weighted')
    pr1 = precision_score(y_test, y_test_pred, average='weighted')
    rs1 = recall_score(y_test, y_test_pred, average='weighted')
    ras1 = roc_auc_score(y_test, y_test_pred, average='weighted')

    print(model_name)

    print('Training Set Performance')
    print('accuracy_score {:.4f}'.format(acc_score))
    print('f1_score {:.4f}'.format(f1))
    print('precision_score {:.4f}'.format(pr))
    print('recall_score {:.4f}'.format(rs))
    print('roc_auc_score {:.4f}'.format(ras))

    print('--------------------------')

    print('Test Set Performance')
    print('accuracy_score {:.4f}'.format(acc_score1))
    print('f1_score {:.4f}'.format(f11))
    print('precision_score {:.4f}'.format(pr1))
    print('recall_score {:.4f}'.format(rs1))
    print('roc_auc_score {:.4f}'.format(ras1))

    print('=' * 35)
    print('\n')


import pickle

for model_name, model in models.items():
    # Save the trained model
    filename = f'{model_name.lower().replace(" ", "_")}_model.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model saved to: {filename}")

SVM
Training Set Performance
accuracy_score 0.9828
f1_score 0.9827
precision_score 0.9828
recall_score 0.9828
roc_auc_score 0.9827
--------------------------
Test Set Performance
accuracy_score 0.8812
f1_score 0.8811
precision_score 0.8820
recall_score 0.8812
roc_auc_score 0.8812


Logistic Regression
Training Set Performance
accuracy_score 0.9126
f1_score 0.9126
precision_score 0.9128
recall_score 0.9126
roc_auc_score 0.9126
--------------------------
Test Set Performance
accuracy_score 0.8838
f1_score 0.8838
precision_score 0.8844
recall_score 0.8838
roc_auc_score 0.8838


Model saved to: svm_model.pkl
Model saved to: logistic_regression_model.pkl


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import pickle

# Extended parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

# Grid Search
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Fit
print("üîç Tuning Logistic Regression...")
grid_search.fit(X_train_tfidf, y_train)

# Best model
best_model = grid_search.best_estimator_
print("‚úÖ Best Params:", grid_search.best_params_)

# Evaluate
y_pred = best_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("üéØ Test Accuracy after tuning:", accuracy)

# Save
with open("best_lr_model.pkl", "wb") as f:
    pickle.dump(best_model, f)


üîç Tuning Logistic Regression...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
‚úÖ Best Params: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
üéØ Test Accuracy after tuning: 0.8832


In [None]:
# import streamlit as st
# import pickle
# import re
# from bs4 import BeautifulSoup
# from nltk.corpus import stopwords
# import nltk

# # Download stopwords
# nltk.download('stopwords')

# # Load resources
# with open("logistic_regression_model.pkl", "rb") as model_file:
#     model = pickle.load(model_file)

# with open("tfidf_vectorizer.pkl", "rb") as vec_file:
#     vectorizer = pickle.load(vec_file)

# with open("label_encoder.pkl", "rb") as le_file:
#     label_encoder = pickle.load(le_file)

# # Preprocessing function
# def preprocess(text):
#     text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
#     text = re.sub(r'(http|https|ftp)://[a-zA-Z0-9./]+', '', text)
#     text = BeautifulSoup(text, 'lxml').get_text()
#     text = " ".join(text.split())
#     text = " ".join([word for word in text.lower().split() if word not in stopwords.words('english')])
#     return text

# # App UI config
# st.set_page_config(page_title="üé¨ Movie Sentiment Analyzer", layout="centered")

# st.markdown(
#     """
#     <style>
#     .main {
#         background-color: #f9f9f9;
#         font-family: 'Segoe UI', sans-serif;
#     }
#     .title {
#         color: #1f77b4;
#         text-align: center;
#     }
#     .footer {
#         text-align: center;
#         font-size: 12px;
#         color: #888;
#         margin-top: 50px;
#     }
#     </style>
#     """,
#     unsafe_allow_html=True
# )

# # Title
# st.markdown("<h1 class='title'>üé• Movie Review Sentiment Analyzer</h1>", unsafe_allow_html=True)
# st.write("Write a review for your favorite movie and check if the sentiment is **Positive** or **Negative**!")

# # Movie selection
# movies = [
#     "Inception", "Titanic", "Interstellar", "The Godfather", "The Dark Knight",
#     "Forrest Gump", "The Shawshank Redemption", "Fight Club", "Avengers: Endgame", "Joker"
# ]
# selected_movie = st.selectbox("üé¨ Select a Movie", movies)

# # Review input
# user_review = st.text_area(f"üìù Write your review for *{selected_movie}*", height=200)

# # Predict button
# if st.button("üîç Analyze Sentiment"):
#     if user_review.strip() == "":
#         st.warning("üö® Please enter a review before analyzing.")
#     else:
#         cleaned_review = preprocess(user_review)
#         vectorized_review = vectorizer.transform([cleaned_review])
#         prediction_encoded = model.predict(vectorized_review)[0]
#         prediction_label = label_encoder.inverse_transform([prediction_encoded])[0] if hasattr(label_encoder, "inverse_transform") else prediction_encoded

#         if prediction_label == 'pos':
#             st.success("‚úÖ Positive Sentiment! You seem to have liked the movie. üéâ")
#         else:
#             st.error("‚ùå Negative Sentiment! You didn‚Äôt enjoy the movie much. üò¢")

# # Footer
# st.markdown("<div class='footer'>Made with ‚ù§Ô∏è using Streamlit</div>", unsafe_allow_html=True)
