In [1]:

# Import necessary libraries
import pickle
import sqlite3

import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Step 1: Fetch Data from the Database
# This notebook only reads from the database, so open it read-only through a
# SQLite URI. With a plain path, sqlite3.connect() silently creates an empty
# database file when the path does not exist and the failure only shows up
# later as "no such table"; with mode=ro the connect call itself raises
# sqlite3.OperationalError if the file is missing.
print("Fetching data from the database...")
conn = sqlite3.connect("file:imdb_reviews.db?mode=ro", uri=True)
cursor = conn.cursor()

Fetching data from the database...


In [3]:
# Pull the train and test partitions as lists of (review_text, sentiment)
# tuples. The connection is closed in `finally` so that a failing query does
# not leave the database handle open.
try:
    cursor.execute("SELECT review_text, sentiment FROM imdb_reviews WHERE data_split = 'train';")
    train_data = cursor.fetchall()

    cursor.execute("SELECT review_text, sentiment FROM imdb_reviews WHERE data_split = 'test';")
    test_data = cursor.fetchall()
finally:
    conn.close()

# Fail here, with a clear message, rather than deep inside the split/vectorizer
# code when one of the partitions came back empty.
if not train_data or not test_data:
    raise ValueError(
        "imdb_reviews returned no rows for the 'train' and/or 'test' data_split; "
        "check that the database has been populated."
    )

In [4]:
# Wrap the fetched row tuples in DataFrames; both splits share the same
# two-column layout (review text, sentiment label).
train_df, test_df = (
    pd.DataFrame(rows, columns=["review_text", "sentiment"])
    for rows in (train_data, test_data)
)


In [5]:
# Convert sentiment labels to binary: "positive" -> 1, every other value -> 0.
# An already-encoded 1 is also mapped to 1, which makes this cell idempotent:
# with a bare `x == "positive"` test, running the cell a second time would
# turn every label into 0 because the column no longer holds strings.
train_df["sentiment"] = train_df["sentiment"].apply(lambda x: 1 if x in ("positive", 1) else 0)
test_df["sentiment"] = test_df["sentiment"].apply(lambda x: 1 if x in ("positive", 1) else 0)

In [6]:
# Step 2: Create Train/Validation Split
# Hold out 20% of the training reviews for validation. The split is
# stratified on the sentiment label and uses a fixed seed so it is repeatable.
print("Creating train/validation split...")
X_train, X_val, y_train, y_val = train_test_split(
    train_df["review_text"],
    train_df["sentiment"],
    stratify=train_df["sentiment"],
    test_size=0.2,
    random_state=42,
)

Creating train/validation split...


In [7]:
# Step 3: TF-IDF Vectorization
# The vectorizer (English stop words removed, at most 10,000 features) is
# fitted on the training reviews only; validation and test reviews are then
# transformed with that same fitted vectorizer.
print("Vectorizing text data with TF-IDF...")
tfidf = TfidfVectorizer(stop_words="english", max_features=10000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    tfidf.transform(texts) for texts in (X_val, test_df["review_text"])
)

Vectorizing text data with TF-IDF...


In [8]:
# Step 4: Train Logistic Regression Model
# Fixed seed for repeatability; the solver is allowed up to 200 iterations.
print("Training Logistic Regression model...")
lr_model = LogisticRegression(
    max_iter=200,
    random_state=42,
)
lr_model.fit(X_train_tfidf, y_train)

Training Logistic Regression model...


In [9]:
# Step 5: Validate the Model
# Predict on the held-out validation split and score the predictions with
# each metric in turn (accuracy, precision, recall, F1).
print("Validating the model...")
val_predictions = lr_model.predict(X_val_tfidf)
val_accuracy, val_precision, val_recall, val_f1 = (
    metric(y_val, val_predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

Validating the model...


In [10]:
# Report the validation scores, one per line, to four decimal places.
print(
    "\nValidation Metrics:",
    f"Accuracy: {val_accuracy:.4f}",
    f"Precision: {val_precision:.4f}",
    f"Recall: {val_recall:.4f}",
    f"F1 Score: {val_f1:.4f}",
    sep="\n",
)


Validation Metrics:
Accuracy: 0.9121
Precision: 0.9051
Recall: 0.9208
F1 Score: 0.9129


In [11]:
# Step 6: Evaluate on the Test Set
# Same four metrics as for validation, computed against the test labels.
print("Evaluating the model on the test set...")
test_predictions = lr_model.predict(X_test_tfidf)
test_accuracy, test_precision, test_recall, test_f1 = (
    metric(test_df["sentiment"], test_predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the test scores, one per line, to four decimal places.
print(
    "\nTest Metrics:",
    f"Accuracy: {test_accuracy:.4f}",
    f"Precision: {test_precision:.4f}",
    f"Recall: {test_recall:.4f}",
    f"F1 Score: {test_f1:.4f}",
    sep="\n",
)

Evaluating the model on the test set...

Test Metrics:
Accuracy: 0.8763
Precision: 0.8761
Recall: 0.8766
F1 Score: 0.8763


In [12]:
# Detailed per-class classification report on the test set.
# target_names follows the encoded label order: 0 -> Negative, 1 -> Positive.
print(
    "\nClassification Report on Test Set:",
    classification_report(
        test_df["sentiment"],
        test_predictions,
        target_names=["Negative", "Positive"],
    ),
    sep="\n",
)


Classification Report on Test Set:
              precision    recall  f1-score   support

    Negative       0.88      0.88      0.88     25000
    Positive       0.88      0.88      0.88     25000

    accuracy                           0.88     50000
   macro avg       0.88      0.88      0.88     50000
weighted avg       0.88      0.88      0.88     50000



In [13]:
# Persist the fitted artifacts so predictions can be served without retraining.
# `pickle` is imported in the notebook's import cell rather than here, so all
# dependencies are visible in one place.
# NOTE: pickle files can execute arbitrary code when loaded -- only load these
# from a trusted location, and with the same scikit-learn version that
# produced them.

# Save the trained model
with open("logistic_regression_model.pkl", "wb") as model_file:
    pickle.dump(lr_model, model_file)

# Save the TF-IDF vectorizer (needed to turn new text into the feature space
# the model was trained on)
with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
