In [17]:
import sqlite3
import string

import joblib
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

In [18]:
# Step 1: Connect to the SQLite Database and Load Data
# Read every row of the `imdb_reviews` table into a DataFrame.
# close() lives in a `finally` clause so the connection is released even when
# the query raises (for example when the table does not exist). Note that
# sqlite3's `with conn:` form only manages the transaction and does not close
# the connection, which is why an explicit try/finally is used here.
conn = sqlite3.connect('imdb_reviews.db')  # Connect to the database
try:
    query = "SELECT * FROM imdb_reviews"  # Query to fetch data from the table
    df = pd.read_sql_query(query, conn)  # Load data into a Pandas DataFrame
finally:
    conn.close()  # Always release the database handle, success or failure

In [19]:
# Step 2: Prepare the Data for Training
# Features: the text in the `review_text` column
# (presumably already cleaned upstream -- TODO confirm against the DB loader).
X = df['review_text']

# Target: 1 where the sentiment label is exactly 'positive', 0 for any other
# value. The comparison is evaluated column-wise and the resulting booleans
# are cast to 64-bit integers.
y = (df['sentiment'] == 'positive').astype('int64')

In [20]:
# Train-Test Split: 20% of the rows are held out for evaluation, the
# remaining 80% are used for training. The fixed random_state makes the
# shuffle repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF Vectorization: configure (but do not yet fit) the vectorizer with
# scikit-learn's built-in English stop-word list and a 5000-feature limit.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

In [21]:
# Fit and transform the training data, transform the test data
# The vectorizer is fitted on the training split only. The test split is then
# passed through transform() -- not fit_transform() -- so it is encoded with
# the vectorizer state learned from the training reviews and is never used
# for fitting. Order matters: the fit must happen before the test transform.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [22]:
# Step 3: Train Logistic Regression Model
# fit() returns the estimator it was called on, so construction and training
# are chained into one statement. max_iter=1000 sets the solver's iteration cap.
log_reg = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Step 4: Evaluate the Model on the Test Set
# Predicted labels for the held-out TF-IDF matrix; scored in the next cell.
y_pred = log_reg.predict(X_test_tfidf)


In [23]:
# Step 5: Metrics
# Overall accuracy on the held-out split.
accuracy = accuracy_score(y_test, y_pred)

# average='binary' yields one precision / recall / F1 value; the fourth
# element of the returned tuple (support) is not needed and is discarded.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test,
    y_pred,
    average='binary',
)

# Emit the four scores, one per line, each rounded to four decimal places.
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-Score: {f1:.4f}",
    sep="\n",
)


Accuracy: 0.8850
Precision: 0.8739
Recall: 0.9020
F1-Score: 0.8877


In [24]:
# Persist the fitted artifacts (joblib is imported in the notebook's import cell).
# Both files are written because predictions on new text need the fitted
# vectorizer as well as the classifier.
# NOTE(review): joblib files are pickle-based -- only load them from trusted
# sources, as unpickling can execute arbitrary code.

# Save the Logistic Regression model
joblib.dump(log_reg, 'logistic_regression_model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']