In [7]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [8]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smitp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smitp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
def preprocess_text(text):
    # Remove special characters
    text = re.sub(r'\W', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords
    words = [word for word in words if word not in stopwords.words('english')]
    return ' '.join(words)

In [10]:
# Load the dataset
data = pd.read_csv('IMDB Dataset.csv')

# Apply preprocessing to the review column
data['cleaned_review'] = data['review'].apply(preprocess_text)

# Split the dataset into features and target
X = data['cleaned_review']
y = data['sentiment']

# Convert sentiment labels to binary (1 for positive, 0 for negative)
y = y.map({'positive': 1, 'negative': 0})

# Vectorize the text data
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

In [11]:
# Train the Logistic Regression model
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

# Make predictions
y_pred_log = log_model.predict(X_test)

# Evaluate the model
accuracy_log = accuracy_score(y_test, y_pred_log)
f1_log = f1_score(y_test, y_pred_log)

print(f'Logistic Regression Accuracy: {accuracy_log:.2f}')
print(f'Logistic Regression F1 Score: {f1_log:.2f}')

Logistic Regression Accuracy: 0.89
Logistic Regression F1 Score: 0.89


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# Testing on sample data
test_data = pd.DataFrame({
    'review': [
        "I did not hate the movie! It was very fantastic and thrilling.",
        "The movie was okay, but it didn't really stand out",
        "I hated the movie. It was a waste of time."
    ]
})

# Preprocess the test reviews
test_data['cleaned_review'] = test_data['review'].apply(preprocess_text)

# Vectorize the test data
X_test_vectorized = vectorizer.transform(test_data['cleaned_review'])

# Make predictions
y_test_pred = log_model.predict(X_test_vectorized)

# Map predictions to sentiment labels
sentiment_mapping = {1: 'positive', 0: 'negative'}
predicted_sentiments = [sentiment_mapping[pred] for pred in y_test_pred]

# Add predictions to the test data
test_data['predicted_sentiment'] = predicted_sentiments

# Display the results
print(test_data[['review', 'predicted_sentiment']])

                                              review predicted_sentiment
0  I did not hate the movie! It was very fantasti...            positive
1  The movie was okay, but it didn't really stand...            negative
2         I hated the movie. It was a waste of time.            negative
