In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import pandas as pd

# Load the movie-review dataset from a CSV given by a relative path.
# Later cells read its 'review' (raw text) and 'sentiment' (label) columns.
data = pd.read_csv('movie_reviews.csv')
# Preview the first rows to confirm the file parsed as expected.
data.head()


Unnamed: 0,review,sentiment
0,Terrible plot and awful acting. I regret watch...,negative
1,"The movie was okay, but it didn't really stand...",neutral
2,I absolutely loved this movie! The story was e...,positive
3,One of the best movies I've seen this year! Tr...,positive
4,The characters were flat and uninteresting. Ve...,negative


In [3]:
import nltk

# Fetch the NLTK resources used by the preprocessing cell below. When a
# package is already present the call only logs an "already up-to-date" line.
nltk.download('punkt')      # Tokenizer models used for word tokenization
nltk.download('stopwords')  # Stop-word lists (the English list is used below)
nltk.download('punkt_tab')   # Extra punkt resource; presumably needed by newer NLTK releases - verify



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smitp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smitp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\smitp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Make sure the tokenizer models ('punkt') and the stop-word corpus are available.
# NOTE(review): both packages are already downloaded by the previous cell, so
# these two calls look redundant - confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')

# Function to preprocess the text
def preprocess_text(text, stop_words=None):
    """Normalise a raw review into a space-separated string of content words.

    Steps: replace every non-word character with a space, lowercase the
    result, tokenize it with NLTK's ``word_tokenize`` and drop stop words.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (``None``, or the ``NaN``
        pandas produces for an empty CSV cell) yields an empty string
        instead of raising inside ``re.sub``.
    stop_words : iterable of str, optional
        Words to remove. When omitted, NLTK's English stop-word list is
        loaded once per call. Pass a pre-built set to avoid reloading it
        for every row when applying this function over a large column.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    # Punctuation becomes whitespace, so contractions such as "didn't" are
    # split into separate tokens ("didn", "t") before stop-word filtering.
    text = re.sub(r'\W', ' ', text)
    text = text.lower()
    words = word_tokenize(text)

    # Load the stop words once (not once per token) and use a set so each
    # membership test is O(1) instead of a scan over the whole list.
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = frozenset(stop_words)

    return ' '.join(word for word in words if word not in stop_words)

# Clean every review element-wise and store the result next to the raw text
# in a new 'cleaned_review' column.
data['cleaned_review'] = data['review'].map(preprocess_text)
# Show the first rows so the raw and cleaned text can be compared.
data.head()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smitp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smitp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review,sentiment,cleaned_review
0,Terrible plot and awful acting. I regret watch...,negative,terrible plot awful acting regret watching
1,"The movie was okay, but it didn't really stand...",neutral,movie okay really stand
2,I absolutely loved this movie! The story was e...,positive,absolutely loved movie story engaging acting s...
3,One of the best movies I've seen this year! Tr...,positive,one best movies seen year truly inspiring emot...
4,The characters were flat and uninteresting. Ve...,negative,characters flat uninteresting forgettable


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Features are the preprocessed review text; the target is the sentiment label.
X = data['cleaned_review']
y = data['sentiment']

# Encode the labels numerically (negative=0, neutral=0.5, positive=1).
# The encoding is kept as-is because the training cell thresholds these values.
y = y.map({'positive': 1, 'negative': 0, 'neutral': 0.5})

# Series.map turns any label missing from the dict into NaN. Stop here rather
# than let unlabelled rows flow silently into training and evaluation.
if y.isna().any():
    unknown_labels = sorted(data.loc[y.isna(), 'sentiment'].astype(str).unique())
    raise ValueError(f'Unmapped sentiment labels: {unknown_labels}')

# Split the raw text *before* vectorizing so the vocabulary is learned from the
# training reviews only. Fitting the vectorizer on the full corpus would leak
# information about the test reviews into the features.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the bag-of-words vocabulary on the training split, then reuse it
# unchanged for the held-out split.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Kept for any later cell that expects the full document-term matrix; it is
# built with the train-only vocabulary.
X_vectorized = vectorizer.transform(X)


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import pandas as pd

# Collapse the three-way sentiment encoding into a binary target.
# y_train / y_test come from the split cell encoded as negative=0, neutral=0.5,
# positive=1. These are category codes, not continuous scores. With a cut-off
# of 0.75 only 'positive' becomes 1; 'neutral' and 'negative' both become 0,
# so the model below is a "positive vs. not-positive" classifier and can never
# predict 'neutral'.
threshold = 0.75  # any value in [0.5, 1) produces the same split

# `NaN > threshold` evaluates to False, so a missing label would be counted
# silently as the negative class. Fail loudly instead.
for split_name, labels in (('y_train', y_train), ('y_test', y_test)):
    if pd.isna(labels).any():
        raise ValueError(f'{split_name} contains missing labels; check the sentiment mapping')

y_train = (y_train > threshold).astype(int)  # 1 = positive, 0 = neutral or negative
y_test = (y_test > threshold).astype(int)    # same rule for the held-out labels

# Train the Logistic Regression model on the bag-of-words features
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

# Predict the binary label for each held-out review
y_pred_log = log_model.predict(X_test)

# Evaluate: plain accuracy plus F1 averaged over both classes, weighted by
# how many true samples each class has.
accuracy_log = accuracy_score(y_test, y_pred_log)
f1_log = f1_score(y_test, y_pred_log, average='weighted')

print(f'Logistic Regression Accuracy: {accuracy_log:.2f}')
print(f'Logistic Regression F1 Score: {f1_log:.2f}')


Logistic Regression Accuracy: 0.67
Logistic Regression F1 Score: 0.54


In [7]:
# Three hand-written reviews used as a quick smoke test of the fitted model
# (replace with real unseen reviews as needed).
test_data = pd.DataFrame(
    [
        "I loved the movie! It was fantastic and thrilling.",
        "The movie was okay, but it didn't really stand out",
        "I hated the movie. It was a waste of time.",
    ],
    columns=['review'],
)

# Run the same cleaning used on the training reviews.
test_data['cleaned_review'] = test_data['review'].map(preprocess_text)

# Encode with the already-fitted vectorizer (transform only, no refitting).
X_test_vectorized = vectorizer.transform(test_data['cleaned_review'])

# Predict a numeric class for each sample review.
y_test_pred = log_model.predict(X_test_vectorized)

# Translate numeric classes back to sentiment names.
# NOTE(review): log_model is fitted on 0/1 targets in the training cell, so the
# 0.5 -> 'neutral' entry appears unreachable - confirm.
sentiment_mapping = {
    1: 'positive',
    0: 'negative',
    0.5: 'neutral',
}
predicted_sentiments = list(map(sentiment_mapping.__getitem__, y_test_pred))

# Attach the predicted names to the sample frame.
test_data['predicted_sentiment'] = predicted_sentiments

# Show each review next to its predicted sentiment.
print(test_data[['review', 'predicted_sentiment']])


                                              review predicted_sentiment
0  I loved the movie! It was fantastic and thrill...            positive
1  The movie was okay, but it didn't really stand...            negative
2         I hated the movie. It was a waste of time.            negative
