# 1. Data Collection


In [12]:
import pandas as pd

# Load the dataset
data = pd.read_csv('IMDB Dataset.csv')

# Display the first few rows of the dataset
print(data.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


# 2: Data Preprocessing


In [13]:
import nltk
import string
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')

# Define a function to clean the text
def clean_review(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Apply the cleaning function to the review column
data['review'] = data['review'].apply(clean_review)

# Display the cleaned data
print(data.head())


[nltk_data] Downloading package stopwords to C:\Users\SREHANSU
[nltk_data]     BARIK\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                              review sentiment
0  one reviewers mentioned watching 1 oz episode ...  positive
1  wonderful little production br br filming tech...  positive
2  thought wonderful way spend time hot summer we...  positive
3  basically theres family little boy jake thinks...  negative
4  petter matteis love time money visually stunni...  positive


# 3: Feature Extraction


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the review text data
X = tfidf.fit_transform(data['review'])

# Display the shape of the transformed data
print(X.shape)


(50000, 5000)


# 4: Model Building


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, data['sentiment'], test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)


# 5: Evaluation

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision
precision = precision_score(y_test, y_pred, pos_label='positive')

# Calculate recall
recall = recall_score(y_test, y_pred, pos_label='positive')

# Calculate F1-score
f1 = f1_score(y_test, y_pred, pos_label='positive')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')


Accuracy: 0.8878
Precision: 0.8787468574743763
Recall: 0.9017662234570352
F1-score: 0.8901077375122429
