In [1]:
#importing libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Download NLTK resources (only once)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## Mounting Google Drive

In [2]:
#mounting google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Reading Dataset

In [19]:
#reading dataset
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [5]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [20]:
# sentiment = 1 (positive), 0 (negative)

df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


## Feature Processing

In [21]:
reviews = df['review']
labels = df['sentiment']

### **Step 1: Text Preprocessing Function**
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization
    words = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    return words  # Returning tokenized words

# Apply text preprocessing
df['cleaned_review'] = df['review'].apply(preprocess_text)

### **Step 2: Stemming and Lemmatization**
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_and_lemmatize(words):
    stemmed = [stemmer.stem(word) for word in words]  # Stemming
    lemmatized = [lemmatizer.lemmatize(word) for word in stemmed]  # Lemmatization
    return " ".join(lemmatized)  # Join words back into a sentence

# Apply stemming and lemmatization
df['processed_review'] = df['cleaned_review'].apply(stem_and_lemmatize)

### **Step 3: Convert Text to Numerical Representation (TF-IDF)**
vectorizer = TfidfVectorizer(max_features=10000)  # Keeping top 5000 important words
X = vectorizer.fit_transform(df['processed_review'])  # Converting text to numbers


In [22]:
df.head()

Unnamed: 0,review,sentiment,cleaned_review,processed_review
0,One of the other reviewers has mentioned that ...,1,"[one, reviewers, mentioned, watching, oz, epis...",one review mention watch oz episod youll hook ...
1,A wonderful little production. <br /><br />The...,1,"[wonderful, little, production, br, br, filmin...",wonder littl product br br film techniqu unass...
2,I thought this was a wonderful way to spend ti...,1,"[thought, wonderful, way, spend, time, hot, su...",thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,0,"[basically, theres, family, little, boy, jake,...",basic there famili littl boy jake think there ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"[petter, matteis, love, time, money, visually,...",petter mattei love time money visual stun film...


## Model Building

In [23]:
## **Step 4: Train-Test Split (80% Training, 20% Testing)**
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

#appying randomforest classifier
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

### **Step 6: Predictions and Evaluation**
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

### **Step 7: Test on Sample Reviews**
sample_reviews = ["I absolutely loved this movie! The storyline was amazing.",
                  "This was the worst movie I have ever seen. Terrible acting."]
sample_transformed = vectorizer.transform(sample_reviews)  # Convert text to numerical format
predictions = model.predict(sample_transformed)

# Display results
for review, pred in zip(sample_reviews, predictions):
    sentiment = "Positive" if pred == 1 else "Negative"
    print(f"Review: {review} -> Sentiment: {sentiment}")


Model Accuracy: 0.85
Review: I absolutely loved this movie! The storyline was amazing. -> Sentiment: Positive
Review: This was the worst movie I have ever seen. Terrible acting. -> Sentiment: Negative


## End