In [4]:
import pandas as pd

# Read the IMDB sample reviews from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer=r'C:\Users\Riya\Downloads\imdb_sample.csv')

# Preview the top five rows (head's default count) as plain text
print(df.head(n=5))


  label                                               text
0   pos  This movie was fantastic! The performances wer...
1   neg        I hated this movie. It was a waste of time.
2   pos      An excellent film with a brilliant storyline.
3   neg            Terrible movie. The plot made no sense.
4   pos           Loved the cinematography and the script.


In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data files.
# word_tokenize needs the Punkt sentence models: older NLTK releases load
# them from 'punkt', while recent releases (3.9 and later) look for
# 'punkt_tab' instead. Requesting both keeps the notebook runnable from a
# fresh kernel on either; nltk.download reports a failure rather than raising.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the lemmatizer (shared by preprocess_text)
lemmatizer = WordNetLemmatizer()
# Keep the English stop words in a set so per-token membership checks are cheap
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one review into a space-separated string of lemmatized tokens.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is deleted, the result is tokenized with ``word_tokenize``,
    tokens present in the module-level ``stop_words`` set are dropped, and
    the remaining tokens are lemmatized with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the float NaN
        that pandas uses for an empty CSV field, or None) is treated as
        empty text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, or '' if none remain
        or the input was not a string.
    """
    # Missing cells arrive as float NaN / None, which have no .lower();
    # yield an empty document instead of raising AttributeError mid-apply.
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers.
    # NOTE: matches are deleted rather than replaced by a space, so the
    # pieces of a hyphenated word fuse together ("well-made" -> "wellmade").
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Run every review through preprocess_text and store the result in a new column
df['cleaned_text'] = df['text'].map(preprocess_text)

# Preview the top five rows, now including the cleaned column
print(df.head(n=5))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Riya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Riya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Riya\AppData\Roaming\nltk_data...


  label                                               text  \
0   pos  This movie was fantastic! The performances wer...   
1   neg        I hated this movie. It was a waste of time.   
2   pos      An excellent film with a brilliant storyline.   
3   neg            Terrible movie. The plot made no sense.   
4   pos           Loved the cinematography and the script.   

                              cleaned_text  
0  movie fantastic performance oscarworthy  
1                   hated movie waste time  
2       excellent film brilliant storyline  
3           terrible movie plot made sense  
4              loved cinematography script  


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF featurizer with its vocabulary capped at 5000 terms
vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the cleaned reviews and build the
# document-term matrix in a single call.
# NOTE(review): the fit sees every row, including rows a later cell holds out
# for testing — confirm this is intended.
X = vectorizer.fit_transform(df['cleaned_text'])

# Encode the target: 'pos' -> 1, any other label value -> 0
y = df['label'].eq('pos').astype('int64')

# Report the feature matrix dimensions as (rows, columns)
print(X.shape)


(50, 111)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Create the multinomial Naive Bayes classifier and fit it on the training rows
# (fit returns the estimator itself, so the fitted model is bound to nb)
nb = MultinomialNB().fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred = nb.predict(X_test)

# Score the predictions against the held-out labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Print each metric under its own heading
print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')
print(f'Confusion Matrix:\n{conf_matrix}')


Accuracy: 0.7333333333333333
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         8
           1       0.64      1.00      0.78         7

    accuracy                           0.73        15
   macro avg       0.82      0.75      0.72        15
weighted avg       0.83      0.73      0.72        15

Confusion Matrix:
[[4 4]
 [0 7]]
