In [1]:

!pip install scikit-learn nltk

import nltk
from nltk.corpus import movie_reviews
import random
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


# Fetch the NLTK movie_reviews corpus.
nltk.download("movie_reviews")

# One (token_list, category) pair per review file, iterating over every
# category the corpus reports and every file id inside that category.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle with a dedicated, seeded generator so the row order (and therefore
# the df.head() preview) is identical on every Restart & Run All. Using a
# private Random instance leaves the global `random` state untouched.
SEED = 42
random.Random(SEED).shuffle(documents)

# Re-join each token list into one space-separated string, because the
# vectorizer downstream consumes raw text rather than token lists.
data = [(" ".join(words), label) for words, label in documents]

df = pd.DataFrame(data, columns=["text", "label"])
print(df.head())


# Hold out 20% of the reviews BEFORE extracting features. Splitting the raw
# text (rather than an already-vectorized matrix) keeps the held-out reviews
# from influencing which terms make it into the vocabulary below.
# random_state pins the split so it is the same on every run.
y = df["label"]
text_train, text_test, y_train, y_test = train_test_split(
    df["text"], y, test_size=0.2, random_state=42
)

# Bag-of-words counts with English stop words removed and the vocabulary
# capped at 5000 terms. The vocabulary is learned from the training text only;
# the test text is merely projected onto it, which avoids train/test leakage.
vectorizer = CountVectorizer(stop_words="english", max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Feature matrix for the full corpus, kept so that any later code that still
# refers to `X` continues to work. It is not used for fitting or scoring here.
X = vectorizer.transform(df["text"])

# max_iter is raised above the library default to give the solver more
# iterations on this high-dimensional sparse input.
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

# Evaluate on the held-out reviews only.
y_pred = clf.predict(X_test)

print("🔹 Accuracy:", accuracy_score(y_test, y_pred))
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred))




[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


                                                text label
0  is jimmy stewart the greatest actor of all - t...   pos
1  battlefield earth is the worst film of 2000 , ...   neg
2  this independent film written and directed by ...   neg
3  synopsis : lifelong friends rafe ( affleck ) a...   neg
4  bats is this year ' s camp flick . with the wo...   neg
🔹 Accuracy: 0.85

🔹 Classification Report:
               precision    recall  f1-score   support

         neg       0.84      0.86      0.85       197
         pos       0.86      0.84      0.85       203

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400



In [2]:
# Sanity check on the review frame: show each column's dtype, then the number
# of null entries per column. Each heading and its table go through a single
# print call, separated by a newline.
print("Data types of columns:", df.dtypes, sep="\n")
print("\nMissing values per column:", df.isnull().sum(), sep="\n")

Data types of columns:
text     object
label    object
dtype: object

Missing values per column:
text     0
label    0
dtype: int64
