In [32]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [33]:
df = pd.read_csv("/content/drive/MyDrive/IMDB Dataset.csv")



In [34]:
label_encoder  = LabelEncoder()
df['sentiment'] = label_encoder.fit_transform(df['sentiment'])



In [35]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = label_encoder.fit_transform(df['review'])


In [36]:
final_df = pd.concat([pd.DataFrame(tfidf_matrix), df['sentiment']], axis = 1)



In [37]:
X = final_df.drop('sentiment', axis = 1)
y = final_df['sentiment']


In [38]:
X_train, X_test , y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)



In [39]:
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)


In [40]:
y_pred = knn.predict(X_test)
print(y_pred)

[1 1 0 ... 0 1 1]


In [41]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5769


In [42]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.58      0.58      4961
           1       0.58      0.57      0.58      5039

    accuracy                           0.58     10000
   macro avg       0.58      0.58      0.58     10000
weighted avg       0.58      0.58      0.58     10000



In [43]:
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Confusion Matrix:
[[2901 2060]
 [2171 2868]]


In [44]:
best_accuracy = 0
best_n_neighbors = 0

In [45]:
for n in range(1, 11):  # try neighbors from 1 to 10
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_n_neighbors = n

In [46]:
print("\nBest Number of Neighbors:", best_n_neighbors)
print("Best Accuracy:", best_accuracy)


Best Number of Neighbors: 3
Best Accuracy: 0.5837


In [70]:
custom_inputs = ["This movie was fantastic!", "I did not enjoy this movie at all."]


In [71]:

custom_tfidf_matrix = label_encoder.fit_transform(custom_inputs)

In [74]:
custom_tfidf_matrix1 = custom_tfidf_matrix.reshape(1, -1)

In [78]:
print(X_train.shape)

(40000, 1)


In [80]:
custom_tfidf_matrix1 = custom_tfidf_matrix[:, 0].reshape(1, -1)
print(custom_tfidf_matrix1.shape)

(1, 1)


In [81]:
custom_pred = knn.predict(custom_tfidf_matrix1)
print(custom_pred)

[0]
