In [5]:
import pandas as pd
import numpy as np

# For text cleaning
import re
import string

# For vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# For splitting data
from sklearn.model_selection import train_test_split

# For KNN
from sklearn.neighbors import KNeighborsClassifier

# For evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset from the same directory
df = pd.read_csv('sentiment_data.csv')  # just the filename

# Check the columns
print("Columns in dataset:", df.columns)

# Detect columns
if 'Comment' in df.columns:
    text_column = 'Comment'
else:
    text_column = df.columns[0]  # fallback: first column

if 'Sentiment' in df.columns:
    label_column = 'Sentiment'
else:
    label_column = df.columns[1]  # fallback: second column

print(f"Using text column: {text_column}")
print(f"Using label column: {label_column}")

# Clean the text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\d+', '', text)
    text = text.strip()
    return text

df['clean_text'] = df[text_column].apply(clean_text)

# Vectorize
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])
y = df[label_column]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Predict
y_pred = knn.predict(X_test)

# Evaluate
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Columns in dataset: Index(['Unnamed: 0', 'Comment', 'Sentiment'], dtype='object')
Using text column: Comment
Using label column: Sentiment

Accuracy: 0.26

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       104
           1       0.24      1.00      0.39       144
           2       1.00      0.03      0.07       352

    accuracy                           0.26       600
   macro avg       0.41      0.34      0.15       600
weighted avg       0.65      0.26      0.13       600


Confusion Matrix:
 [[  0 104   0]
 [  0 144   0]
 [  0 340  12]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
