In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load data
# Reads 'Reddit_Data.csv' and extracts the two columns used below:
#   'clean_comment' -> input text, 'category' -> class label.
# Make sure the file 'Reddit_Data.csv' is uploaded to your Colab environment
# and the path is correct. If the file is in the root directory, the path is just 'Reddit_Data.csv'
# If it's in a subdirectory, specify the path like 'subdirectory/Reddit_Data.csv'
try:
    data = pd.read_csv('Reddit_Data.csv')
except FileNotFoundError:
    print("Error: 'Reddit_Data.csv' not found. Please upload the file or check the file path.")
    # Re-raise so execution stops here with the real cause. Falling through
    # would leave `data` unbound and the next statement would fail with a
    # NameError that hides the missing-file problem.
    raise

# Drop rows with missing values in the 'category' column
data.dropna(subset=['category'], inplace=True)

# Column names are specific to this dataset; change them if yours differ.
# NOTE(review): rows with a missing 'clean_comment' are not dropped; they are
# passed through astype(str) like every other row -- confirm that is intended.
texts = data['clean_comment'].astype(str).tolist()       # text column
labels = data['category'].tolist()              # label column

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# TF-IDF features over unigrams and bigrams, capped at 10,000 terms.
# The vocabulary is fitted on the training texts only and then reused for the test texts.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a logistic regression model on the training features (fit() returns the estimator itself).
clf = LogisticRegression(max_iter=200).fit(X_train_tfidf, y_train)

# Predicted labels for the held-out texts
y_pred = clf.predict(X_test_tfidf)

# Compute the metrics first, then print them
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Accuracy: 0.8626845637583893

Classification Report:
               precision    recall  f1-score   support

          -1       0.88      0.67      0.76      1667
           0       0.85      0.95      0.90      2615
           1       0.86      0.89      0.88      3168

    accuracy                           0.86      7450
   macro avg       0.87      0.84      0.85      7450
weighted avg       0.86      0.86      0.86      7450

