In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the labelled comments CSV from its absolute location into the working frame.
COMMENTS_CSV = '/content/labeled_comments.csv'
df = pd.read_csv(COMMENTS_CSV)

In [3]:
# Keep only rows with no missing values, then discard the 'neutral' class.
# Boolean filtering yields a selection of the previous frame; taking an explicit
# copy makes the result an independent frame, so assigning to one of its columns
# afterwards does not raise SettingWithCopyWarning.
df = df.dropna()
df = df[df['label'] != 'neutral'].copy()

# Last expression of the cell: class distribution of the remaining labels.
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
negative,9168
positive,1457


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import os

# Keep the NLTK resources in an "nltk_data" folder under the current working
# directory, creating the folder if it is not there yet.
nltk_data_dir = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)

# Register the folder on NLTK's search path before fetching anything into it.
nltk.data.path.append(nltk_data_dir)

# Fetch the tokenizer, stop-word and WordNet resources into that folder.
for nltk_resource in ('punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource, download_dir=nltk_data_dir)


# Assuming your DataFrame is named 'df' and has columns 'comment' and 'label'

# 1. Preprocessing with NLTK
# Compiled once: matches any character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')

# Lazily filled cache so the stop-word set and the lemmatizer are built a single
# time instead of once per processed comment.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return ``(stop_words, lemmatizer)``, creating both on first use."""
    if not _TEXT_RESOURCES:
        # Build both objects before storing either, so a failure while loading
        # leaves the cache empty and the next call simply retries.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _TEXT_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise a single comment into a space-joined string of lemmas.

    Steps, in order: lower-case the text, strip every character that is not a
    word character or whitespace, tokenize with ``nltk.word_tokenize``, drop
    English stop words, and lemmatize the remaining tokens.

    Args:
        text: The raw comment. Anything that is not a ``str`` (for example
            ``None`` or a float NaN) is treated as empty.

    Returns:
        The processed tokens joined by single spaces, or ``""`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return ""

    cleaned = _PUNCTUATION_RE.sub('', text.lower())
    stop_words, lemmatizer = _get_text_resources()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(cleaned)
        if token not in stop_words
    )

# Overwrite each raw comment with its preprocessed form.
df['comment'] = df['comment'].apply(preprocess_text)

# 2. Hold out 20% of the rows for testing, stratified on the label column and
#    seeded so the split is repeatable.
comment_texts = df['comment']
comment_labels = df['label']
split_options = dict(test_size=0.2, random_state=42, stratify=comment_labels)
X_train, X_test, y_train, y_test = train_test_split(
    comment_texts,
    comment_labels,
    **split_options,
)

# 3. Learn a unigram+bigram TF-IDF vocabulary (capped at 5000 features) from the
#    training comments only, then project both partitions with it.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

[nltk_data] Downloading package punkt_tab to /content/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /content/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /content/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Assuming you have your data in X_train_tfidf, X_test_tfidf, y_train, y_test

# Candidate classifiers, each paired with the label used when reporting.
models = [
    ('SVM', SVC(C=1, kernel='linear')),
    ('Random Forest', RandomForestClassifier(random_state=42, n_estimators=100)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('Logistic Regression', LogisticRegression(random_state=42, max_iter=1000)),
    ('Multinomial Naive Bayes', MultinomialNB()),
]

# Fit every candidate on the training matrix and score it on the held-out one.
# The loop variables keep their names on purpose: at notebook top level they
# remain defined after the loop finishes.
results = []
for name, model in models:
    # fit() hands back the estimator itself, so prediction can be chained.
    y_pred = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 reports 0 for metrics whose denominator is zero.
    report = classification_report(y_test, y_pred, zero_division=0)
    results.append((name, accuracy, report))

# Report all models only after every one of them has been trained.
for name, accuracy, report in results:
    print(
        f"Model: {name}",
        f"Accuracy: {accuracy:.4f}",
        f"Classification Report:\n{report}\n",
        sep="\n",
    )

Model: SVM
Accuracy: 0.9501
Classification Report:
              precision    recall  f1-score   support

    negative       0.95      1.00      0.97      1834
    positive       0.99      0.64      0.78       291

    accuracy                           0.95      2125
   macro avg       0.97      0.82      0.88      2125
weighted avg       0.95      0.95      0.95      2125


Model: Random Forest
Accuracy: 0.9680
Classification Report:
              precision    recall  f1-score   support

    negative       0.97      1.00      0.98      1834
    positive       0.99      0.77      0.87       291

    accuracy                           0.97      2125
   macro avg       0.98      0.89      0.93      2125
weighted avg       0.97      0.97      0.97      2125


Model: Gradient Boosting
Accuracy: 0.9581
Classification Report:
              precision    recall  f1-score   support

    negative       0.96      1.00      0.98      1834
    positive       0.99      0.70      0.82       291

   