In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
df = pd.read_csv("/testdata.csv")

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['full_text'], df['sentiment_type'], test_size=0.2, random_state=42)

# Transforming text data into TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Using top 5000 most frequent words
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initializing and training the Logistic Regression model
logreg = LogisticRegression()
logreg.fit(X_train_tfidf, y_train)

# Predicting the sentiment for test set
y_pred = logreg.predict(X_test_tfidf)

# Evaluating the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7832273690584317

Classification Report:
               precision    recall  f1-score   support

    NEGATIVE       0.71      0.49      0.58       573
     NEUTRAL       0.80      0.76      0.78       977
    POSITIVE       0.79      0.89      0.84      1753

    accuracy                           0.78      3303
   macro avg       0.77      0.71      0.73      3303
weighted avg       0.78      0.78      0.78      3303



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [2]:
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# 1. Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# If you want to compute ROC AUC, ensure your labels are binarized or use one-vs-rest approach
if df['sentiment_type'].nunique() == 2:
    # 2. ROC AUC Score for binary classification
    y_prob = logreg.predict_proba(X_test_tfidf)[:, 1]
    print("ROC AUC Score:", roc_auc_score(y_test, y_prob))

    # ROC Curve plotting
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

# If the sentiment is multi-class, you can compute and display the macro and weighted average ROC AUC scores:
else:
    y_prob = logreg.predict_proba(X_test_tfidf)
    macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo", average="macro")
    weighted_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo", average="weighted")
    macro_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr", average="macro")
    weighted_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr", average="weighted")
    print("One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
          "(weighted by prevalence)"
          .format(macro_roc_auc_ovo, weighted_roc_auc_ovo))
    print("One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
          "(weighted by prevalence)"
          .format(macro_roc_auc_ovr, weighted_roc_auc_ovr))


Confusion Matrix:
 [[ 283   93  197]
 [  19  739  219]
 [  97   91 1565]]
One-vs-One ROC AUC scores:
0.908466 (macro),
0.911285 (weighted by prevalence)
One-vs-Rest ROC AUC scores:
0.913790 (macro),
0.916854 (weighted by prevalence)


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv("/testdata.csv")

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['full_text'], df['sentiment_type'], test_size=0.2, random_state=42)

# Transforming text data into TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Using top 5000 most frequent words
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initializing and training the Random Forest model
rf = RandomForestClassifier()
rf.fit(X_train_tfidf, y_train)

# Predicting the sentiment for test set
y_pred = rf.predict(X_test_tfidf)

# Evaluating the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# If binary classification, display ROC AUC
if df['sentiment_type'].nunique() == 2:
    y_prob = rf.predict_proba(X_test_tfidf)[:, 1]
    print("ROC AUC Score:", roc_auc_score(y_test, y_prob))

    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()


Accuracy: 0.706630336058129

Classification Report:
               precision    recall  f1-score   support

    NEGATIVE       0.85      0.13      0.22       573
     NEUTRAL       0.76      0.65      0.70       977
    POSITIVE       0.68      0.93      0.79      1753

    accuracy                           0.71      3303
   macro avg       0.76      0.57      0.57      3303
weighted avg       0.73      0.71      0.66      3303

Confusion Matrix:
 [[  73   84  416]
 [   4  633  340]
 [   9  116 1628]]


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Load the dataset
df = pd.read_csv("/testdata.csv")

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['full_text'], df['sentiment_type'], test_size=0.2, random_state=42)

# Transforming text data into TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Using top 5000 most frequent words
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initializing and training the SVM model
# Using a linear kernel for SVM
svm = SVC(kernel='linear', probability=True)
svm.fit(X_train_tfidf, y_train)

# Predicting the sentiment for test set
y_pred = svm.predict(X_test_tfidf)

# Evaluating the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8023009385407206

Classification Report:
               precision    recall  f1-score   support

    NEGATIVE       0.70      0.57      0.62       573
     NEUTRAL       0.81      0.81      0.81       977
    POSITIVE       0.83      0.87      0.85      1753

    accuracy                           0.80      3303
   macro avg       0.78      0.75      0.76      3303
weighted avg       0.80      0.80      0.80      3303


Confusion Matrix:
 [[ 325   79  169]
 [  29  794  154]
 [ 113  109 1531]]
