In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import cohen_kappa_score

# Load data
df = pd.read_csv("annotated_reviews.csv")

print("Class Distribution:")
print(df['label'].value_counts())

# Simulate second annotator disagreement
df['annotator2'] = df['label']
df.loc[0:4, 'annotator2'] = "Neutral"

kappa = cohen_kappa_score(df['label'], df['annotator2'])
print("\nCohen Kappa Score:", kappa)

# Train simple model
X = df['review']
y = df['label']

vectorizer = CountVectorizer()
X_vec = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Class Distribution:
label
Positive    22
Negative    18
Neutral     11
Name: count, dtype: int64

Cohen Kappa Score: 0.8805620608899297

Accuracy: 0.5454545454545454

Classification Report:

              precision    recall  f1-score   support

    Negative       0.50      0.50      0.50         4
     Neutral       0.00      0.00      0.00         2
    Positive       0.57      0.80      0.67         5

    accuracy                           0.55        11
   macro avg       0.36      0.43      0.39        11
weighted avg       0.44      0.55      0.48        11



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
