In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [2]:
def load_data(neg_file, pos_file):
    """Read the negative and positive sentence files into two lists of lines.

    Both files are decoded as latin-1 and read line by line; every returned
    string keeps its line terminator.

    Args:
        neg_file: Path to the file holding one negative sentence per line.
        pos_file: Path to the file holding one positive sentence per line.

    Returns:
        A ``(neg_sentences, pos_sentences)`` pair of lists of strings.
    """
    # Open both files before reading either, so a missing file is reported
    # before any data is consumed.
    with open(neg_file, 'r', encoding='latin-1') as neg_handle:
        with open(pos_file, 'r', encoding='latin-1') as pos_handle:
            negatives = list(neg_handle)
            positives = list(pos_handle)
    return negatives, positives

# Locations of the negative / positive sentence files, relative to the
# notebook's working directory. Edit these if the dataset lives elsewhere.
neg_file, pos_file = 'rt-polarity.neg', 'rt-polarity.pos'

In [3]:
neg_sentences, pos_sentences = load_data(neg_file, pos_file)

# Step 2: Prepare Data Splits
# Per-class split sizes. Everything left after the training and validation
# slices becomes the test set (831 sentences per class when each file has
# 5,331 lines).
N_TRAIN_PER_CLASS = 4000
N_VAL_PER_CLASS = 500
VAL_END = N_TRAIN_PER_CLASS + N_VAL_PER_CLASS

# Fail early with a clear message instead of producing short or empty splits
# that would only surface later as an opaque scikit-learn error.
if min(len(neg_sentences), len(pos_sentences)) <= VAL_END:
    raise ValueError(
        f'Each class needs more than {VAL_END} sentences; got '
        f'{len(neg_sentences)} negative and {len(pos_sentences)} positive.'
    )

train_neg = neg_sentences[:N_TRAIN_PER_CLASS]
train_pos = pos_sentences[:N_TRAIN_PER_CLASS]
val_neg = neg_sentences[N_TRAIN_PER_CLASS:VAL_END]
val_pos = pos_sentences[N_TRAIN_PER_CLASS:VAL_END]
test_neg = neg_sentences[VAL_END:]
test_pos = pos_sentences[VAL_END:]

# Create labels (0 = negative, 1 = positive).
# Label counts are derived from the actual slice lengths so that data and
# labels always line up, even if the input files are not exactly 5,331 lines.
train_data = train_neg + train_pos
train_labels = [0] * len(train_neg) + [1] * len(train_pos)

val_data = val_neg + val_pos
val_labels = [0] * len(val_neg) + [1] * len(val_pos)

test_data = test_neg + test_pos
test_labels = [0] * len(test_neg) + [1] * len(test_pos)

In [4]:
# TF-IDF features: vocabulary capped at 5,000 terms, with scikit-learn's
# built-in English stop-word list removed.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# The vocabulary and IDF weights are learned from the training sentences only;
# the fitted vectorizer is then reused as-is on the validation and test splits.
X_train = vectorizer.fit_transform(train_data)
X_val, X_test = (vectorizer.transform(split) for split in (val_data, test_data))

In [5]:
# Logistic-regression classifier with scikit-learn's default settings,
# fitted on the TF-IDF training matrix and its 0/1 labels.
model = LogisticRegression()
model.fit(X=X_train, y=train_labels)


In [6]:
# Training-set metrics: score the fitted model on the data it was trained on.
train_predictions = model.predict(X_train)
train_precision, train_recall, train_f1 = (
    metric(train_labels, train_predictions)
    for metric in (precision_score, recall_score, f1_score)
)

# confusion_matrix puts true labels on rows and predicted labels on columns,
# so the flattened 2x2 matrix reads TN, FP, FN, TP.
conf_matrix_train = confusion_matrix(train_labels, train_predictions)
TN_train, FP_train, FN_train, TP_train = conf_matrix_train.ravel()

print(f'Training Precision: {train_precision}')
print(f'Training Recall: {train_recall}')
print(f'Training F1-Score: {train_f1}')
print(f'TP (Train): {TP_train}, TN (Train): {TN_train}, FP (Train): {FP_train}, FN (Train): {FN_train}')

Training Precision: 0.8755321813173053
Training Recall: 0.874
Training F1-Score: 0.8747654197422745
TP (Train): 3496, TN (Train): 3503, FP (Train): 497, FN (Train): 504


In [7]:
# Validation-set metrics: precision, recall and F1 on the held-out
# validation split.
val_predictions = model.predict(X_val)
val_precision, val_recall, val_f1 = (
    metric(val_labels, val_predictions)
    for metric in (precision_score, recall_score, f1_score)
)

print(f'Validation Precision: {val_precision}')
print(f'Validation Recall: {val_recall}')
print(f'Validation F1-Score: {val_f1}')

Validation Precision: 0.7858672376873662
Validation Recall: 0.734
Validation F1-Score: 0.7590486039296794


In [8]:
# Test-set metrics: precision, recall, F1 and confusion-matrix counts on the
# held-out test split.
test_predictions = model.predict(X_test)
test_precision, test_recall, test_f1 = (
    metric(test_labels, test_predictions)
    for metric in (precision_score, recall_score, f1_score)
)

# confusion_matrix puts true labels on rows and predicted labels on columns,
# so the flattened 2x2 matrix reads TN, FP, FN, TP.
conf_matrix_test = confusion_matrix(test_labels, test_predictions)
TN_test, FP_test, FN_test, TP_test = conf_matrix_test.ravel()

print(f'Test Precision: {test_precision}')
print(f'Test Recall: {test_recall}')
print(f'Test F1-Score: {test_f1}')
print(f'TP (Test): {TP_test}, TN (Test): {TN_test}, FP (Test): {FP_test}, FN (Test): {FN_test}')

Test Precision: 0.7389221556886227
Test Recall: 0.7424789410348978
Test F1-Score: 0.7406962785114045
TP (Test): 617, TN (Test): 613, FP (Test): 218, FN (Test): 214
