In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Directory holding the tab-separated dataset files
data_dir = "NewB"

# Both files are read without a header row: first column is the numeric
# source label, second column is the sentence text.
train_data = pd.read_csv(f'{data_dir}/train_orig.txt', sep='\t', header=None, names=['label', 'sentence'])
test_data = pd.read_csv(f'{data_dir}/test.txt', sep='\t', header=None, names=['label', 'sentence'])

# Numeric source id -> "Publication (Leaning)" display name
label_mapping = {
    0: 'Newsday (Liberal)', 1: 'New York Times (Liberal)', 2: 'CNN (Liberal)',
    3: 'Los Angeles Times (Liberal)', 4: 'Washington Post (Liberal)',
    5: 'Politico (Neutral)', 6: 'Wall Street Journal (Conservative)',
    7: 'New York Post (Conservative)', 8: 'Daily Press (Conservative)',
    9: 'Daily Herald (Conservative)', 10: 'Chicago Tribune (Conservative)'
}


def _bias_from_label(label):
    """Return the leaning named inside a mapped label string.

    'Conservative' is checked first, then 'Liberal'; any other label is
    reported as 'Neutral'.
    """
    if 'Conservative' in label:
        return 'Conservative'
    if 'Liberal' in label:
        return 'Liberal'
    return 'Neutral'


def _add_label_columns(frame, split_name):
    """Replace numeric labels with source names and add a 'bias' column.

    The frame is modified in place. `split_name` is only used to make the
    error message point at the offending file.

    Raises:
        ValueError: if the frame contains a label that is not a key of
            `label_mapping` (Series.map would silently turn it into NaN and
            the substring checks would then fail on a float).
    """
    mapped = frame['label'].map(label_mapping)
    unmapped = mapped.isna()
    if unmapped.any():
        unknown = frame.loc[unmapped, 'label'].unique().tolist()
        raise ValueError(
            f'{split_name} data contains labels missing from label_mapping: {unknown}'
        )
    frame['label'] = mapped
    frame['bias'] = mapped.apply(_bias_from_label)


# Apply the same labelling to both splits so their columns stay consistent
_add_label_columns(train_data, 'train')
_add_label_columns(test_data, 'test')

# Keep only the two leanings so the task is binary classification
data_subset = train_data[train_data['bias'].isin(['Conservative', 'Liberal'])].copy()
X = data_subset['sentence']
y = data_subset['bias']

# Split the raw text BEFORE vectorizing: the held-out sentences must not
# contribute to the vocabulary or IDF weights, otherwise the evaluation
# below is optimistically biased (data leakage). random_state keeps the
# shuffle reproducible.
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the TF-IDF vocabulary/weights from the training sentences only,
# then reuse the fitted vectorizer for the held-out sentences.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

# Full-corpus matrix under the train-fitted vectorizer; kept so any later
# code that refers to X_tfidf keeps working.
X_tfidf = tfidf.transform(X)

# Train logistic regression model on the training features
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Score the held-out split: overall accuracy first, then per-class metrics
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:\n', report)