In [24]:
import re
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score


In [54]:
def preprocess_text(text):
    # Remove special characters except apostrophes and hyphens
    text = text.lower()
    text = re.sub(r"[^\w\s\'-]", "", text)
    return text

In [38]:
# Clean data and confirm breakdown of dataset
file_path = "./data/cnn_articles_large_trimmed_labeled_3.csv"
df = pd.read_csv(file_path)

df["cleaned_text"] = df["Article text"].apply(preprocess_text)
print(f"Length of the dataset: {len(df)}")

num_unique_authors = df['Author'].nunique()
print(f"Number of unique authors: {num_unique_authors}")

sentiment_label_counts = df['y_label_sentiment'].value_counts()
political_lean_label_counts = df['y_label_politics'].value_counts()

print("\nCount of each sentiment label:")
print(sentiment_label_counts)

print("\nCount of each political lean label:")
print(political_lean_label_counts)


Length of the dataset: 2461
Number of unique authors: 1234

Count of each sentiment label:
y_label_sentiment
positive    2213
negative     248
Name: count, dtype: int64

Count of each political lean label:
y_label_politics
liberal         1690
conservative     771
Name: count, dtype: int64


# Model: Sentiment, Logistic Regression
(Initial test with a single model)

In [11]:
# Split the data into training and testing sets for sentiment analysis
X_train, X_test, y_train, y_test = train_test_split(df["cleaned_text"], df["y_label_sentiment"], test_size=0.2, random_state=42)

tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Build a pipeline with a TF-IDF vectorizer and a logistic regression model
pipeline = Pipeline([
    ("tfidf", tfidf_vectorizer),
    ("clf", LogisticRegression())
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# Evaluate the model
print("Sentiment Analysis")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=1))



Sentiment Analysis
Accuracy: 0.9026369168356998
              precision    recall  f1-score   support

    negative       1.00      0.00      0.00        48
    positive       0.90      1.00      0.95       445

    accuracy                           0.90       493
   macro avg       0.95      0.50      0.47       493
weighted avg       0.91      0.90      0.86       493



# Model: Political, Logistic Regression

In [12]:
X_train_politics, X_test_politics, y_train_politics, y_test_politics = train_test_split(df["cleaned_text"], df["y_label_politics"], test_size=0.2, random_state=42)

# Build a pipeline for political lean classification
pipeline_politics = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000)),
    ("clf", LogisticRegression())
])

pipeline_politics.fit(X_train_politics, y_train_politics)

y_pred_politics = pipeline_politics.predict(X_test_politics)

print("Political Lean Analysis")
print("Accuracy:", accuracy_score(y_test_politics, y_pred_politics))
print(classification_report(y_test_politics, y_pred_politics, zero_division=1))

Political Lean Analysis
Accuracy: 0.8296146044624746
              precision    recall  f1-score   support

conservative       0.86      0.56      0.68       159
     liberal       0.82      0.96      0.88       334

    accuracy                           0.83       493
   macro avg       0.84      0.76      0.78       493
weighted avg       0.83      0.83      0.82       493



# Base ML Models Sentiment and Political Classification & Comparison

In [34]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(),
    "Random Forest": RandomForestClassifier(),
    "Neural Network": MLPClassifier(max_iter=500)
}

In [41]:
# Generalized evaluation function for sentiment analysis and political bias
def evaluate_model(model, X_train, X_test, y_train, y_test, pos_label):
    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(max_features=5000)),
        ("clf", model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label=pos_label, average="binary", zero_division=1)
    recall = recall_score(y_test, y_pred, pos_label=pos_label, average="binary", zero_division=1)
    f1 = f1_score(y_test, y_pred, pos_label=pos_label, average="binary", zero_division=1)
    return accuracy, precision, recall, f1

In [44]:
# Evaluate each model for sentiment analysis
X_train, X_test, y_train, y_test = train_test_split(df["cleaned_text"], df["y_label_sentiment"], test_size=0.2, random_state=42)

sentiment_metrics = {"Model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

for model_name, model in models.items():
    accuracy, precision, recall, f1 = evaluate_model(model, X_train, X_test, y_train, y_test, pos_label="positive")
    sentiment_metrics["Model"].append(model_name)
    sentiment_metrics["Accuracy"].append(accuracy)
    sentiment_metrics["Precision"].append(precision)
    sentiment_metrics["Recall"].append(recall)
    sentiment_metrics["F1-Score"].append(f1)

In [50]:
# Evaluate each model for political lean classification
X_train_politics, X_test_politics, y_train_politics, y_test_politics = train_test_split(df["cleaned_text"], df["y_label_politics"], test_size=0.2, random_state=42)

political_lean_metrics = {"Model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

for model_name, model in models.items():
    accuracy, precision, recall, f1 = evaluate_model(model, X_train_politics, X_test_politics, y_train_politics, y_test_politics, pos_label="liberal")
    political_lean_metrics["Model"].append(model_name)
    political_lean_metrics["Accuracy"].append(accuracy)
    political_lean_metrics["Precision"].append(precision)
    political_lean_metrics["Recall"].append(recall)
    political_lean_metrics["F1-Score"].append(f1)

In [51]:
# output_path_sentiment = "./data/metrics_sentiment_base_models.csv"
# output_path_political = "./data/metrics_political_base_models_liberal.csv"

sentiment_metrics_df = pd.DataFrame(sentiment_metrics)
political_lean_metrics_df = pd.DataFrame(political_lean_metrics)

# Display the comparison tables
print("Sentiment Analysis Model Comparison")
print(sentiment_metrics_df)
# sentiment_metrics_df.to_csv(output_path_sentiment, index=False)

print("\nPolitical Lean Analysis Model Comparison")
print(political_lean_metrics_df)
# political_lean_metrics_df.to_csv(output_path_political, index=False)

Sentiment Analysis Model Comparison
                    Model  Accuracy  Precision    Recall  F1-Score
0     Logistic Regression  0.902637   0.902637  1.000000  0.948827
1  Support Vector Machine  0.902637   0.902637  1.000000  0.948827
2           Random Forest  0.902637   0.902637  1.000000  0.948827
3          Neural Network  0.910751   0.916840  0.991011  0.952484

Political Lean Analysis Model Comparison
                    Model  Accuracy  Precision    Recall  F1-Score
0     Logistic Regression  0.829615   0.820513  0.958084  0.883978
1  Support Vector Machine  0.809331   0.819149  0.922156  0.867606
2           Random Forest  0.870183   0.855263  0.973054  0.910364
3          Neural Network  0.805274   0.838068  0.883234  0.860058
