### ML models

In [11]:
import os
import json
import time
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.multiclass import unique_labels

# Paths
json_dir = "dl_data-main/analyzed_articles/llama3_8b"
output_dir = "dl_data-main/analyzed_articles/ml_model"
os.makedirs(output_dir, exist_ok=True)
all_data = []

for filename in os.listdir(json_dir):
    if filename.endswith(".json"):
        file_path = os.path.join(json_dir, filename)
        with open(file_path, 'r') as f:
            try:
                file_data = json.load(f)
                all_data.extend(file_data)
            except json.JSONDecodeError as e:
                print(f"Error reading {filename}: {e}")

df = pd.DataFrame(all_data)

# --- Normalize 'analysis' field ---
if 'analysis' in df.columns:
    analysis_df = pd.json_normalize(df['analysis'])
    df = pd.concat([df.drop(columns=['analysis']), analysis_df], axis=1)

# Preprocess
def preprocess_text(text):
    if isinstance(text, str):
        return text.lower()
    elif isinstance(text, list):
        return " ".join(str(item).lower() for item in text)
    else:
        return str(text).lower()

for col in ['source', 'title', 'text', 'keywords']:
    if col in df.columns:
        df[col] = df[col].apply(preprocess_text)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,2))
X = vectorizer.fit_transform(df['text'].astype(str))

# Label Encoding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['bias_category'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models
models = {
    "logistic_regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "linear_svm": LinearSVC(class_weight="balanced"),
    "random_forest": RandomForestClassifier(n_estimators=200, class_weight="balanced")
}

best_model = None
best_accuracy = 0

for name, model in models.items():
    print(f"\nTraining model: {name}")
    start = time.time()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"Accuracy: {acc:.4f}")
    present_labels = unique_labels(y_test, preds)
    present_names = label_encoder.inverse_transform(present_labels)
    print(classification_report(y_test, preds, labels=present_labels, target_names=present_names))

    if acc > best_accuracy:
        best_accuracy = acc
        best_model = (name, model)
    print(f"Training time: {time.time() - start:.2f}s")

model_name, model = best_model
print(f"\nBest model selected: {model_name} (Accuracy: {best_accuracy:.4f})")

# Rule-based Bias Detection
bias_keywords_left = {
    'extreme left': ['radical', 'communism', 'revolution', 'anarchy'],
    'left': ['progressive', 'liberal', 'social justice', 'equality'],
    'slightly left': ['democratic', 'equality', 'human rights', 'inclusive']
}

bias_keywords_right = {
    'extreme right': ['fascism', 'authoritarian', 'nationalism', 'white supremacy'],
    'right': ['conservative', 'traditional', 'patriot', 'family values'],
    'slightly right': ['republican', 'libertarian', 'freedom', 'capitalism']
}

def detect_bias(text):
    if not isinstance(text, str):
        return 'Center'
    words = re.findall(r'\b\w+\b', text.lower())
    left_score = sum(any(word in words for word in kw) for kw in bias_keywords_left.values())
    right_score = sum(any(word in words for word in kw) for kw in bias_keywords_right.values())

    if left_score > right_score:
        if left_score > 3: return 'Extreme Left'
        elif left_score > 1: return 'Left'
        else: return 'Slightly Left'
    elif right_score > left_score:
        if right_score > 3: return 'Extreme Right'
        elif right_score > 1: return 'Right'
        else: return 'Slightly Right'
    else:
        return 'Center'

# Predict for all JSONs
for filename in os.listdir(json_dir):
    if filename.endswith(".json"):
        file_path = os.path.join(json_dir, filename)
        try:
            with open(file_path, 'r') as f:
                articles = json.load(f)
        except Exception as e:
            print(f"Skipping {filename}: {e}")
            continue

        file_df = pd.DataFrame(articles)
        if file_df.empty or 'text' not in file_df.columns:
            continue

        for col in ['source', 'title', 'text', 'keywords']:
            if col in file_df.columns:
                file_df[col] = file_df[col].apply(preprocess_text)

        # TF-IDF transform and predict
        X_file = vectorizer.transform(file_df['text'].astype(str))
        pred_classes = model.predict(X_file)

        # Rule-based prediction
        file_df['predicted_bias_rule'] = file_df['text'].apply(detect_bias)

        # Save comparison CSV
        comparison_data = []
        for i in range(len(file_df)):
            article_id = f"A{i+1:04d}"
            title = file_df.loc[i, 'title'] if 'title' in file_df.columns else ''
            url = file_df.loc[i, 'url'] if 'url' in file_df.columns else ''
            reasoning = file_df.loc[i, 'reasoning'] if 'reasoning' in file_df.columns else ''
            predicted_bias = label_encoder.inverse_transform([pred_classes[i]])[0]
            comparison_data.append({
                'article_id': article_id,
                'title': title,
                'url': url,
                'model': model_name,
                'bias_category': predicted_bias,
                'reasoning': reasoning
            })

        comparison_df = pd.DataFrame(comparison_data)
        comparison_filename = filename.replace(".json", "_comparison.csv")
        comparison_df.to_csv(os.path.join(output_dir, comparison_filename), index=False)
        print(f"Saved: {comparison_filename}")

        # Save pivot
        pivot_df = comparison_df.pivot_table(
            index=['article_id', 'title', 'url'],
            columns='model',
            values='bias_category',
            aggfunc='first'
        ).reset_index()
        pivot_df.columns.name = None
        pivot_filename = filename.replace(".json", "_pivot.csv")
        pivot_df.to_csv(os.path.join(output_dir, pivot_filename), index=False)
        print(f"Saved pivot: {pivot_filename}")



Training model: logistic_regression
Accuracy: 0.7405
                       precision    recall  f1-score   support

               Center       0.93      0.81      0.87       686
         Center-Right       0.00      0.00      0.00         0
          Center/Left       0.00      0.00      0.00         0
 Center/Slightly Left       0.00      0.00      0.00         0
Center/Slightly Right       0.00      0.00      0.00         2
        Extreme Right       0.00      0.00      0.00         3
                Right       0.00      0.00      0.00         5
      Slightly Center       0.10      0.25      0.14         4
        Slightly Left       0.36      0.48      0.41        73
       Slightly Right       0.26      0.45      0.33        67

             accuracy                           0.74       840
            macro avg       0.16      0.20      0.17       840
         weighted avg       0.81      0.74      0.77       840

Training time: 0.85s

Training model: linear_svm


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.8131
                       precision    recall  f1-score   support

               Center       0.89      0.93      0.91       686
         Center-Right       0.00      0.00      0.00         0
 Center/Slightly Left       0.00      0.00      0.00         0
Center/Slightly Right       0.00      0.00      0.00         2
        Extreme Right       0.00      0.00      0.00         3
                Right       0.00      0.00      0.00         5
      Slightly Center       0.00      0.00      0.00         4
        Slightly Left       0.49      0.33      0.39        73
       Slightly Right       0.37      0.33      0.35        67

             accuracy                           0.81       840
            macro avg       0.19      0.18      0.18       840
         weighted avg       0.80      0.81      0.80       840

Training time: 0.60s

Training model: random_forest
Accuracy: 0.7560
                       precision    recall  f1-score   support

               Center       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved: the_indian_express_analyzed_comparison.csv
Saved pivot: the_indian_express_analyzed_pivot.csv
Saved: ndtv_analyzed_comparison.csv
Saved pivot: ndtv_analyzed_pivot.csv
Saved: the_hindu_analyzed_comparison.csv
Saved pivot: the_hindu_analyzed_pivot.csv
Saved: news18_analyzed_comparison.csv
Saved pivot: news18_analyzed_pivot.csv
Saved: times_of_india_analyzed_comparison.csv
Saved pivot: times_of_india_analyzed_pivot.csv
Saved: zee_news_analyzed_comparison.csv
Saved pivot: zee_news_analyzed_pivot.csv
Saved: india_today_analyzed_comparison.csv
Saved pivot: india_today_analyzed_pivot.csv


### Included sampling techniques(smote) and also added xgboost

In [7]:
import os
import json
import time
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.multiclass import unique_labels
from imblearn.over_sampling import SMOTE
from collections import Counter

# Paths
json_dir = "dl_data-main/analyzed_articles/llama3_8b"
output_dir = "dl_data-main/ml_model"
os.makedirs(output_dir, exist_ok=True)
all_data = []

# Load and preprocess data
for filename in os.listdir(json_dir):
    if filename.endswith(".json"):
        file_path = os.path.join(json_dir, filename)
        with open(file_path, 'r') as f:
            try:
                file_data = json.load(f)
                all_data.extend(file_data)
            except json.JSONDecodeError as e:
                print(f"Error reading {filename}: {e}")

df = pd.DataFrame(all_data)

# --- Normalize 'analysis' field ---
if 'analysis' in df.columns:
    analysis_df = pd.json_normalize(df['analysis'])
    df = pd.concat([df.drop(columns=['analysis']), analysis_df], axis=1)

# Preprocess text columns
def preprocess_text(text):
    if isinstance(text, str):
        return text.lower()
    elif isinstance(text, list):
        return " ".join(str(item).lower() for item in text)
    else:
        return str(text).lower()

for col in ['source', 'title', 'text', 'keywords']:
    if col in df.columns:
        df[col] = df[col].apply(preprocess_text)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,2))
X = vectorizer.fit_transform(df['text'].astype(str))

# Label Encoding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['bias_category'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SMOTE Resampling (with safeguards for small classes
# Check class distribution
class_counts = Counter(y_train)
min_class_size = min(class_counts.values())

if min_class_size <= 1:
    print(f"⚠️ Skipping SMOTE: smallest class has only {min_class_size} sample(s).")
    X_train_resampled, y_train_resampled = X_train, y_train
else:
    # Dynamically set k_neighbors
    k_neighbors = min(5, min_class_size - 1)
    print(f"Applying SMOTE with k_neighbors={k_neighbors}")
    smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# # SMOTE Resampling
# smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Models
models = {
    "logistic_regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "linear_svm": LinearSVC(class_weight="balanced"),
    "random_forest": RandomForestClassifier(n_estimators=200, class_weight="balanced"),
    "xgboost": XGBClassifier(n_estimators=200, use_label_encoder=False, eval_metric='mlogloss')
}

best_model = None
best_accuracy = 0

# Train and evaluate models
for name, model in models.items():
    print(f"\nTraining model: {name}")
    start = time.time()
    model.fit(X_train_resampled, y_train_resampled)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"Accuracy: {acc:.4f}")
    present_labels = unique_labels(y_test, preds)
    present_names = label_encoder.inverse_transform(present_labels)
    print(classification_report(y_test, preds, labels=present_labels, target_names=present_names))

    if acc > best_accuracy:
        best_accuracy = acc
        best_model = (name, model)
    print(f"Training time: {time.time() - start:.2f}s")

model_name, model = best_model
print(f"\nBest model selected: {model_name} (Accuracy: {best_accuracy:.4f})")

# Rule-based Bias Detection
bias_keywords_left = {
    'extreme left': ['radical', 'communism', 'revolution', 'anarchy'],
    'left': ['progressive', 'liberal', 'social justice', 'equality'],
    'slightly left': ['democratic', 'equality', 'human rights', 'inclusive']
}

bias_keywords_right = {
    'extreme right': ['fascism', 'authoritarian', 'nationalism', 'white supremacy'],
    'right': ['conservative', 'traditional', 'patriot', 'family values'],
    'slightly right': ['republican', 'libertarian', 'freedom', 'capitalism']
}

def detect_bias(text):
    if not isinstance(text, str):
        return 'Center'
    words = re.findall(r'\b\w+\b', text.lower())
    left_score = sum(any(word in words for word in kw) for kw in bias_keywords_left.values())
    right_score = sum(any(word in words for word in kw) for kw in bias_keywords_right.values())

    if left_score > right_score:
        if left_score > 3: return 'Extreme Left'
        elif left_score > 1: return 'Left'
        else: return 'Slightly Left'
    elif right_score > left_score:
        if right_score > 3: return 'Extreme Right'
        elif right_score > 1: return 'Right'
        else: return 'Slightly Right'
    else:
        return 'Center'

# Predict for all JSONs and generate results
for filename in os.listdir(json_dir):
    if filename.endswith(".json"):
        file_path = os.path.join(json_dir, filename)
        try:
            with open(file_path, 'r') as f:
                articles = json.load(f)
        except Exception as e:
            print(f"Skipping {filename}: {e}")
            continue

        file_df = pd.DataFrame(articles)
        if file_df.empty or 'text' not in file_df.columns:
            continue

        for col in ['source', 'title', 'text', 'keywords']:
            if col in file_df.columns:
                file_df[col] = file_df[col].apply(preprocess_text)

        # TF-IDF transform and predict
        X_file = vectorizer.transform(file_df['text'].astype(str))
        pred_classes = model.predict(X_file)

        # Rule-based prediction
        file_df['predicted_bias_rule'] = file_df['text'].apply(detect_bias)

        # Save comparison CSV
        comparison_data = []
        for i in range(len(file_df)):
            article_id = f"A{i+1:04d}"
            title = file_df.loc[i, 'title'] if 'title' in file_df.columns else ''
            url = file_df.loc[i, 'url'] if 'url' in file_df.columns else ''
            reasoning = file_df.loc[i, 'reasoning'] if 'reasoning' in file_df.columns else ''
            predicted_bias = label_encoder.inverse_transform([pred_classes[i]])[0]
            comparison_data.append({
                'article_id': article_id,
                'title': title,
                'url': url,
                'model': model_name,
                'bias_category': predicted_bias,
                'reasoning': reasoning
            })

        comparison_df = pd.DataFrame(comparison_data)
        comparison_filename = filename.replace(".json", "_comparison.csv")
        comparison_df.to_csv(os.path.join(output_dir, comparison_filename), index=False)
        print(f"Saved: {comparison_filename}")

        # Save pivot
        pivot_df = comparison_df.pivot_table(
            index=['article_id', 'title', 'url'],
            columns='model',
            values='bias_category',
            aggfunc='first'
        ).reset_index()
        pivot_df.columns.name = None
        pivot_filename = filename.replace(".json", "_pivot.csv")
        pivot_df.to_csv(os.path.join(output_dir, pivot_filename), index=False)
        print(f"Saved pivot: {pivot_filename}")


⚠️ Skipping SMOTE: smallest class has only 1 sample(s).

Training model: logistic_regression
Accuracy: 0.7393
                       precision    recall  f1-score   support

               Center       0.93      0.81      0.87       686
         Center-Right       0.00      0.00      0.00         0
          Center/Left       0.00      0.00      0.00         0
 Center/Slightly Left       0.00      0.00      0.00         0
Center/Slightly Right       0.00      0.00      0.00         2
        Extreme Right       0.00      0.00      0.00         3
                Right       0.00      0.00      0.00         5
      Slightly Center       0.10      0.25      0.14         4
        Slightly Left       0.35      0.48      0.41        73
       Slightly Right       0.25      0.43      0.32        67

             accuracy                           0.74       840
            macro avg       0.16      0.20      0.17       840
         weighted avg       0.81      0.74      0.77       840

Train

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.8143
                       precision    recall  f1-score   support

               Center       0.89      0.93      0.91       686
         Center-Right       0.00      0.00      0.00         0
 Center/Slightly Left       0.00      0.00      0.00         0
Center/Slightly Right       0.00      0.00      0.00         2
        Extreme Right       0.00      0.00      0.00         3
                Right       0.00      0.00      0.00         5
      Slightly Center       0.00      0.00      0.00         4
        Slightly Left       0.50      0.33      0.40        73
       Slightly Right       0.37      0.33      0.35        67

             accuracy                           0.81       840
            macro avg       0.20      0.18      0.18       840
         weighted avg       0.80      0.81      0.81       840

Training time: 0.62s

Training model: random_forest
Accuracy: 0.7548
                       precision    recall  f1-score   support

               Center       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.8369
                       precision    recall  f1-score   support

               Center       0.86      0.98      0.91       686
Center/Slightly Right       0.00      0.00      0.00         2
        Extreme Right       0.00      0.00      0.00         3
                Right       0.00      0.00      0.00         5
      Slightly Center       0.00      0.00      0.00         4
        Slightly Left       0.63      0.23      0.34        73
       Slightly Right       0.52      0.21      0.30        67

             accuracy                           0.84       840
            macro avg       0.29      0.20      0.22       840
         weighted avg       0.80      0.84      0.80       840

Training time: 48.40s

Best model selected: xgboost (Accuracy: 0.8369)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved: the_indian_express_analyzed_comparison.csv
Saved pivot: the_indian_express_analyzed_pivot.csv
Saved: ndtv_analyzed_comparison.csv
Saved pivot: ndtv_analyzed_pivot.csv
Saved: the_hindu_analyzed_comparison.csv
Saved pivot: the_hindu_analyzed_pivot.csv
Saved: news18_analyzed_comparison.csv
Saved pivot: news18_analyzed_pivot.csv
Saved: times_of_india_analyzed_comparison.csv
Saved pivot: times_of_india_analyzed_pivot.csv
Saved: zee_news_analyzed_comparison.csv
Saved pivot: zee_news_analyzed_pivot.csv
Saved: india_today_analyzed_comparison.csv
Saved pivot: india_today_analyzed_pivot.csv
