# 1. Find Bias

In [1]:
import json

# Function to read JSON data from a file
def read_json_file(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON value produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Function to assess political bias with more detailed categories
# def assess_political_bias(text, left_keywords, right_keywords):
#     # Count matches for left and right keywords
#     left_count = sum(1 for keyword in left_keywords if keyword in text.lower())
#     right_count = sum(1 for keyword in right_keywords if keyword in text.lower())

#     # Determine the political bias based on keyword counts
#     if left_count > 0 and right_count == 0:
#         if left_count > 2:
#             return "Slightly Left"
#         return "Center"
    
#     if right_count > 0 and left_count == 0:
#         if right_count > 2:
#             return "Slightly Right"
#         return "Center"

#     if left_count > right_count:
#         return "Slightly Left"
    
#     if right_count > left_count:
#         return "Slightly Right"
    
#     # If both left and right keywords are present in a similar amount, return 'Center'
#     return "Center"
# Function to process the JSON data and classify the bias
def process_data_and_classify_bias(json_data):
    """Attach a ``political_bias`` label to every article in ``json_data``.

    For each article, the keywords that also appear in the fixed left-leaning
    and right-leaning vocabularies are collected and passed, together with the
    article text, to ``assess_political_bias``.

    NOTE(review): this calls ``assess_political_bias(text, left, right)``; the
    only three-argument definition visible nearby is commented out, so confirm
    which implementation is expected to be in scope when this runs.

    Args:
        json_data: Iterable of article mappings. The keys ``title``, ``text``
            and ``keywords`` are read with ``.get``; missing keys default to
            ``""``, ``""`` and ``[]`` respectively.

    Returns:
        A list with one dict per article holding ``title``, ``text``,
        ``keywords`` and ``political_bias``.
    """
    left_vocabulary = ['socialism', 'equality', 'progressive', 'universal healthcare', 'liberal']
    right_vocabulary = ['capitalism', 'free-market', 'conservative', 'traditional', 'patriotism', 'tax cuts']

    def classify(article):
        # Build the output record for a single article.
        tags = article.get("keywords", [])
        body = article.get("text", "")
        left_hits = [tag for tag in tags if tag in left_vocabulary]
        right_hits = [tag for tag in tags if tag in right_vocabulary]
        return {
            "title": article.get("title", ""),
            "text": body,
            "keywords": tags,
            "political_bias": assess_political_bias(body, left_hits, right_hits),
        }

    return [classify(article) for article in json_data]

# Driver for this cell: load the analyzed articles, label each one with a
# political bias, and print the labelled records as pretty-printed JSON.
# The path below is relative, so it is resolved against the current working
# directory.

# File path for the JSON data (make sure the path is correct)
file_path_articles = 'dl_data-main/analyzed_articles/llama3_8b/india_today_analyzed.json'  # Path to the articles JSON file

# Read JSON data for articles
json_data = read_json_file(file_path_articles)

# Process the data to classify bias
processed_data = process_data_and_classify_bias(json_data)

# Convert to JSON format for output
# (indent=4 pretty-prints; non-ASCII characters are emitted as \uXXXX escapes
# because ensure_ascii is left at its default)
processed_json = json.dumps(processed_data, indent=4)

# Output the processed JSON with bias classifications
# (each record includes the full article text, so the output is large)
print(processed_json)


[
    {
        "title": "Katy Perry says 'no place like home' in first post after space trip",
        "text": "Pop star Katy Perry shared her first post following her space return on Monday. Seemingly signalling her relief of coming back to Earth, the singer expressed her happiness about returning to her home. Perry was one of the few civilians who traveled to space aboard Blue Origin's New Shepard spacecraft for 11 minutes.\n\nOn Tuesday, Katy Perry wrote on X, \"There's no place like home (red heart and Earth emoji) (sic).\"\n\nadvertisement\n\nHere's the post:\n\nPop star Katy Perry and five other women made history on Monday, becoming the first all-female crew in over 60 years to launch into space. The group flew aboard Blue Origin\u2019s NS-31 mission, marking a significant milestone in space tourism.\n\nThe New Shepard rocket carried singer Perry, along with Jeff Bezos\u2019s fiance Lauren Sanchez, civil rights advocate Amanda Nguyen, journalist Gayle King, and former NASA rock

# 2. Find Reasoning 

In [2]:
import json

# Function to read JSON data from a file
def read_json_file(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON value produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Function to assess political bias
def assess_political_bias(text):
    """Classify the political leaning of ``text`` by keyword lookup.

    The text is lower-cased once and tested, in priority order, for substring
    matches against fixed keyword groups. The first group with any match
    decides the label.

    Args:
        text: Article text. ``None`` or an empty string is treated as having
            no detectable bias.

    Returns:
        One of ``"Extreme Right"``, ``"Right"``, ``"Slightly Right"``,
        ``"Slightly Left"`` or ``"Center"``.
    """
    # Missing text (e.g. a JSON null) carries no signal; avoid calling
    # ``.lower()`` on None.
    if not text:
        return "Center"

    # Lower-case once instead of once per keyword test.
    lowered = text.lower()

    # (label, keywords) pairs, checked from highest to lowest priority.
    rules = (
        ("Extreme Right", ('far-right', 'alt-right', 'white nationalism')),
        ("Right", ('capitalism', 'free-market', 'conservative', 'traditional', 'patriotism', 'tax cuts')),
        ("Slightly Right", ('libertarian', 'market economy', 'individual freedom')),
        ("Slightly Left", ('social democracy', 'left-wing', 'green new deal')),
        # The general left-leaning keywords also map to "Slightly Left";
        # this function never emits a plain "Left" label.
        ("Slightly Left", ('socialism', 'equality', 'progressive', 'universal healthcare', 'liberal')),
    )
    for label, keywords in rules:
        if any(keyword in lowered for keyword in keywords):
            return label

    # Default: Center when no keyword group matches.
    return "Center"

# Function to generate reasoning based on the bias classification and article title
def generate_reasoning(category, title):
    """Return a one-sentence explanation for a bias ``category``.

    Args:
        category: Bias label to explain.
        title: Article title, quoted at the end of the sentence.

    Returns:
        The explanation for ``category``, or a generic "could not be
        determined" sentence when the label is not recognised.
    """
    explanations = (
        ("Center", f"The sentiment was neutral, indicating a center-bias. Article Title: '{title}'"),
        ("Slightly Right", f"The sentiment was positive, possibly supporting a right-leaning view. Article Title: '{title}'"),
        ("Slightly Left", f"The sentiment was negative, possibly indicating criticism and left-leaning view. Article Title: '{title}'"),
        ("Right", f"The sentiment was strongly conservative, indicating a right-bias. Article Title: '{title}'"),
        ("Extreme Right", f"The sentiment was strongly conservative, indicating an extreme right-bias. Article Title: '{title}'"),
    )
    # Compare with == in the listed order; the first equal label wins.
    for label, sentence in explanations:
        if category == label:
            return sentence
    return f"Bias classification could not be determined. Article Title: '{title}'"

# Function to process the JSON data and classify the bias
def process_data_and_classify_bias(json_data):
    """Label every article with a political bias and a reasoning sentence.

    Args:
        json_data: Iterable of article mappings. The keys ``title``, ``text``
            and ``keywords`` are read with ``.get``; missing keys default to
            ``""``, ``""`` and ``[]`` respectively.

    Returns:
        A list with one dict per article holding ``title``, ``text``,
        ``keywords``, ``political_bias`` and ``Reasoning``.
    """
    def annotate(article):
        # Classify from the text, then explain the label using the title.
        headline = article.get("title", "")
        body = article.get("text", "")
        label = assess_political_bias(body)
        return {
            "title": headline,
            "text": body,
            "keywords": article.get("keywords", []),
            "political_bias": label,
            "Reasoning": generate_reasoning(label, headline),
        }

    return [annotate(article) for article in json_data]

# Driver for this cell: load the analyzed articles, label each one with a
# political bias plus a reasoning sentence, and print the result as
# pretty-printed JSON. The path below is relative, so it is resolved against
# the current working directory.

# File path for the JSON data (make sure the path is correct)
file_path = 'dl_data-main/analyzed_articles/llama3_8b/india_today_analyzed.json'

# Read JSON data from the file
json_data = read_json_file(file_path)

# Process the data to classify bias and add reasoning
processed_data = process_data_and_classify_bias(json_data)

# Convert to JSON format for output
# (indent=4 pretty-prints; non-ASCII characters are emitted as \uXXXX escapes
# because ensure_ascii is left at its default)
processed_json = json.dumps(processed_data, indent=4)

# Output the processed JSON with bias classifications and reasoning
# (each record includes the full article text, so the output is large)
print(processed_json)


[
    {
        "title": "Katy Perry says 'no place like home' in first post after space trip",
        "text": "Pop star Katy Perry shared her first post following her space return on Monday. Seemingly signalling her relief of coming back to Earth, the singer expressed her happiness about returning to her home. Perry was one of the few civilians who traveled to space aboard Blue Origin's New Shepard spacecraft for 11 minutes.\n\nOn Tuesday, Katy Perry wrote on X, \"There's no place like home (red heart and Earth emoji) (sic).\"\n\nadvertisement\n\nHere's the post:\n\nPop star Katy Perry and five other women made history on Monday, becoming the first all-female crew in over 60 years to launch into space. The group flew aboard Blue Origin\u2019s NS-31 mission, marking a significant milestone in space tourism.\n\nThe New Shepard rocket carried singer Perry, along with Jeff Bezos\u2019s fiance Lauren Sanchez, civil rights advocate Amanda Nguyen, journalist Gayle King, and former NASA rock

# 3. Apply ML/DL model

In [3]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Step 1: Read JSON data
def read_json_file(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON value produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Step 2: Load and prepare dataset
def prepare_dataframe(json_data):
    """Build a DataFrame of articles that have non-blank text.

    Args:
        json_data: Records accepted by ``pd.DataFrame``; a ``text`` column is
            required.

    Returns:
        A DataFrame without the rows whose ``text`` is missing or contains
        only whitespace, re-indexed from 0.

    Raises:
        KeyError: If the records do not produce a ``text`` column.
    """
    df = pd.DataFrame(json_data)
    text = df['text']
    # A missing value compares unequal to "", so it must be excluded
    # explicitly or it would survive the blank-text filter.
    has_text = text.notna() & (text.str.strip() != "")
    # Re-number the rows so positional arrays derived from this frame
    # (features, predictions) line up with its index labels.
    return df[has_text].reset_index(drop=True)

# Step 3: Preprocessing and Vectorization
def vectorize_text(df):
    """Fit a TF-IDF vectorizer on ``df['text']`` and return dense features.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        A ``(features, vectorizer)`` tuple: the array produced by calling
        ``.toarray()`` on the fitted transform, and the fitted
        ``TfidfVectorizer`` (English stop words removed, at most 500
        features).
    """
    tfidf = TfidfVectorizer(max_features=500, stop_words='english')
    weighted_terms = tfidf.fit_transform(df['text'])
    return weighted_terms.toarray(), tfidf

# Step 4: Encode bias labels
def encode_labels(df):
    """Encode the ``political_bias`` column with a ``LabelEncoder``.

    Args:
        df: DataFrame with a ``political_bias`` column.

    Returns:
        A ``(codes, encoder)`` tuple: the encoded labels and the fitted
        ``LabelEncoder`` needed to map codes back to label names.
    """
    encoder = LabelEncoder()
    bias_column = df['political_bias']
    return encoder.fit_transform(bias_column), encoder

# Step 5: Train a Gaussian Naive Bayes model
def train_model(X, y):
    """Fit and return a Gaussian Naive Bayes classifier.

    Args:
        X: Feature matrix passed to ``GaussianNB.fit``.
        y: Target labels passed to ``GaussianNB.fit``.

    Returns:
        The fitted ``GaussianNB`` instance.
    """
    classifier = GaussianNB()
    # ``fit`` returns the estimator itself, so it can be returned directly.
    return classifier.fit(X, y)

# Step 6: Generate reasoning based on predicted category
def generate_reasoning(category, title):
    """Return a one-sentence explanation for a bias ``category``.

    Args:
        category: Bias label to explain.
        title: Article title, quoted at the end of the sentence.

    Returns:
        The explanation for ``category``, or a generic "could not be
        determined" sentence when the label is not recognised.
    """
    # Templates are filled in lazily: only the selected one is formatted.
    sentence_by_label = {
        "Center": "The sentiment was neutral, indicating a center-bias. Article Title: '{title}'",
        "Slightly Right": "The sentiment was positive, possibly supporting a right-leaning view. Article Title: '{title}'",
        "Slightly Left": "The sentiment was negative, possibly indicating criticism and left-leaning view. Article Title: '{title}'",
        "Right": "The sentiment was strongly conservative, indicating a right-bias. Article Title: '{title}'",
        "Slightly Center": "The sentiment was mildly neutral, indicating a slightly centrist view. Article Title: '{title}'",
        "Extreme Right": "The sentiment was strongly conservative, suggesting an extreme right-bias. Article Title: '{title}'",
    }
    chosen = sentence_by_label.get(category)
    if chosen is None:
        chosen = "Bias classification could not be determined. Article Title: '{title}'"
    return chosen.format(title=title)

# Step 7: Predict and classify new data
def classify_articles(model, vectorizer, label_encoder, df):
    """Predict a bias label for every article and build output records.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``; the result must
            support ``.toarray()``.
        label_encoder: Fitted encoder exposing ``inverse_transform``.
        df: DataFrame with ``title`` and ``text`` columns and, optionally,
            a ``keywords`` column.

    Returns:
        A list with one dict per row of ``df`` holding ``title``, ``text``,
        ``keywords``, ``political_bias`` and ``Reasoning``.
    """
    X_test = vectorizer.transform(df['text']).toarray()
    y_pred_encoded = model.predict(X_test)
    y_pred_labels = label_encoder.inverse_transform(y_pred_encoded)

    output = []
    # Pair rows with predictions by position. ``iterrows`` yields index
    # *labels*, which stop matching positions once rows have been filtered
    # out of ``df``, so they must not be used to index ``y_pred_labels``.
    for (_, article), bias in zip(df.iterrows(), y_pred_labels):
        reasoning = generate_reasoning(bias, article['title'])
        output.append({
            "title": article['title'],
            "text": article['text'],
            "keywords": article.get('keywords', []),
            "political_bias": bias,
            "Reasoning": reasoning
        })

    return output

# Main execution
if __name__ == "__main__":
    # End-to-end demo: load the articles, derive heuristic labels, train a
    # Gaussian Naive Bayes classifier on TF-IDF features, report its
    # performance on a held-out split, and print predictions for every
    # article.

    # Load and preprocess data
    file_path = 'dl_data-main/analyzed_articles/llama3_8b/india_today_analyzed.json'
    json_data = read_json_file(file_path)
    df = prepare_dataframe(json_data)

    # Manually label bias for training
    # The labels are keyword heuristics over the lower-cased article text,
    # evaluated in the order written below; the first matching rule wins.
    # NOTE(review): text matching none of the keyword lists falls through to
    # "Right" - confirm this default is intended rather than "Center".
    df['political_bias'] = df['text'].apply(lambda t: 
        "Slightly Left" if any(w in t.lower() for w in ['socialism', 'progressive', 'equality']) else
        "Slightly Right" if any(w in t.lower() for w in ['conservative', 'free-market', 'tax cuts']) else
        "Center" if any(w in t.lower() for w in ['neutral', 'moderate', 'centrist']) else
        "Slightly Center" if any(w in t.lower() for w in ['center-right', 'center-left']) else
        "Extreme Right" if any(w in t.lower() for w in ['far-right', 'extreme right', 'ultra-conservative']) else
        "Right"
    )

    # Vectorize and encode
    # NOTE: the TF-IDF vectorizer is fitted on the full dataset before the
    # split, so the held-out rows also contribute to the fitted vocabulary.
    X, vectorizer = vectorize_text(df)
    y, label_encoder = encode_labels(df)

    # Train/test split
    # 20% of the rows are held out; random_state fixes the shuffle.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train model
    model = train_model(X_train, y_train)

    # Evaluate model (handles mismatch between actual classes in y_test and total classes)
    # Passing every encoded class as `labels` keeps `labels` and
    # `target_names` the same length even when a class is absent from y_test.
    y_test_pred = model.predict(X_test)
    print("Model Evaluation Report:\n", classification_report(
        y_test, 
        y_test_pred, 
        labels=label_encoder.transform(label_encoder.classes_), 
        target_names=label_encoder.classes_, 
        zero_division=0
    ))

    # Final classification on full dataset
    # (this includes the rows the model was trained on)
    processed_data = classify_articles(model, vectorizer, label_encoder, df)

    # Export or print result
    # ensure_ascii=False prints non-ASCII characters as-is instead of \uXXXX.
    processed_json = json.dumps(processed_data, indent=4, ensure_ascii=False)
    print(processed_json)


Model Evaluation Report:
                 precision    recall  f1-score   support

        Center       0.50      1.00      0.67         1
 Extreme Right       0.00      0.00      0.00         0
         Right       0.97      0.97      0.97        38
 Slightly Left       0.00      0.00      0.00         1
Slightly Right       0.00      0.00      0.00         0

      accuracy                           0.95        40
     macro avg       0.29      0.39      0.33        40
  weighted avg       0.94      0.95      0.94        40

[
    {
        "title": "Katy Perry says 'no place like home' in first post after space trip",
        "text": "Pop star Katy Perry shared her first post following her space return on Monday. Seemingly signalling her relief of coming back to Earth, the singer expressed her happiness about returning to her home. Perry was one of the few civilians who traveled to space aboard Blue Origin's New Shepard spacecraft for 11 minutes.\n\nOn Tuesday, Katy Perry wrote on X

# 4. Save to csv files - india_today_comparison.csv and india_today_pivot.csv

In [4]:
# Show the column names of the pivot CSV stored at this path.
path="dl_data-main/analyzed_articles/india_today_pivot.csv"
pd.read_csv(path).columns


Index(['article_id', 'title', 'url', 'llama3:8b'], dtype='object')

In [50]:
# india_today_comparison.csv columns: ['article_id', 'title', 'url', 'model', 'bias_category', 'reasoning']
# india_today_pivot.csv columns: ['article_id', 'title', 'url', <one column per model>]; each model column holds that model's bias category for the article

In [5]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

# Step 1: Read JSON data
def read_json_file(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON value produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Step 2: Load and prepare dataset
def prepare_dataframe(json_data):
    """Build a DataFrame of articles that have non-blank text.

    Args:
        json_data: Records accepted by ``pd.DataFrame``; a ``text`` column is
            required.

    Returns:
        A DataFrame without the rows whose ``text`` is missing or contains
        only whitespace, re-indexed from 0.

    Raises:
        KeyError: If the records do not produce a ``text`` column.
    """
    df = pd.DataFrame(json_data)
    text = df['text']
    # A missing value compares unequal to "", so it must be excluded
    # explicitly or it would survive the blank-text filter.
    has_text = text.notna() & (text.str.strip() != "")
    # Re-number the rows so positional arrays derived from this frame
    # (features, predictions) line up with its index labels.
    return df[has_text].reset_index(drop=True)

# Step 3: Preprocessing and Vectorization
def vectorize_text(df):
    """Fit a TF-IDF vectorizer on ``df['text']`` and return dense features.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        A ``(features, vectorizer)`` tuple: the array produced by calling
        ``.toarray()`` on the fitted transform, and the fitted
        ``TfidfVectorizer`` (English stop words removed, at most 500
        features).
    """
    tfidf = TfidfVectorizer(max_features=500, stop_words='english')
    weighted_terms = tfidf.fit_transform(df['text'])
    return weighted_terms.toarray(), tfidf

# Step 4: Encode bias labels
def encode_labels(df):
    """Encode the ``political_bias`` column with a ``LabelEncoder``.

    Args:
        df: DataFrame with a ``political_bias`` column.

    Returns:
        A ``(codes, encoder)`` tuple: the encoded labels and the fitted
        ``LabelEncoder`` needed to map codes back to label names.
    """
    encoder = LabelEncoder()
    bias_column = df['political_bias']
    return encoder.fit_transform(bias_column), encoder

# Step 5: Train a Gaussian Naive Bayes model
def train_model(X, y):
    """Fit and return a Gaussian Naive Bayes classifier.

    Args:
        X: Feature matrix passed to ``GaussianNB.fit``.
        y: Target labels passed to ``GaussianNB.fit``.

    Returns:
        The fitted ``GaussianNB`` instance.
    """
    classifier = GaussianNB()
    # ``fit`` returns the estimator itself, so it can be returned directly.
    return classifier.fit(X, y)

# Step 6: Generate reasoning based on predicted category
def generate_reasoning(category, title):
    """Return a one-sentence explanation for a bias ``category``.

    Args:
        category: Bias label to explain.
        title: Article title, quoted at the end of the sentence.

    Returns:
        The explanation for ``category``, or a generic "could not be
        determined" sentence when the label is not recognised.
    """
    # Templates are filled in lazily: only the selected one is formatted.
    sentence_by_label = {
        "Center": "The sentiment was neutral, indicating a center-bias. Article Title: '{title}'",
        "Slightly Right": "The sentiment was positive, possibly supporting a right-leaning view. Article Title: '{title}'",
        "Slightly Left": "The sentiment was negative, possibly indicating criticism and left-leaning view. Article Title: '{title}'",
        "Right": "The sentiment was strongly conservative, indicating a right-bias. Article Title: '{title}'",
        "Slightly Center": "The sentiment was slightly off-center but leaning neutral. Article Title: '{title}'",
        "Extreme Right": "The sentiment was highly conservative, indicating an extreme right-bias. Article Title: '{title}'",
    }
    chosen = sentence_by_label.get(category)
    if chosen is None:
        chosen = "Bias classification could not be determined. Article Title: '{title}'"
    return chosen.format(title=title)

# Step 7: Predict and classify new data
def classify_articles(model, vectorizer, label_encoder, df):
    """Predict a bias label for every article and build output records.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``; the result must
            support ``.toarray()``.
        label_encoder: Fitted encoder exposing ``inverse_transform``.
        df: DataFrame with ``title`` and ``text`` columns and, optionally,
            a ``keywords`` column.

    Returns:
        A list with one dict per row of ``df`` holding ``title``, ``text``,
        ``keywords``, ``political_bias`` and ``Reasoning``.
    """
    X_test = vectorizer.transform(df['text']).toarray()
    y_pred_encoded = model.predict(X_test)
    y_pred_labels = label_encoder.inverse_transform(y_pred_encoded)

    output = []
    # Pair rows with predictions by position. ``iterrows`` yields index
    # *labels*, which stop matching positions once rows have been filtered
    # out of ``df``, so they must not be used to index ``y_pred_labels``.
    for (_, article), bias in zip(df.iterrows(), y_pred_labels):
        reasoning = generate_reasoning(bias, article['title'])
        output.append({
            "title": article['title'],
            "text": article['text'],
            "keywords": article.get('keywords', []),
            "political_bias": bias,
            "Reasoning": reasoning
        })
    return output

# Main execution
if __name__ == "__main__":
    # End-to-end demo: load the articles, derive heuristic labels, train a
    # Gaussian Naive Bayes classifier on TF-IDF features, report its
    # performance on a held-out split, then write the predictions for every
    # article to india_today_comparison.csv and india_today_pivot.csv in the
    # current working directory.

    # Load and preprocess data
    file_path = 'dl_data-main/analyzed_articles/llama3_8b/india_today_analyzed.json'
    json_data = read_json_file(file_path)
    df = prepare_dataframe(json_data)

    # Manually label bias for training (for this demo, keyword heuristics).
    # Rules are evaluated in the order written; the first match wins.
    # NOTE(review): text matching none of the keyword lists falls through to
    # "Right", and the bare substring 'extreme' is enough to label a text
    # "Extreme Right" - confirm both are intended.
    df['political_bias'] = df['text'].apply(lambda t:
        "Slightly Left" if any(w in t.lower() for w in ['socialism', 'progressive', 'equality']) else
        "Slightly Right" if any(w in t.lower() for w in ['conservative', 'free-market', 'tax cuts']) else
        "Center" if any(w in t.lower() for w in ['neutral', 'balanced', 'unbiased']) else
        "Slightly Center" if any(w in t.lower() for w in ['center-right', 'moderate']) else
        "Extreme Right" if any(w in t.lower() for w in ['extreme', 'radical right']) else
        "Right"
    )

    # Vectorize and encode
    # NOTE: the TF-IDF vectorizer is fitted on the full dataset before the
    # split, so the held-out rows also contribute to the fitted vocabulary.
    X, vectorizer = vectorize_text(df)
    y, label_encoder = encode_labels(df)

    # Train/test split
    # 20% of the rows are held out; random_state fixes the shuffle.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train model
    model = train_model(X_train, y_train)

    # Evaluate model
    # Derive `labels` from the fitted encoder instead of hard-coding six
    # codes, so `labels` and `target_names` always have the same length,
    # whatever set of classes the heuristic labelling produced.
    y_test_pred = model.predict(X_test)
    print("Model Evaluation Report:\n", classification_report(
        y_test,
        y_test_pred,
        labels=list(range(len(label_encoder.classes_))),
        target_names=label_encoder.classes_,
        zero_division=0
    ))

    # Final classification on full dataset
    # (this includes the rows the model was trained on)
    processed_data = classify_articles(model, vectorizer, label_encoder, df)

    # Assign article_id and dummy URL for this demo
    # IDs are sequential (A0001, A0002, ...); the URLs are placeholders
    # built from the same counter, not the articles' real addresses.
    for i, row in enumerate(processed_data):
        row['article_id'] = f"A{i+1:04d}"
        row['url'] = f"https://www.indiatoday.in/article/{i+1}"
        row['model'] = "naive_bayes"

    # ---- Create india_today_comparison.csv ----
    # Long format: one row per article, with the model's label and reasoning.
    comparison_df = pd.DataFrame(processed_data)[[
        'article_id', 'title', 'url', 'model', 'political_bias', 'Reasoning'
    ]]
    comparison_df.columns = ['article_id', 'title', 'url', 'model', 'bias_category', 'reasoning']
    comparison_df.to_csv('india_today_comparison.csv', index=False)
    print("\nSaved: india_today_comparison.csv")

    # ---- Create india_today_pivot.csv ----
    # Wide format: one row per article and one column per model name holding
    # that model's bias category (here a single "naive_bayes" column).
    pivot_df = comparison_df.pivot_table(
        index=['article_id', 'title', 'url'],
        columns='model',
        values='bias_category',
        aggfunc='first'
    ).reset_index()
    # Drop the "model" name left on the columns axis by the pivot.
    pivot_df.columns.name = None
    pivot_df.to_csv('india_today_pivot.csv', index=False)
    print(" Saved: india_today_pivot.csv")


Model Evaluation Report:
                  precision    recall  f1-score   support

         Center       0.33      0.25      0.29         4
  Extreme Right       0.00      0.00      0.00         0
          Right       0.89      0.94      0.92        35
Slightly Center       0.00      0.00      0.00         0
  Slightly Left       0.00      0.00      0.00         1
 Slightly Right       0.00      0.00      0.00         0

       accuracy                           0.85        40
      macro avg       0.20      0.20      0.20        40
   weighted avg       0.81      0.85      0.83        40


Saved: india_today_comparison.csv
 Saved: india_today_pivot.csv


In [18]:
# List the distinct bias_category values in the comparison CSV at this path.
pd.read_csv("dl_data-main/analyzed_articles/india_today_comparison.csv")["bias_category"].unique()

array(['Center', 'Slightly Left', 'Slightly Right', 'Right',
       'Slightly Center', 'Extreme Right'], dtype=object)