In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the dataset from a path relative to the notebook's working directory.
raw_df = pd.read_csv('data/pakistan-media-dataset-synthetic.csv')

In [None]:
# Work on a copy so the cleaning steps never modify raw_df.
df = raw_df.copy()

# Clean Data

In [None]:
# Show the Region/City pairs exactly as they were loaded.
df[["Region", "City"]]

## Region-City Cleaning
Several cities are clearly associated with the wrong region. Let's fix that by deriving the region from the city.

In [None]:
# List the distinct values in the City column.
df["City"].unique()

In [None]:
# List the distinct values in the Region column.
df["Region"].unique()

In [None]:
def get_region(city):
    """Return the region a city belongs to, or ``pd.NA`` when the city is not recognised.

    The comparison is an exact, case-sensitive match against the city names listed below.
    """
    # Groups are checked in order; the first one that contains ``city`` wins.
    region_cities = (
        ("Punjab", ["Multan", "Rawalpindi", "Lahore"]),
        ("Balochistan", ["Quetta"]),
        ("Sindh", ["Karachi", "Hyderabad"]),
        ("KPK", ["Peshawar"]),
        ("Islamabad", ["Islamabad"]),
    )
    for region, cities in region_cities:
        if city in cities:
            return region
    return pd.NA

# Overwrite Region with the value derived from City; cities get_region does not know become pd.NA.
df["Region"] = df["City"].apply(get_region)


In [None]:
# Show the Region/City pairs again, now that Region has been recomputed from City.
df[["Region", "City"]]

## Language Field Cleaning

In [None]:
# List the distinct raw values in the Language column.
df["Language"].unique()

In [None]:
# Spelling variants of each language, mapped onto a two-letter code.
lang_replacement_dict = {
    "English": "en",
    "urdu": "ur",
    "Urdu": "ur",
    "ENG": "en",
}
df["Language"] = df["Language"].replace(to_replace=lang_replacement_dict)

In [None]:
# Check which Language values remain after the replacement.
df["Language"].unique()

## Advertisement Revenue Cleaning

### Fix Different Monetary Values

In [None]:
def get_clean_revenue(d):
    """Convert a monetary entry to a plain float.

    Accepted inputs:
      * anything ``float()`` understands (numbers, numeric strings, NaN);
      * a string of the form ``"<number> <unit>"`` where unit is ``lakh``,
        ``crore`` or ``million`` (case-insensitive, surrounding or repeated
        whitespace is ignored).

    Parameters
    ----------
    d : object
        The raw cell value.

    Returns
    -------
    float
        The numeric amount, or NaN when the entry is missing or cannot be
        interpreted. An unrecognised unit is printed so it can be spotted and
        added to the table below.
    """
    unit_factors = {
        "lakh": 100_000,
        "crore": 10_000_000,
        "million": 1_000_000,
    }
    missing = float("nan")

    # Fast path: the entry is already numeric (or a numeric string).
    try:
        return float(d)
    except (TypeError, ValueError):
        pass

    # None and other non-string objects have nothing left to parse.
    if not isinstance(d, str):
        return missing

    # split() without an argument tolerates leading, trailing and repeated whitespace.
    parts = d.split()
    if len(parts) != 2:
        return missing

    value, value_multiplier = parts
    factor = unit_factors.get(value_multiplier.lower())
    if factor is None:
        print(value_multiplier)
        return missing

    try:
        return float(value) * factor
    except ValueError:
        return missing

In [None]:
# Run every Revenue entry through get_clean_revenue.
df["Revenue"] = df["Revenue"].apply(get_clean_revenue)

### Fix negative revenue

In [None]:
# Coerce anything still non-numeric to NaN, then raise negative amounts to 0.
df["Revenue"] = (
    pd.to_numeric(df["Revenue"], errors="coerce")
    .clip(lower=0)
)

## Journalist Cleaning

In [None]:
# List the distinct raw values in the Journalist column.
df["Journalist"].unique()


In [None]:
# Variant spellings of a journalist's name, mapped onto one canonical spelling.
journalist_replacement_dict = {
    "RAUF KLASSRA": "Rauf Klasra",
    "Mohsin Raza Khan": "Mohsin Raza",
    "K. Khan": "Kamran Khan",
    "shahzeb khanzada": "Shahzeb Khanzada",
}
df["Journalist"] = df["Journalist"].replace(to_replace=journalist_replacement_dict)

In [None]:
# Check the distinct Journalist values after the replacements.
df["Journalist"].unique()

In [None]:
# Normalise casing: str.title() upper-cases the first letter of each word and lower-cases the rest.
df["Journalist"] = df["Journalist"].str.title()

In [None]:
# Check the distinct Journalist values after title-casing.
df["Journalist"].unique()

## Channel Cleaning

In [None]:
# List the distinct raw values in the Channel column.
df["Channel"].unique()

In [None]:
# Raw channel spellings mapped onto their canonical names.
# Each variant is listed exactly once: a repeated key in a dict literal is
# silently overwritten, which hides typos in the mapping.
channel_replacement_dict = {
    "ARYNEWS": "ARY News",
    "Samaa": "SAMAA TV",
    "DawnNews": "DAWN",
    "geo": "Geo News",
    "AbbTakk News": "Abb Takk News",
    "hum news": "HUM News",
    "Geo": "Geo News",
    "Express-News": "Express News",
    "GEO NEWS": "Geo News",
    "ary": "ARY News",
    "Express": "Express News",
    "ARY": "ARY News",
}
df["Channel"] = df["Channel"].replace(channel_replacement_dict)

In [None]:
# Check the distinct Channel values after the replacements.
df["Channel"].unique()

## Topic & Headline Cleaning

In [None]:
# List the distinct values currently stored in the Topic column.
df["Topic"].unique()

In [None]:
# List the distinct headlines.
df["Headline"].unique()

In [None]:
# Keyword lists per topic. Topics are tried in this order and the first topic
# with a matching keyword wins, so a keyword that also appears under an earlier
# topic (e.g. 'blast') never selects the later one.
_TOPIC_KEYWORDS = {
    'Sports': ['cricket', 'football', 'hockey', 'psl', 'match', 'beat', 'defeats', 'team', 'world cup', 'tournament', 'sports', 'babar azam'],
    'Crime': ['attack', 'kill', 'murder', 'suicide', 'bomb', 'blast', 'arrest', 'raid', 'suspect', 'kidnap', 'theft', 'shooting'],
    'Health': ['hospital', 'doctor', 'medicine', 'health', 'dengue', 'polio', 'covid', 'vaccination', 'virus', 'disease', 'shortage', 'mental'],
    'Terrorism': ['terror', 'militant', 'ttp', 'bombing', 'waziristan', 'convoy', 'ispr', 'operation', 'blast'],
    'Media': ['pemra', 'journalist', 'anchor', 'channel', 'media', 'editorial', 'news', 'censorship', 'talkshow', 'report'],
    'Education': ['university', 'school', 'teacher', 'student', 'education', 'curriculum', 'exam', 'scholarship', 'hec'],
    'Judiciary': ['court', 'judge', 'justice', 'cj', 'supreme court', 'high court', 'case', 'suo moto', 'lawyer', 'bar association'],
    'Politics': ['pm', 'assembly', 'minister', 'senate', 'government', 'cabinet', 'opposition', 'election', 'party', 'politics', 'rally', 'pmln', 'ppp', 'khan'],
    'Economy': ['budget', 'deficit', 'economy', 'imf', 'trade', 'stock', 'dollar', 'revenue', 'investment', 'exports', 'imports', 'tax', 'price', 'funds', 'remittance']
}


def get_topic(headline):
    """Assign a topic to a headline by keyword lookup.

    The headline is lower-cased and each keyword is tested as a plain
    substring, so short keywords such as 'pm' also match inside longer words.

    Parameters
    ----------
    headline : str or missing value
        The headline text. Anything that is not a string (NaN, None, ...) has
        no text to classify.

    Returns
    -------
    str
        The first topic with a matching keyword, or 'Other' when no keyword
        matches or the headline is missing.
    """
    # A missing headline has no .lower(); treat it like an unclassifiable one.
    if not isinstance(headline, str):
        return 'Other'

    text = headline.lower()

    for topic, keywords in _TOPIC_KEYWORDS.items():
        if any(word in text for word in keywords):
            return topic

    return 'Other'

In [None]:
# Recompute Topic from the headline text, replacing the values that were in the column.
df["Topic"] = df["Headline"].apply(get_topic)

## Ratings Cleaning

In [None]:
# Cap Ratings at 100; values at or below 100 are left unchanged.
df["Ratings"] = df["Ratings"].clip(upper=100)

## Airtime Cleaning

In [None]:
# Airtime entries that are negative before cleaning.
df.loc[df["Airtime"] < 0, "Airtime"]

In [None]:
# Raise negative Airtime values to 0; other values are left unchanged.
df["Airtime"] = df["Airtime"].clip(lower=0)

In [None]:
# Airtime entries that are still negative after clipping.
df.loc[df["Airtime"] < 0, "Airtime"]

In [None]:
# Airtime entries that are zero or positive.
df.loc[df["Airtime"] >= 0, "Airtime"]

## Bias Score Cleaning

In [None]:
# List the distinct BiasScore values before clipping.
df["BiasScore"].unique()

In [None]:
# Force BiasScore into the closed range [0, 10].
df["BiasScore"] = df["BiasScore"].clip(lower=0, upper=10)

In [None]:
# List the distinct BiasScore values after clipping.
df["BiasScore"].unique()

## Viewership Cleaning

In [None]:
# Smallest and largest Viewership values before cleaning.
df["Viewership"].agg(["min","max"])

In [None]:
def get_cleaned_viewership(f):
    """Truncate a viewership figure to a whole number; NaN is returned unchanged."""
    return f if np.isnan(f) else int(f)

# Run every Viewership entry through get_cleaned_viewership.
df["Viewership"] = df["Viewership"].apply(get_cleaned_viewership)

In [None]:
# Smallest and largest Viewership values after cleaning.
df["Viewership"].agg(["min","max"])

## Shares Cleaning

In [None]:
# Smallest and largest Shares values before cleaning.
df["Shares"].agg(["min","max"])

In [None]:
def get_cleaned_shares(f):
    """Truncate a share count to a whole number; NaN is returned unchanged."""
    if not np.isnan(f):
        return int(f)
    return f

# Run every Shares entry through get_cleaned_shares.
df["Shares"] = df["Shares"].apply(get_cleaned_shares)

In [None]:
# Smallest and largest Shares values after cleaning.
df["Shares"].agg(["min","max"])

Technically, there seems to be nothing to clean in this column; the step above only truncates the values to whole numbers.

## AdSpend Cleaning

In [None]:
# List the distinct raw AdSpend values.
df["AdSpend"].unique()

In [None]:
# Reuse get_clean_revenue to process every AdSpend entry the same way as Revenue.
df["AdSpend"] = df["AdSpend"].apply(get_clean_revenue)

In [None]:
# List the distinct AdSpend values after conversion.
df["AdSpend"].unique()

## Controversy Flag

In [None]:
# List the distinct raw ControversyFlag values.
df["ControversyFlag"].unique()

In [None]:
# Textual flag spellings mapped onto booleans; values not listed here are left as they are.
controversy_flag_replacement_dict = {
    "No": False,
    "1": True,
    "Yes": True,
    "0": False,
}

df["ControversyFlag"] = df["ControversyFlag"].replace(to_replace=controversy_flag_replacement_dict)

In [None]:
# Check the distinct ControversyFlag values after the replacement.
df["ControversyFlag"].unique()

## SocialMediaInteractions Cleaning

In [None]:
# Summary statistics for SocialMediaInteractions before cleaning.
df["SocialMediaInteractions"].agg(["min", "max", "std", "mean"])

In [None]:
def get_cleaned_social_media_interactions(f):
    """Truncate an interaction count to a whole number; NaN is returned unchanged."""
    is_missing = np.isnan(f)
    return f if is_missing else int(f)

# Run every SocialMediaInteractions entry through get_cleaned_social_media_interactions.
df["SocialMediaInteractions"] = df["SocialMediaInteractions"].apply(get_cleaned_social_media_interactions)

In [None]:
# Summary statistics for SocialMediaInteractions after cleaning.
df["SocialMediaInteractions"].agg(["min", "max", "std", "mean"])

## Handling Missing Values

In [None]:
# Percentage of missing values in each numeric column, before filling.
percent_missing = (
    df.select_dtypes(include='number')
    .isna()
    .mean()
    .mul(100)
)
print(percent_missing)

In [None]:
# Fill the gaps in every numeric column with that column's median.
# select_dtypes picks the same columns as the missing-value reports around this
# cell and, unlike np.issubdtype, does not raise on pandas extension dtypes.
# fillna with a Series only touches the columns named in its index, so
# non-numeric columns are left exactly as they are.
df = df.fillna(df.select_dtypes(include="number").median())

In [None]:
# Percentage of missing values in each numeric column, after filling.
percent_missing = (
    df.select_dtypes(include='number')
    .isna()
    .mean()
    .mul(100)
)
print(percent_missing)

# Descriptive Statistics & Plotting

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# The summary table and the plots only cover numeric columns.
numeric_df = df.select_dtypes(include='number')

# One row per column: selected describe() statistics, with '50%' relabelled
# as 'median' and the variance appended as an extra column.
desc_stats = (
    numeric_df.describe()
    .T[['mean', '50%', 'std', 'min', 'max']]
    .rename(columns={'50%': 'median'})
)
desc_stats['variance'] = numeric_df.var()
print("----- Descriptive Statistics -----")
print(desc_stats)

# Columns for which no boxplot is drawn.
ignore_columns = ["ID", "MissingDataFlag"]

# One horizontal boxplot per remaining numeric column.
for col in numeric_df.columns:
    if col in ignore_columns:
        continue
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.boxplot(x=numeric_df[col], ax=ax)
    ax.set_title(f"Boxplot of {col}")
    fig.tight_layout()
    plt.show()

# Investigations

In [None]:
# Column dtypes after the cleaning steps.
df.dtypes

In [None]:
# Display the cleaned frame for a visual sanity check.
df

In [None]:
# Number of rows that have no PoliticalAffiliation.
df["PoliticalAffiliation"].isna().sum()

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Parse Date in place; entries that cannot be parsed become NaT.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
# Only rows that carry a PoliticalAffiliation take part in this analysis.
df_aff = df.dropna(subset=['PoliticalAffiliation']).copy()

preferred = ['Ratings', 'Revenue', 'Airtime', 'BiasScore', 'Viewership', 'AdSpend', 'Shares', 'SocialMediaInteractions']

# Keep the preferred metrics that exist in the frame, in the preferred order.
available_metrics = [c for c in preferred if c in df_aff.columns]
print("Available metrics used:", available_metrics)

group_stats = df_aff.groupby('PoliticalAffiliation')[available_metrics].mean().round(3)
print("\n===== Group means by PoliticalAffiliation =====")
print(group_stats)

# Every plot is drawn twice: first with outlier markers, then without them.
overall_variants = (
    (True, "Boxplot: {}"),
    (False, "Boxplot: {} (No Fliers)"),
)
grouped_variants = (
    (True, "{} by PoliticalAffiliation"),
    (False, "{} by PoliticalAffiliation (No Fliers)"),
)

# Overall distribution of each metric, with the individual points overlaid.
for col in available_metrics:
    for show_fliers, title in overall_variants:
        fig, ax = plt.subplots(figsize=(6, 3.5))
        sns.boxplot(x=df_aff[col], showfliers=show_fliers, ax=ax)
        sns.stripplot(x=df_aff[col], color='0.3', size=3, alpha=0.4, ax=ax)
        ax.set_title(title.format(col))
        fig.tight_layout()
        plt.show()

# Distribution of selected metrics, split by PoliticalAffiliation.
for col in ['Revenue', 'Ratings', 'Airtime', 'Viewership']:
    if col not in df_aff.columns:
        continue
    for show_fliers, title in grouped_variants:
        fig, ax = plt.subplots(figsize=(7, 4))
        sns.boxplot(data=df_aff, x='PoliticalAffiliation', y=col, showfliers=show_fliers, ax=ax)
        ax.set_title(title.format(col))
        fig.tight_layout()
        plt.show()

In [None]:
# Boolean mask of the rows whose PoliticalAffiliation is missing.
affiliation_missing = df['PoliticalAffiliation'].isna()

n_missing = affiliation_missing.sum()
pct_missing = affiliation_missing.mean() * 100
print(f"Missing: {n_missing} rows ({pct_missing:.1f}%)")

# For each grouping column: count the affected rows per value and show the 20 largest groups.
breakdowns = (
    ('Channel', "\nMissing by Channel (top 20):"),
    ('Journalist', "\nMissing by Journalist (top 20):"),
    ('Topic', "\nMissing by Topic (top 20):"),
)
for group_col, header in breakdowns:
    print(header)
    print(df[affiliation_missing].groupby(group_col).size().sort_values(ascending=False).head(20))


# Conclusion

The analysis of this dataset suggests that there is no media bias in Pakistan. However, given the number of errors and missing values in the data, it is difficult to say whether this conclusion can be trusted. To achieve further clarity, a new dataset should be created with proper data entry, ensuring that no fields are missing.