# Media Bias / Fact Check (MBFC)

- https://mediabiasfactcheck.com/

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
base_url = "https://mediabiasfactcheck.com/"

categories = ["center", "left", "leftcenter", "right-center", "right"]

## Collect URLs from Media Bias / Fact Check (MBFC)

In [3]:
mbfc_df = pd.DataFrame(columns=["title_full", "url", "category"])

print(f"Categories to be collected: {categories}")

for category in categories:
    url = base_url + category
    print(f"Collecting URLs from: {url}", end=" ... ")
    
    response = requests.get(url)
    soup = BeautifulSoup(response.text)
    
    table = soup.find(name="table", attrs={"id":"mbfc-table"})
    table_rows = table.find_all("td")

    title_to_url = {}

    for row in table_rows:
        title_full = row.text
        try:
            url = row.find("a").get("href")
        except:
            url = ""
        title_to_url[title_full] = url
        
    df_temp = pd.DataFrame(pd.Series(title_to_url)).reset_index().rename({"index":"title_full", 0:"url"}, axis=1)
    df_temp["category"] = category
    
    mbfc_df = pd.concat([mbfc_df, df_temp]).reset_index(drop=True)
    print("Completed!")
print("All the URLs are collected!!!")

Categories to be collected: ['center', 'left', 'leftcenter', 'right-center', 'right']
Collecting URLs from: https://mediabiasfactcheck.com/center ... Completed!
Collecting URLs from: https://mediabiasfactcheck.com/left ... Completed!
Collecting URLs from: https://mediabiasfactcheck.com/leftcenter ... Completed!
Collecting URLs from: https://mediabiasfactcheck.com/right-center ... Completed!
Collecting URLs from: https://mediabiasfactcheck.com/right ... Completed!
All the URLs are collected!!!


In [4]:
mbfc_df[mbfc_df["url"] == ""]

Unnamed: 0,title_full,url,category
0,\n\n,,center
63,\n\n,,center
1353,\n\n\n\n\n,,left
1404,\n\n,,left
1751,\n\n,,leftcenter
2833,\n\n,,right-center
3399,\n\n,,right


In [5]:
# Drop rows with empty URL value
mbfc_df = mbfc_df[mbfc_df["url"] != ""].reset_index(drop=True)

In [6]:
mbfc_df["title_slug"] = mbfc_df["url"].apply(lambda x: x.split("/")[3])
mbfc_df["title"] = mbfc_df["title_full"].apply(lambda x: x.split("(")[0].strip())

mbfc_df = mbfc_df[["title_full", "title", "title_slug", "url", "category"]]

In [7]:
mbfc_df.head(3)

Unnamed: 0,title_full,title,title_slug,url,category
0,9News – KUSA (9news.com),9News – KUSA,9news-kusa,https://mediabiasfactcheck.com/9news-kusa/,center
1,11 News – KKCO (nbc11news.com),11 News – KKCO,11-news-kkco,https://mediabiasfactcheck.com/11-news-kkco/,center
2,12 News KPNX (12news.com),12 News KPNX,12-news-kpnx,https://mediabiasfactcheck.com/12-news-kpnx/,center


In [8]:
mbfc_df.tail(3)

Unnamed: 0,title_full,title,title_slug,url,category
3600,World Magazine (world.wng.org),World Magazine,world-magazine,https://mediabiasfactcheck.com/world-magazine/,right
3601,Yellow Hammer News (yellowhammernews.com),Yellow Hammer News,yellowhammer-news,https://mediabiasfactcheck.com/yellowhammer-news/,right
3602,Young Americas Foundation (YAF) (www.yaf.org),Young Americas Foundation,young-americas-foundation-yaf,https://mediabiasfactcheck.com/young-americas-...,right


In [9]:
mbfc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3603 entries, 0 to 3602
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title_full  3603 non-null   object
 1   title       3603 non-null   object
 2   title_slug  3603 non-null   object
 3   url         3603 non-null   object
 4   category    3603 non-null   object
dtypes: object(5)
memory usage: 140.9+ KB


In [10]:
mbfc_df["category"].value_counts()

center          1317
leftcenter      1077
right-center     598
left             365
right            246
Name: category, dtype: int64

In [11]:
mbfc_df.to_csv("data/mbfc_urls.csv", index=False, encoding="utf-8")
mbfc_df.to_excel("data/mbfc_urls.xlsx", index=False, encoding="utf-8")

## Collect Bias Info from URLs

In [12]:
bias_df = pd.DataFrame()

for i, url in enumerate(mbfc_df["url"].values):
    if i % 50 == 0:
        print(f"{i:,}/{mbfc_df.shape[0]:,} | {i/mbfc_df.shape[0]*100:.2f}%")
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text)
        rows = [item for item in soup.find_all("p") if "Bias Rating:" in item.text or "Factual Reporting:" in item.text][-1].text.split("\n")
        bias_temp = pd.DataFrame(pd.Series({"url":url} | {'_'.join(row.split(":")[0].lower().split(" ")): row.split(":")[1].strip() for row in rows})).T
        bias_df = pd.concat([bias_df, bias_temp]).reset_index(drop=True)
    except Exception as e:
        print("-"*60)
        print(f"{i}- {url}")
        print(f"Error:\n{e}")
        print("-"*60)
    
print(f"{mbfc_df.shape[0]:,}/{mbfc_df.shape[0]:,} | {mbfc_df.shape[0]/mbfc_df.shape[0]*100:.2f}%")
print("COMPLETED!!!")

0/3,603 | 0.00%
50/3,603 | 1.39%
100/3,603 | 2.78%
150/3,603 | 4.16%
200/3,603 | 5.55%
250/3,603 | 6.94%
300/3,603 | 8.33%
350/3,603 | 9.71%
400/3,603 | 11.10%
450/3,603 | 12.49%
500/3,603 | 13.88%
------------------------------------------------------------
544- https://mediabiasfactcheck.com/ktft-twin-falls-news-bias/
Error:
list index out of range
------------------------------------------------------------
------------------------------------------------------------
546- https://mediabiasfactcheck.com/ktka-topeka-news-bias/
Error:
list index out of range
------------------------------------------------------------
------------------------------------------------------------
547- https://mediabiasfactcheck.com/ktla/
Error:
list index out of range
------------------------------------------------------------
------------------------------------------------------------
548- https://mediabiasfactcheck.com/ktmf-missoula-news-bias/
Error:
list index out of range
--------------------------

2,250/3,603 | 62.45%
------------------------------------------------------------
2287- https://mediabiasfactcheck.com/morrison-county-record-bias/
Error:
list index out of range
------------------------------------------------------------
------------------------------------------------------------
2288- https://mediabiasfactcheck.com/moscow-times/
Error:
list index out of range
------------------------------------------------------------
------------------------------------------------------------
2290- https://mediabiasfactcheck.com/mother-nature-network/
Error:
list index out of range
------------------------------------------------------------
------------------------------------------------------------
2292- https://mediabiasfactcheck.com/moultrie-observer-bias/
Error:
list index out of range
------------------------------------------------------------
2,300/3,603 | 63.84%
2,350/3,603 | 65.22%
2,400/3,603 | 66.61%
------------------------------------------------------------
2426-

In [13]:
bias_df.head(3)

Unnamed: 0,url,bias_rating,factual_reporting,country,press_freedom_rating,media_type,traffic/popularity,mbfc_credibility_rating,mbfc’s_country_freedom_rating,mbfc’s_freedom_rating,press_freedom_rank,world_press_freedom_rank,mbfc’s_country_freedom_rank,mbfc’s_country_freedom_profile,press_freedom_rating_,mbfc_country_freedom_rating,reasoning,_world_press_freedom_rank,questionable_reasoning
0,https://mediabiasfactcheck.com/9news-kusa/,LEAST BIASED,HIGH,USA,MOSTLY FREE,TV Station,High Traffic,HIGH CREDIBILITY,,,,,,,,,,,
1,https://mediabiasfactcheck.com/11-news-kkco/,LEAST BIASED,HIGH,USA,MOSTLY FREE,TV Station,Medium Traffic,HIGH CREDIBILITY,,,,,,,,,,,
2,https://mediabiasfactcheck.com/12-news-kpnx/,LEAST BIASED,HIGH,USA,MOSTLY FREE,TV Station,Medium Traffic,HIGH CREDIBILITY,,,,,,,,,,,


In [14]:
bias_df.tail(3)

Unnamed: 0,url,bias_rating,factual_reporting,country,press_freedom_rating,media_type,traffic/popularity,mbfc_credibility_rating,mbfc’s_country_freedom_rating,mbfc’s_freedom_rating,press_freedom_rank,world_press_freedom_rank,mbfc’s_country_freedom_rank,mbfc’s_country_freedom_profile,press_freedom_rating_,mbfc_country_freedom_rating,reasoning,_world_press_freedom_rank,questionable_reasoning
3550,https://mediabiasfactcheck.com/world-magazine/,RIGHT,MIXED,USA,,Magazine,Medium Traffic,MEDIUM CREDIBILITY,,,MOSTLY FREE,,,,,,,,
3551,https://mediabiasfactcheck.com/yellowhammer-news/,RIGHT,HIGH,USA,MOSTLY FREE,Website,Minimal Traffic,HIGH CREDIBILITY,,,,,,,,,,,
3552,https://mediabiasfactcheck.com/young-americas-...,RIGHT,MIXED,USA,,Organization/Foundation,High Traffic,MEDIUM CREDIBILITY,,,MOSTLY FREE,,,,,,,,


In [15]:
bias_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3553 entries, 0 to 3552
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   url                             3553 non-null   object
 1   bias_rating                     3428 non-null   object
 2   factual_reporting               3553 non-null   object
 3   country                         3546 non-null   object
 4   press_freedom_rating            2042 non-null   object
 5   media_type                      3428 non-null   object
 6   traffic/popularity              3428 non-null   object
 7   mbfc_credibility_rating         3428 non-null   object
 8   mbfc’s_country_freedom_rating   469 non-null    object
 9   mbfc’s_freedom_rating           9 non-null      object
 10  press_freedom_rank              756 non-null    object
 11  world_press_freedom_rank        119 non-null    object
 12  mbfc’s_country_freedom_rank     69 non-null     

In [16]:
bias_df.isna().sum()

url                                  0
bias_rating                        125
factual_reporting                    0
country                              7
press_freedom_rating              1511
media_type                         125
traffic/popularity                 125
mbfc_credibility_rating            125
mbfc’s_country_freedom_rating     3084
mbfc’s_freedom_rating             3544
press_freedom_rank                2797
world_press_freedom_rank          3434
mbfc’s_country_freedom_rank       3484
mbfc’s_country_freedom_profile    3552
press_freedom_rating_             3552
mbfc_country_freedom_rating       3546
reasoning                         3552
_world_press_freedom_rank         3552
questionable_reasoning            3551
dtype: int64

In [17]:
bias_df.to_csv("data/bias_data.csv", index=False, encoding="utf-8")
bias_df.to_excel("data/bias_data.xlsx", index=False, encoding="utf-8")