In [1]:
import pandas as pd

In [2]:
# Load the synthetic Pakistan media dataset; the path is relative to the working directory.
raw_df = pd.read_csv('data/pakistan-media-dataset-synthetic.csv')

In [4]:
# Work on a copy so the frame held in raw_df is left unmodified by the cleaning steps.
df = raw_df.copy()

# Clean Data

In [11]:
# Show Region next to City to check whether the pairs are consistent.
df[["Region", "City"]]

Unnamed: 0,Region,City
0,Islamabad,Multan
1,Sindh,Quetta
2,Sindh,Rawalpindi
3,Islamabad,Multan
4,Sindh,Karachi
...,...,...
14995,KPK,Karachi
14996,Punjab,Quetta
14997,Islamabad,Peshawar
14998,Sindh,Multan


## Region-City Cleaning
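The output above shows cities paired with the wrong region (for example, Multan listed under Islamabad and Quetta under Sindh). Let's fix that by deriving each row's region from its city.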

In [30]:
# List the distinct city names; these are the values the city-to-region fix has to cover.
df["City"].unique()

array(['Multan', 'Quetta', 'Rawalpindi', 'Karachi', 'Peshawar',
       'Islamabad', 'Lahore', 'Hyderabad'], dtype=object)

In [31]:
# List the distinct region names currently present in the Region column.
df["Region"].unique()

array(['Islamabad', 'Sindh', 'Balochistan', 'KPK', 'Punjab', 'AJK'],
      dtype=object)

In [38]:
def get_region(city):
    """Return the region that a city belongs to.

    Args:
        city: City name. It is compared exactly (case- and
            whitespace-sensitive) against the known city names.

    Returns:
        "Punjab", "Balochistan", "Sindh", "KPK" or "Islamabad" for a known
        city; ``pd.NA`` when the city is not one of the known names.
    """
    # (cities, region) pairs, scanned top to bottom with a plain membership test.
    region_table = (
        (("Multan", "Rawalpindi", "Lahore"), "Punjab"),
        (("Quetta",), "Balochistan"),
        (("Karachi", "Hyderabad"), "Sindh"),
        (("Peshawar",), "KPK"),
        (("Islamabad",), "Islamabad"),
    )
    for cities, region in region_table:
        if city in cities:
            return region
    return pd.NA

# Overwrite Region with the value derived from each row's City;
# rows whose city get_region does not recognise end up as pd.NA.
df["Region"] = df["City"].map(get_region)


In [40]:
# Show Region next to City again to confirm the regions now follow from the cities.
df[["Region", "City"]]

Unnamed: 0,Region,City
0,Punjab,Multan
1,Balochistan,Quetta
2,Punjab,Rawalpindi
3,Punjab,Multan
4,Sindh,Karachi
...,...,...
14995,Sindh,Karachi
14996,Balochistan,Quetta
14997,KPK,Peshawar
14998,Punjab,Multan


## Language Field Cleaning

In [43]:
# List the distinct values in Language to see which spellings need normalising.
df["Language"].unique()

array(['English', 'urdu', 'Urdu', 'ENG', nan], dtype=object)

In [46]:
# Collapse the spelling variants found in the Language column into the codes
# "en" and "ur"; values that are not keys here are left as they are.
lang_replacement_dict = {
    "English": "en",
    "urdu": "ur",
    "Urdu": "ur",
    "ENG": "en",
}
df["Language"] = df["Language"].replace(to_replace=lang_replacement_dict)

In [47]:
# Display the frame after the Region and Language cleaning steps.
df

Unnamed: 0,ID,Journalist,Channel,Newspaper,Region,City,Topic,Headline,Ratings,Revenue,...,BiasScore,Viewership,Shares,AdSpend,ControversyFlag,MissingDataFlag,Date,Language,PoliticalAffiliation,SocialMediaInteractions
0,0,Najam Sethi,ARY News,The News,Punjab,Multan,Sports,Polio cases reported in KPK,10.613151,11648570,...,,1.756573e+06,136184.0,4452987.876,No,1.0,2021-06-21,en,Opposition,
1,1,Dr. Shahid Masood,HUM News,Dawn,Balochistan,Quetta,Crime,Budget deficit reaches record high,27.545705,5 million,...,5.0,1.546170e+06,,1 crore,1,,2019-11-19,ur,,14461.0
2,2,Kamran Shahid,Samaa,Jang,Punjab,Rawalpindi,Health,Attack on Army convoy in Waziristan,50.387473,14072656,...,,,,50 lakh,Yes,,2024-05-07,ur,Pro-Govt,33436.0
3,3,Matiullah Jan,DawnNews,Nawa-i-Waqt,Punjab,Multan,Health,Hospitals face medicine shortages,,812673,...,2.0,1.083972e+06,16712.0,1 crore,1,,2019-10-03,ur,Neutral,
4,4,RAUF KLASSRA,SAMAA TV,The News,Sindh,Karachi,Sports,Chief Justice orders suo moto on Karachi violence,,,...,,7.006350e+05,,3975468.534,,0.0,2018-02-07,ur,Neutral,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,14995,Saleem Safi,AbbTakk News,The News,Sindh,Karachi,Media,Ban on talkshows sparks outrage,93.527230,,...,,3.635476e+00,,50 lakh,Yes,0.0,2019-04-24,en,Neutral,1963.0
14996,14996,Saleem Safi,HUM News,The News,Balochistan,Quetta,Politics,Flood relief funds under scrutiny,,9191778,...,0.0,3.935510e+05,,50 lakh,,,2024-12-20,en,Neutral,
14997,14997,Shahzeb Khanzada,ARYNEWS,Jang,KPK,Peshawar,Economy,Khan addresses rally in Multan,,5 million,...,4.0,1.472680e+06,,3355467.497,No,0.0,2018-11-01,ur,Opposition,29609.0
14998,14998,Saleem Safi,DawnNews,Nawa-i-Waqt,Punjab,Multan,Politics,Aid for Balochistan still missing,23.414386,,...,3.0,,65194.0,1 crore,Yes,0.0,2023-08-01,ur,,18882.0
