In [15]:
import pandas as pd
import numpy as np

In [2]:
# Load the synthetic Pakistan media dataset from a path relative to the working directory.
raw_df = pd.read_csv('data/pakistan-media-dataset-synthetic.csv')

In [3]:
# Clean a copy so that raw_df keeps the data exactly as it was loaded.
df = raw_df.copy()

# Clean Data

In [4]:
# Inspect the Region/City pairs as loaded, before any cleaning.
df[["Region", "City"]]

Unnamed: 0,Region,City
0,Islamabad,Multan
1,Sindh,Quetta
2,Sindh,Rawalpindi
3,Islamabad,Multan
4,Sindh,Karachi
...,...,...
14995,KPK,Karachi
14996,Punjab,Quetta
14997,Islamabad,Peshawar
14998,Sindh,Multan


## Region-City Cleaning
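The sample above pairs cities with the wrong region (for example, Multan is listed under Islamabad and Quetta under Sindh). Let's fix that by deriving each row's region from its city.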

In [5]:
# Show the distinct city names present in the data.
df["City"].unique()

array(['Multan', 'Quetta', 'Rawalpindi', 'Karachi', 'Peshawar',
       'Islamabad', 'Lahore', 'Hyderabad'], dtype=object)

In [6]:
# Show the distinct region names present in the data.
df["Region"].unique()

array(['Islamabad', 'Sindh', 'Balochistan', 'KPK', 'Punjab', 'AJK'],
      dtype=object)

In [7]:
def get_region(city):
    """Return the region a city belongs to.

    The match is exact (case-sensitive). Any value that is not one of the
    listed city names yields ``pd.NA``.
    """
    # (region, cities in that region) pairs, checked in order.
    cities_by_region = (
        ("Punjab", ("Multan", "Rawalpindi", "Lahore")),
        ("Balochistan", ("Quetta",)),
        ("Sindh", ("Karachi", "Hyderabad")),
        ("KPK", ("Peshawar",)),
        ("Islamabad", ("Islamabad",)),
    )

    for region, cities in cities_by_region:
        if city in cities:
            return region

    return pd.NA

# Overwrite Region with the value derived from each row's City.
df["Region"] = df["City"].apply(get_region)


In [8]:
# Re-display the pairs to check the Region column after it was rebuilt from City.
df[["Region", "City"]]

Unnamed: 0,Region,City
0,Punjab,Multan
1,Balochistan,Quetta
2,Punjab,Rawalpindi
3,Punjab,Multan
4,Sindh,Karachi
...,...,...
14995,Sindh,Karachi
14996,Balochistan,Quetta
14997,KPK,Peshawar
14998,Punjab,Multan


## Language Field Cleaning

In [9]:
# List the distinct Language values to see which spellings need normalizing.
df["Language"].unique()

array(['English', 'urdu', 'Urdu', 'ENG', nan], dtype=object)

In [10]:
# Map each spelling seen in the Language column to a two-letter code.
# Values that are not keys here are left unchanged by replace().
lang_replacement_dict = {
    # English variants
    "English": "en",
    "ENG": "en",
    # Urdu variants
    "urdu": "ur",
    "Urdu": "ur",
}
df["Language"] = df["Language"].replace(lang_replacement_dict)

In [17]:
# Check which Language values remain after the replacement.
df["Language"].unique()

array(['en', 'ur', nan], dtype=object)

## Advertisement Revenue Cleaning

### Fix Different Monetary Values

In [83]:
def get_clean_revenue(d):
    """Convert a raw revenue entry into a plain float amount.

    Accepts numbers, numeric strings, and strings of the form
    ``"<number> <unit>"`` where the unit is ``lakh``, ``crore`` or
    ``million`` (matched case-insensitively; surrounding or repeated
    whitespace is ignored).

    Parameters
    ----------
    d : Any
        Raw cell value from the Revenue column.

    Returns
    -------
    float
        The amount expressed as a plain number. NaN is returned when the
        value is missing, has an unrecognized unit, or cannot otherwise be
        interpreted, so a single bad cell never aborts a column-wide apply.
    """
    unit_multipliers = {
        "lakh": 100_000,
        "crore": 10_000_000,
        "million": 1_000_000,
    }
    not_a_number = float("nan")

    # Fast path: already numeric, or a string that float() understands.
    # Only the two errors float() raises for bad input are caught, so
    # unrelated failures (e.g. KeyboardInterrupt) are not swallowed.
    try:
        return float(d)
    except (TypeError, ValueError):
        pass

    # Anything that is neither numeric nor text (None, lists, ...) is unusable.
    if not isinstance(d, str):
        return not_a_number

    # split() without an argument tolerates leading/trailing/repeated spaces.
    parts = d.split()
    if len(parts) != 2:
        return not_a_number

    value, unit = parts
    multiplier = unit_multipliers.get(unit.lower())
    if multiplier is None:
        return not_a_number

    try:
        return float(value) * multiplier
    except ValueError:
        # The unit was valid but the numeric part was not, e.g. "x crore".
        return not_a_number

In [99]:
# Run every Revenue cell through get_clean_revenue.
df["Revenue"] = df["Revenue"].apply(get_clean_revenue)

### Fix Negative Revenue (clip to 0)

In [103]:
# Coerce anything non-numeric to NaN, then raise negative amounts to 0.
df["Revenue"] = (
    pd.to_numeric(df["Revenue"], errors="coerce")
    .clip(lower=0)
)

## Journalist Cleaning

In [107]:
# List the distinct Journalist values to spot spelling and casing variants.
df["Journalist"].unique()


array(['Najam Sethi', 'Dr. Shahid Masood', 'Kamran Shahid',
       'Matiullah Jan', 'RAUF KLASSRA', 'Ansar Abbasi', 'Umar Cheema',
       'Shahzeb Khanzada', 'shahzeb khanzada', 'Kamran Khan', 'K. Khan',
       'Asma Shirazi', 'Mohsin Raza Khan', 'Saleem Safi', 'Talat Hussain',
       'Rauf Klasra', 'Owais Tohid', 'Nusrat Javed', 'Mohsin Raza'],
      dtype=object)

In [108]:
# Map variant spellings found in the Journalist column onto a single name each.
journalist_replacement_dict = {
    "K. Khan": "Kamran Khan",
    "Mohsin Raza Khan": "Mohsin Raza",
    "RAUF KLASSRA": "Rauf Klasra",
    "shahzeb khanzada": "Shahzeb Khanzada",
}
df["Journalist"] = df["Journalist"].replace(journalist_replacement_dict)

In [109]:
# Check which Journalist values remain after the replacement.
df["Journalist"].unique()

array(['Najam Sethi', 'Dr. Shahid Masood', 'Kamran Shahid',
       'Matiullah Jan', 'Rauf Klasra', 'Ansar Abbasi', 'Umar Cheema',
       'Shahzeb Khanzada', 'Kamran Khan', 'Asma Shirazi', 'Mohsin Raza',
       'Saleem Safi', 'Talat Hussain', 'Owais Tohid', 'Nusrat Javed'],
      dtype=object)

In [110]:
# Title-case every name so capitalization is uniform across the column.
df["Journalist"] = df["Journalist"].str.title()

In [112]:
# Check the Journalist values once more after title-casing.
df["Journalist"].unique()

array(['Najam Sethi', 'Dr. Shahid Masood', 'Kamran Shahid',
       'Matiullah Jan', 'Rauf Klasra', 'Ansar Abbasi', 'Umar Cheema',
       'Shahzeb Khanzada', 'Kamran Khan', 'Asma Shirazi', 'Mohsin Raza',
       'Saleem Safi', 'Talat Hussain', 'Owais Tohid', 'Nusrat Javed'],
      dtype=object)

## Channel Cleaning

In [115]:
# List the distinct Channel values to spot spelling and casing variants.
df["Channel"].unique()

array(['ARY News', 'HUM News', 'Samaa', 'DawnNews', 'SAMAA TV', 'geo',
       'AbbTakk News', 'hum news', 'Geo', 'Express-News', 'GEO NEWS',
       'ARYNEWS', 'ary', 'DAWN', 'Express', 'ARY'], dtype=object)

In [116]:
# Map every variant spelling found in the Channel column onto one name per
# channel. Each variant appears exactly once: a repeated key in a dict
# literal is silently overridden by its last occurrence, which hides typos.
# Values that are not keys here are left unchanged by replace().
channel_replacement_dict = {
    # ARY News
    "ARYNEWS": "ARY News",
    "ARY": "ARY News",
    "ary": "ARY News",
    # Geo News
    "GEO NEWS": "Geo News",
    "Geo": "Geo News",
    "geo": "Geo News",
    # Express News
    "Express-News": "Express News",
    "Express": "Express News",
    # Single-variant channels
    "Samaa": "SAMAA TV",
    "DawnNews": "DAWN",
    "AbbTakk News": "Abb Takk News",
    "hum news": "HUM News",
}
df["Channel"] = df["Channel"].replace(channel_replacement_dict)

In [117]:
# Check which Channel values remain after the replacement.
df["Channel"].unique()

array(['ARY News', 'HUM News', 'SAMAA TV', 'DAWN', 'Geo News',
       'Abb Takk News', 'Express News'], dtype=object)