# Fixing Minor Problems with the Data

> *NOTE: This notebook has been rerun after the initial issues with the data were fixed, so those issues can no longer be reproduced from the collected data. However, the code here can still help identify the kinds of issues that this notebook was intended to find (and did fix) in the data.*

In [1]:
import os

import pandas as pd

In [2]:
data_path = "../data_collection/data/processed"

# Program tables are read from CSV files.
pgm_india, pgm_usa = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ("india_pgms.csv", "usa_pgms.csv")
)

# Curriculum tables are read from Excel workbooks; the two "Unnamed"
# columns of the USA workbook are discarded right after loading.
curr_india = pd.read_excel(os.path.join(data_path, "curriculum_india.xlsx"))
curr_usa = pd.read_excel(os.path.join(data_path, "curriculum_usa.xlsx")).drop(
    columns=['Unnamed: 4', 'Unnamed: 5']
)

## India

In [3]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit an extra "None".
pgm_india.info()
curr_india.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   uni_name                  33 non-null     object
 1   pgm_name                  33 non-null     object
 2   dur_yrs                   33 non-null     int64 
 3   type                      33 non-null     object
 4   national_rank_qs          33 non-null     int64 
 5   dept_involved             33 non-null     object
 6   location                  33 non-null     object
 7   url                       33 non-null     object
 8   descr                     29 non-null     object
 9   header_tag                21 non-null     object
 10  header_names              21 non-null     object
 11  links                     21 non-null     object
 12  date_pub                  33 non-null     object
 13  date_last_mod             33 non-null     object
 14  html_tags                 21

In [4]:
# Number of distinct URLs in the India program table, then the curriculum table.
for india_frame in (pgm_india, curr_india):
    print(len(india_frame.url.unique()))

33
33


In [5]:
# URL columns of the India program and curriculum tables.
url_pgms, url_curr = pgm_india.url, curr_india.url

# How many URLs the two tables have in common.
len(set(url_pgms) & set(url_curr))

33

In [6]:
# URLs that appear in the program table but not in the curriculum table.
set(url_pgms).difference(url_curr)

set()

In [7]:
# URLs that appear in the curriculum table but not in the program table.
set(url_curr).difference(url_pgms)

set()

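1. Both of the links reported by the two set differences above refer to the same program webpage (after the rerun the differences are empty, so the links are no longer displayed)
2. Therefore, they are encoded as the same URL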

In [8]:
# Rewrite the Amity URL in pgm_india: the variant with the stray
# "#aspnetForm > section" fragment is mapped onto the URL without it.
amity_url_fixes = {
    'https://www.amity.edu/course-details.aspx?fd=FzNymoX3dH0=&cfn=B0SvGFGgVjeUke0/xjPA4g==&CD=B0SvGFGgVjeUke0/x#aspnetForm > sectionjPA4g==': 'https://www.amity.edu/course-details.aspx?fd=FzNymoX3dH0=&cfn=B0SvGFGgVjeUke0/xjPA4g==&CD=B0SvGFGgVjeUke0/xjPA4g==',
}
pgm_india["url"] = pgm_india["url"].replace(amity_url_fixes)


## USA

In [9]:
# Number of distinct URLs in the USA program table, then the curriculum table.
for usa_frame in (pgm_usa, curr_usa):
    print(len(usa_frame.url.unique()))

81
83


In [10]:
# URL columns of the USA program and curriculum tables.
url_pgms, url_curr = pgm_usa.url, curr_usa.url

# How many URLs the two tables have in common.
len(set(url_pgms) & set(url_curr))

81

In [11]:
# URLs that appear in the program table but not in the curriculum table.
set(url_pgms).difference(url_curr)

set()

In [12]:
# URLs that appear in the curriculum table but not in the program table.
set(url_curr).difference(url_pgms)

{'https://mccourt.georgetown.edu/master-of-science-in-data-science-for-public-policy/#_ga=2.261589773.1422061659.1623151628-565262736.1623151629',
 'https://mccourt.georgetown.edu/master-of-science-in-data-science-for-public-policy/#_ga=2.261589773.1422061659.1623151628-565262736.1623151630'}

1. All of these links point to the same program webpage
2. Replace the curriculum URLs with the URL of that program as it appears in `pgm_usa`

In [13]:
# Replace the Georgetown URL variants in curr_usa (not pgm_india) with a
# single URL, so the curriculum rows match the program entry in pgm_usa.
# The variants differ only in the trailing digits of the `_ga` fragment.
georgetown_url = 'https://mccourt.georgetown.edu/master-of-science-in-data-science-for-public-policy/#_ga=2.261589773.1422061659.1623151628-565262736.1623151628'

curr_usa["url"] = curr_usa["url"].replace({
    'https://mccourt.georgetown.edu/master-of-science-in-data-science-for-public-policy/#_ga=2.261589773.1422061659.1623151628-565262736.1623151630': georgetown_url,
    'https://mccourt.georgetown.edu/master-of-science-in-data-science-for-public-policy/#_ga=2.261589773.1422061659.1623151628-565262736.1623151629': georgetown_url,
})


In [14]:
# Verify the fix by reading the URL columns from the frames themselves.
# `url_curr` was captured before curr_usa.url was reassigned, and whether that
# older Series reflects the replacement depends on the pandas version.
set(curr_usa.url) - set(pgm_usa.url)

set()

In [15]:
# Write all four tables to CSV files under `data_path`, without the index.
# NOTE: india_pgms.csv and usa_pgms.csv are the same files the program tables
# were loaded from, so they are overwritten here.
tables_to_save = {
    "india_pgms.csv": pgm_india,
    "india_curr.csv": curr_india,
    "usa_pgms.csv": pgm_usa,
    "usa_curr.csv": curr_usa,
}
for csv_name, table in tables_to_save.items():
    table.to_csv(os.path.join(data_path, csv_name), index=False)