In [53]:
import pandas as pd
import numpy as np
import re

In [77]:
# Load the raw CSV and preview the first rows.
data = pd.read_csv("data/masterstudies_usa_raw_jun1_2021.csv")
data.head()

Unnamed: 0,title,url,school,degree,pace,duration,languages,based,price
0,Master of Engineering in Logistics and Supply ...,https://www.masterstudies.com/Master-of-Engine...,Zaragoza Logistics Center,Master,Full-time,5Months,English,Online & Campus Combined,
1,MS in Information Systems,https://www.masterstudies.com/MS-in-Informatio...,University of Maryland College Park,MSc,Full-time,9-16Months,English,Campus,
2,Master of Arts in Asian Studies,https://www.masterstudies.com/Master-of-Arts-i...,The George Washington University - Elliott Sch...,MA,Full-time,2Years,English,Campus,
3,Master of Science in Athletic Training,https://www.masterstudies.com/Master-of-Scienc...,"King's College - Pennsylvania, USA",MSc,Full-time,2Years,English,Campus,
4,MS Analytics,https://www.masterstudies.com/MS-Analytics/USA...,"Kogod School of Business, American University",MSc,Full-time ...,1-3Years,English,Campus Online,


In [78]:
# Replace every NaN with the literal string "missing"; the helper functions
# defined below call .lower() on these values and would fail on NaN.
data = data.fillna("missing")

In [79]:
# Lower-case terms searched for in programme titles.
KEYWORDS = [
    'data',
    'analysis',
    'analytics',
    'machine learning',
    'ml',
    'artificial intelligence',
    'intelligence',
    'statistics',
]

In [80]:
def keyword_present(x, keywords=None):
    """Flag whether a text mentions any of the given keywords.

    Matching is a case-insensitive *substring* test, so a keyword also
    matches inside a longer word (e.g. 'statistics' in 'Biostatistics', and
    the short keyword 'ml' in any word containing those two letters).

    Parameters
    ----------
    x : str
        Text to search, e.g. a programme title.
    keywords : iterable of str, optional
        Lower-case keywords to look for. Defaults to the notebook-level
        ``KEYWORDS`` list.

    Returns
    -------
    int
        1 if at least one keyword occurs in ``x``, otherwise 0.
    """
    if keywords is None:
        keywords = KEYWORDS

    # Lower-case once instead of once per keyword.
    lowered = x.lower()
    return int(any(word in lowered for word in keywords))

def fulltime(x):
    """Return 1 when the text contains 'full-time' (any casing), else 0."""
    return int('full-time' in x.lower())

def campus(x):
    """Return 1 when the text contains 'campus' (any casing), else 0."""
    mentions_campus = 'campus' in x.lower()
    return 1 if mentions_campus else 0

# Derive one 0/1 flag column per criterion: (new column, source column, flag function).
for flag_column, source_column, flag_func in (
    ('keyword_present', 'title', keyword_present),
    ('fulltime_available', 'pace', fulltime),
    ('campus_available', 'based', campus),
):
    data[flag_column] = data[source_column].apply(flag_func)
            

In [81]:
# Confirm the three flag columns were added.
data.head()

Unnamed: 0,title,url,school,degree,pace,duration,languages,based,price,keyword_present,fulltime_available,campus_available
0,Master of Engineering in Logistics and Supply ...,https://www.masterstudies.com/Master-of-Engine...,Zaragoza Logistics Center,Master,Full-time,5Months,English,Online & Campus Combined,missing,0,1,1
1,MS in Information Systems,https://www.masterstudies.com/MS-in-Informatio...,University of Maryland College Park,MSc,Full-time,9-16Months,English,Campus,missing,0,1,1
2,Master of Arts in Asian Studies,https://www.masterstudies.com/Master-of-Arts-i...,The George Washington University - Elliott Sch...,MA,Full-time,2Years,English,Campus,missing,0,1,1
3,Master of Science in Athletic Training,https://www.masterstudies.com/Master-of-Scienc...,"King's College - Pennsylvania, USA",MSc,Full-time,2Years,English,Campus,missing,0,1,1
4,MS Analytics,https://www.masterstudies.com/MS-Analytics/USA...,"Kogod School of Business, American University",MSc,Full-time ...,1-3Years,English,Campus Online,missing,1,1,1


In [82]:
# How many titles matched at least one keyword (1) versus none (0).
data.keyword_present.value_counts()

0    4219
1     227
Name: keyword_present, dtype: int64

In [83]:
# A programme qualifies only when all three flags are set.
has_keyword = data.keyword_present == 1
is_fulltime = data.fulltime_available == 1
on_campus = data.campus_available == 1
qualified = data.loc[has_keyword & is_fulltime & on_campus].reset_index(drop=True)

In [84]:
# Number of rows and columns left after the three-flag filter.
qualified.shape

(87, 12)

In [85]:
# Preview the programmes that passed the filter.
qualified.head()

Unnamed: 0,title,url,school,degree,pace,duration,languages,based,price,keyword_present,fulltime_available,campus_available
0,MS Analytics,https://www.masterstudies.com/MS-Analytics/USA...,"Kogod School of Business, American University",MSc,Full-time ...,1-3Years,English,Campus Online,missing,1,1,1
1,MS in Marketing Analytics,https://www.masterstudies.com/MS-in-Marketing-...,University of Maryland College Park,MSc,Full-time,10-16Months,English,Campus,missing,1,1,1
2,Master in Applied Artificial Intelligence,https://www.masterstudies.com//Master-in-Appli...,Stevens Institute of Technology - Graduate Stu...,Master,Full-time ...,2Years,English,Campus Online,missing,1,1,1
3,Master in Business Intelligence & Analytics,https://www.masterstudies.com/Master-in-Busine...,Stevens Institute of Technology - Graduate Stu...,Master,Full-time ...,2Years,missing,Campus Online,missing,1,1,1
4,Master in Data Science,https://www.masterstudies.com/Master-in-Data-S...,Stevens Institute of Technology - Graduate Stu...,Master,Full-time ...,2Years,English,Campus,missing,1,1,1


In [86]:
# Show one full pace value; the table preview truncates it to "Full-time ...".
qualified.pace[4]

'Full-time                                                     Part-time'

In [87]:
# List the distinct duration formats and how often each occurs.
qualified.duration.value_counts()

2Years          30
1Years          11
missing          5
1-2Years         5
12-24Months      3
3Semesters       3
18Months         2
2-3Years         2
2-3Semesters     2
4Semesters       2
3-4Semesters     2
18-21Months      1
16-24Months      1
18-22Months      1
10Months         1
10-16Months      1
3-6Semesters     1
9Months          1
10-24Months      1
9-16Months       1
36Hours          1
1-3Years         1
16Months         1
2-5Years         1
4-5Semesters     1
9-12Months       1
5Semesters       1
4-8Semesters     1
18-48Years       1
15Months         1
16-36Months      1
Name: duration, dtype: int64

In [88]:
def rough_duration(x):
    """Estimate a programme's length in months from a free-text duration.

    All numbers found in ``x`` are averaged (so '1-3Years' is treated as
    2 years) and converted to months according to the unit named in the text.

    Parameters
    ----------
    x : str
        Duration text such as '2Years', '10-16Months' or '3-4Semesters'.

    Returns
    -------
    float or int
        Estimated length in months, or -1 when ``x`` is not a string, holds
        no number, or names no recognised unit (e.g. 'missing', '36Hours').
    """
    if not isinstance(x, str):
        return -1

    numbers = [int(n) for n in re.findall(r'\d+', x)]
    if not numbers:
        # No digits (e.g. 'missing'): there is nothing to average.
        return -1
    point = sum(numbers) / len(numbers)

    text = x.lower()
    # 'year' also covers the plural 'years' used throughout the data.
    if 'year' in text:
        return point * 12
    if 'month' in text:
        return point
    if 'semester' in text:
        # 1 semester is 15 weeks in USA; weeks become months at 4 weeks per month.
        return (point * 15) / 4
    return -1

# Add the estimated length in months for every row of `qualified`.
qualified['point_duration'] = qualified.duration.apply(rough_duration)

In [89]:
# Sanity check: show each distinct duration string next to its computed estimate.
qualified[['duration', 'point_duration']].drop_duplicates()

Unnamed: 0,duration,point_duration
0,1-3Years,24.0
1,10-16Months,13.0
2,2Years,24.0
6,3Semesters,11.25
9,1Years,12.0
17,9-16Months,12.5
19,4Semesters,15.0
20,18Months,18.0
21,18-21Months,19.5
22,2-5Years,42.0


In [90]:
# Check column dtypes and non-null counts after adding point_duration.
qualified.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               87 non-null     object 
 1   url                 87 non-null     object 
 2   school              87 non-null     object 
 3   degree              87 non-null     object 
 4   pace                87 non-null     object 
 5   duration            87 non-null     object 
 6   languages           87 non-null     object 
 7   based               87 non-null     object 
 8   price               87 non-null     object 
 9   keyword_present     87 non-null     int64  
 10  fulltime_available  87 non-null     int64  
 11  campus_available    87 non-null     int64  
 12  point_duration      87 non-null     float64
dtypes: float64(1), int64(3), object(9)
memory usage: 9.0+ KB


In [91]:
# price has no data, so drop the column
qualified = qualified.drop(columns=['price'])

In [92]:
# Keep only programmes whose estimated length is at least 12 months.
# Rows where rough_duration returned -1 (no recognised unit) are removed too.
qualified = qualified[qualified.point_duration >= 12]

In [93]:
# Keep only programmes whose listed language is exactly 'English'; rows with
# any other value (including the "missing" placeholder) are removed.
qualified = qualified[qualified.languages == 'English']

In [94]:
# The flag columns and `languages` were only needed for filtering; remove them.
helper_columns = ['keyword_present', 'fulltime_available', 'campus_available', 'languages']
qualified = qualified.drop(columns=helper_columns)

In [99]:
# Remove rows that are identical across all remaining columns.
# The index is not reset, so row labels keep the gaps left by the filters.
qualified = qualified.drop_duplicates()

In [100]:
# Row and column count after all filters and de-duplication.
qualified.shape

(66, 8)

In [113]:
# Preview the filtered result before saving.
qualified.head()

Unnamed: 0,title,url,school,degree,pace,duration,based,point_duration
0,MS Analytics,https://www.masterstudies.com/MS-Analytics/USA...,"Kogod School of Business, American University",MSc,Full-time ...,1-3Years,Campus Online,24.0
1,MS in Marketing Analytics,https://www.masterstudies.com/MS-in-Marketing-...,University of Maryland College Park,MSc,Full-time,10-16Months,Campus,13.0
2,Master in Applied Artificial Intelligence,https://www.masterstudies.com//Master-in-Appli...,Stevens Institute of Technology - Graduate Stu...,Master,Full-time ...,2Years,Campus Online,24.0
4,Master in Data Science,https://www.masterstudies.com/Master-in-Data-S...,Stevens Institute of Technology - Graduate Stu...,Master,Full-time ...,2Years,Campus,24.0
5,Master in Machine Learning,https://www.masterstudies.com/Master-in-Machin...,Stevens Institute of Technology - Graduate Stu...,Master,Full-time ...,2Years,Campus Online,24.0


In [114]:
# Write the filtered programmes to CSV, leaving out the DataFrame index.
qualified.to_csv('data/masterstudies_usa_filtered_jun1_2021.csv', index=False)