# Format Courses

In [24]:
import pandas as pd

f = '../data/01-courses-result.json'
old_f = '../data/001-reference-courses.json'

In [25]:
df = pd.read_json(f)

print (df.columns)
df.head()

Index(['courseID', 'description', 'hours', 'href', 'name', 'requirements'], dtype='object')


Unnamed: 0,courseID,description,hours,href,name,requirements
0,ABRD:3010,"University of Swansea, Wales; three-week inter...",ARR s.h.,http://catalog.registrar.uiowa.edu/my-ui/cours...,Iowa Regents Semester in Wales,g.p.a. of at least 2.80
1,ABRD:3011,Unique opportunity to spend an academic year f...,ARR s.h.,http://catalog.registrar.uiowa.edu/my-ui/cours...,Iowa at Oxford,
2,ABRD:3012,Advanced undergraduate study at the University...,ARR s.h.,http://catalog.registrar.uiowa.edu/my-ui/cours...,Regents Semester in Scotland,g.p.a. of at least 3.00
3,ABRD:3013,"Course work in international economics, financ...",ARR s.h.,http://catalog.registrar.uiowa.edu/my-ui/cours...,IES London: Study London,
4,ABRD:3014,Opportunity to study global health issues in t...,ARR s.h.,http://catalog.registrar.uiowa.edu/my-ui/cours...,IES London/Jamaica Hlth Practice/Policy,


In [26]:
old_df = pd.read_json(old_f)
old_df.head()

old_df = old_df.transpose()

In [27]:
old_df.columns
old_df.rename(columns={'link': 'href', 'hrs': 'hours', 'prereqs': 'requirements'}, inplace=True)

In [28]:
get_link_src = lambda x: [i['source'] for i in x]
print (get_link_src([{'source': 'ECON:4800', 'target': 'ECON:6320'}]))

['ECON:4800']


In [29]:
old_df['requirements'] = old_df['requirements'].apply(get_link_src)

In [30]:
old_df.reset_index(inplace=True, drop=True)
old_df.head()

Unnamed: 0,college,courseID,course_num,department,description,hours,href,name,requirements
0,University College,ABRD:3011,3011,ABRD,,18,https://myui.uiowa.edu/my-ui/courses/details.p...,Iowa at Oxford ...,[]
1,University College,ABRD:3012,3012,ABRD,This program at the University of Edinburgh of...,15,https://myui.uiowa.edu/my-ui/courses/details.p...,Regents Semester in Scotland ...,[]
2,University College,ABRD:3013,3013,ABRD,,15,https://myui.uiowa.edu/my-ui/courses/details.p...,IES London: Study London ...,[]
3,University College,ABRD:3030,3030,ABRD,This program at the University College Cork (U...,12,https://myui.uiowa.edu/my-ui/courses/details.p...,Iowa Regents Semester in Ireland ...,[]
4,University College,ABRD:3045,3045,ABRD,The program in Freiburg offers students the op...,15,https://myui.uiowa.edu/my-ui/courses/details.p...,Academic Year in Freiburg ...,[]


In [31]:
# duplicates?
print (len(df))
print (len(df.courseID.unique()))

2627
1389


In [32]:
df.drop_duplicates(subset=['courseID'], inplace=True, keep='first')

### final attrs:

- [x] courseID
- [x] course num
- [x] department
- [x] formatted hours
- [x] href
- [x] description
- [ ] requirements list

In [33]:
form_department = lambda x: x.split(':')[0]
df['department'] = df['courseID'].apply(form_department)

In [34]:
form_course_num = lambda x: int(x.split(':')[1])
df['course_num'] = df['courseID'].apply(form_course_num)

In [35]:
def form_hrs(x):
    # get rid of the trailing ' s.h.'
    y = x[:-5]
    if (y != 'ARR'):
        try:
            return int(y.strip())
        except:
            z = y.split(',')[-1]
            return int(z.strip())
    else:
        return y
    
df['hours'] = df['hours'].apply(form_hrs)

In [36]:
from w3lib.html import replace_escape_chars
def form_html_text(x):
    unescaped = replace_escape_chars(x)
    single_spaced = ' '.join(unescaped.split())
    return single_spaced

df['description'] = df['description'].apply(form_html_text)

In [37]:
def get_req_candidates(x):
    if (x[0] is None):
        return []
    
    reqs = x[0]
    _id = x[1]
    
    ###
    # We could check if there's *any* hard course prereqs and leave the GPA, 
    # but thats for another day
    ###
    formatted = form_html_text(reqs)
    split = formatted.split()
    
    candidates = []
    for i in split:
        if ':' in i:
            i = i.replace(',', '')
            candidates.append(i)
            
    ###
    # Need to write logic for requirements for multiple classes
    # `CEE:4158—CEE:2150` => CEE:2150 is required for 4158, 
    # and we will need to only keep those with the appropriate courseID
    ###
    better_candidates = []
    for i in candidates:
        if '—' in i:
            split = i.split('—')
            if (_id == split[0]):
                better_candidates.append(split[1])
        else:
            better_candidates.append(i)
    
    if len(better_candidates) == 0:
        return []
    else:
        return list(set(better_candidates))
    
df['requirements'] = df[['requirements', 'courseID']].apply(get_req_candidates, axis=1)

In [10]:
df.head()

Unnamed: 0,courseID,description,hours,href,name,requirements,department,course_num
0,ABRD:3010,"University of Swansea, Wales; three-week inter...",ARR,http://catalog.registrar.uiowa.edu/my-ui/cours...,Iowa Regents Semester in Wales,[],ABRD,3010
1,ABRD:3011,Unique opportunity to spend an academic year f...,ARR,http://catalog.registrar.uiowa.edu/my-ui/cours...,Iowa at Oxford,[],ABRD,3011
2,ABRD:3012,Advanced undergraduate study at the University...,ARR,http://catalog.registrar.uiowa.edu/my-ui/cours...,Regents Semester in Scotland,[],ABRD,3012
3,ABRD:3013,"Course work in international economics, financ...",ARR,http://catalog.registrar.uiowa.edu/my-ui/cours...,IES London: Study London,[],ABRD,3013
4,ABRD:3014,Opportunity to study global health issues in t...,ARR,http://catalog.registrar.uiowa.edu/my-ui/cours...,IES London/Jamaica Hlth Practice/Policy,[],ABRD,3014


## Combine old and new DF's

In [41]:
old_slim = old_df[['courseID', 'description', 'hours', 'href', 'name', 'requirements', 'department', 'course_num']]
new_df = pd.concat([df, old_slim])

In [42]:
print (len(new_df))
new_df.drop_duplicates(subset=['courseID'], inplace=True)
print (len(new_df))

4605
4098


### Create links with valid nodes

In [44]:
# longer, more surefire way

# holds all our links
links_list = []

# define function that we'll use to create links
def create_course_links(x):
    if (x[0] is []):
        return
    
    reqs = x[0]
    _id = x[1]
    
    for req in reqs:
        link = {
            'source': req,
            'target': _id
        }
        links_list.append(link)
    
#df[['requirements', 'courseID']].apply(get_req_candidates, axis=1)
invalid_links = new_df[['requirements', 'courseID']].values.tolist()

for i in invalid_links:
    create_course_links(i)

In [59]:
valid_IDs = set(new_df['courseID'].unique().tolist())
valid_links = []

for link in links_list:
    src, trgt = link['source'], link['target']
    
    if src not in valid_IDs:
        print (link)
        continue
    elif trgt not in valid_IDs:
        print (link)
        continue
    else:
        valid_links.append(link)

{'source': 'DPA:3004', 'target': 'ABRD:3019'}
{'source': 'MUS:3004', 'target': 'ABRD:3019'}
{'source': 'ARAB:1001;', 'target': 'ARAB:3050'}
{'source': 'TRNS:2000', 'target': 'ARAB:3498'}
{'source': 'FREN:4890', 'target': 'ARAB:3498'}
{'source': 'SPAN:4980', 'target': 'ARAB:3498'}
{'source': 'TRNS:3202', 'target': 'ARAB:3498'}
{'source': 'ENGL:3724', 'target': 'ARAB:3498'}
{'source': 'TRNS:2000', 'target': 'TRNS:3498'}
{'source': 'FREN:4890', 'target': 'TRNS:3498'}
{'source': 'SPAN:4980', 'target': 'TRNS:3498'}
{'source': 'TRNS:3202', 'target': 'TRNS:3498'}
{'source': 'ENGL:3724', 'target': 'TRNS:3498'}
{'source': 'BIOC:3140;', 'target': 'BIOC:4999'}
{'source': 'ME:3040', 'target': 'CBE:5110'}
{'source': 'ME:3040', 'target': 'ME:5210'}
{'source': 'OEH:4240', 'target': 'OEH:4920'}
{'source': 'OEH:4240;', 'target': 'OEH:4220'}


In [60]:
print (len(links_list))
print (len(valid_links))

1185
1167


In [69]:
# create a map
before_links = {}
for i in valid_IDs:
    before_links[i] = []

# populate map
for i in links_list:
    if i['target'] in valid_IDs:
        if i['source'] in valid_IDs:
            course = before_links[i['target']]
            course.append(i)
    
# move the data to the correct row in DF
new_df['before'] = new_df['courseID'].apply(lambda x: before_links[x])

In [70]:
# create a map
after_links = {}
for i in valid_IDs:
    after_links[i] = []

# populate map
for i in links_list:
    if i['source'] in links_list:
        if i['target'] in valid_IDs:
            course = after_links[i['source']]
            course.append(i)
    
# move the data to the correct row in DF
new_df['after'] = new_df['courseID'].apply(lambda x: after_links[x])

### Sanity Check

See if theres any requirements in our dataframe that don't have an entry.

In [11]:
print (len(df) == len(df.courseID.unique()))

True


In [73]:
rendered_links = new_df['before'].values.tolist()

In [75]:
# [item for sublist in l for item in sublist]
rendered_links = [item for sublist in rendered_links for item in sublist]

In [77]:
for link in rendered_links:
    src, trgt = link['source'], link['target']
    
    if src not in valid_IDs:
        print (link)
        continue
    elif trgt not in valid_IDs:
        print (link)
        continue

***Nice***

Now just standardize the hours and be done!

In [81]:
new_df['hours'].unique()

array(['ARR', 3, 12, 8, 6, 9, 4, 1, 2, 5, 0, None, 7.0], dtype=object)

In [82]:
new_df['hours'] = new_df['hours'].apply(lambda x: 0 if x is None else x)

#### At long last...

In [84]:
new_df.columns

Index(['courseID', 'description', 'hours', 'href', 'name', 'requirements',
       'department', 'course_num', 'before', 'after'],
      dtype='object')

In [85]:
new_df = new_df[['courseID', 'description', 'hours', 'href', 'name',
       'department', 'course_num', 'before', 'after']]

f_out = '../data/02-rowed-courses.json'
new_df.to_json(f_out, orient='records', lines=True)

In [86]:
f_out2 = '../data/02-indexed-courses.json'
new_df.set_index('courseID', drop=False)\
    .to_json(f_out2, orient='index')