# Combine and Format Courses

In [1]:
import pandas as pd
from os import listdir

In [2]:
raw_classes = listdir('../data/courses/')
combined_classes = pd.concat([pd.read_json('../data/courses/'+i, lines=True) for i in raw_classes])

In [12]:
combined_classes.to_json('../data/raw-courses.json', orient='records')

In [13]:
print (combined_classes.columns)
combined_classes.head()

Index(['Applicable Fees', 'Approved GE', 'Co-Existing Sections',
       'Corequisites', 'Course', 'Cross References', 'Delivery Modes',
       'Delivery Tools', 'Department', 'Description', 'General Catalog',
       'Instructors', 'Link', 'Management Type', 'Midterm Exam(s)',
       'Minimum Fee Hours', 'Name', 'Prerequisites', 'Professional Exam(s)',
       'Recommendations', 'Registration Information', 'Repeatable',
       'Requirements', 'Restrictions', 'Screenings', 'Seat Reservations',
       'Special Grading Instructions', 'Textbooks and Resources',
       'Time and Location', 'Type', 'courseID', 'college', 'department',
       'course_num', 'name', 'prereqs', 'hrs'],
      dtype='object')


Unnamed: 0,Applicable Fees,Approved GE,Co-Existing Sections,Corequisites,Course,Cross References,Delivery Modes,Delivery Tools,Department,Description,...,Textbooks and Resources,Time and Location,Type,courseID,college,department,course_num,name,prereqs,hrs
0,,,,,EPLS:3000:0001,,,,College of Education\n \n ...,This course offers an examination of foundatio...,...,The following textbook and resource informatio...,Start and end times: \n12:30P - 1:45P\nTTh\n10...,Lecture,EPLS:3000,College of Education,EPLS,3000,Foundations of Education,[],3.0
1,,,,,EPLS:3000:0002,,,,College of Education\n \n ...,This course offers an examination of foundatio...,...,The following textbook and resource informatio...,Start and end times: \n9:30A - 10:45A\nTTh\n10...,Lecture,EPLS:3000,College of Education,EPLS,3000,Foundations of Education,[],3.0
2,,,,,EPLS:3000:0003,,,,College of Education\n \n ...,This course offers an examination of foundatio...,...,The following textbook and resource informatio...,Start and end times: \n12:30P - 1:45P\nTTh\n10...,Lecture,EPLS:3000,College of Education,EPLS,3000,Foundations of Education,[],3.0
3,,Students admitted Summer 2017 or after\n\nValu...,,,EPLS:4180:0AAA,,,,College of Education\n \n ...,"The focus of this course, which is required fo...",...,The following textbook and resource informatio...,Start and end times: \n9:30A - 10:20A\nM\nW107...,Lecture,EPLS:4180,College of Education,EPLS,4180,Human Relations for the Classroom Teachr,[],
4,,Students admitted Summer 2017 or after\n\nValu...,,,EPLS:4180:0A02,,,,College of Education\n \n ...,"The focus of this course, which is required fo...",...,No textbooks required,Start and end times: \n9:30A - 10:45A\nTTh\nN2...,Discussion,EPLS:4180,College of Education,EPLS,4180,Human Relations for the Classroom Teachr,[],3.0


**Formatting**

In [14]:
# each course should have courseID
def format_courseID(course):
    split_name = course.split(':')[:2]
    return ":".join(split_name)


# department -> college
def format_college(x):
    if (x is not None):
        return x.split('\n')[0]
    else:
        return x
    
def format_dep(x):
    if (x is not None):
        return x.split(':')[0]
    else:
        return x

# course #
def format_course_num(x):
    if (x is not None):
        return x.split(':')[1]
    else:
        return x

# name
def format_name(x):
    human = x.split(' ', 1)[1].strip()
    return human.replace('\n', '').strip()

def format_hrs(x):
    if (x is None):
        return 0
    else:
        return x

# prereqs
def format_prereqs(prereq):
    if (prereq == None):
        return []
    elif (type(prereq) != str):
        prereq = str(prereq)
    elif prereq.startswith('None'):
        return []
    
    words = prereq.split()
    courses = [word.replace('(', '').replace(')', '') for word in words if ':' in word]
    
    return courses

# coreqs

In [15]:
# courseID
combined_classes['courseID'] = combined_classes['Course'].apply(format_courseID)

# college
combined_classes['college'] = combined_classes['Department'].apply(format_college)

# department
combined_classes['department'] = combined_classes['Course'].apply(format_dep)

# course num
combined_classes['course_num'] = combined_classes['Course'].apply(format_course_num)

# course name
combined_classes['name'] = combined_classes['Name'].apply(format_name)

# pre reqs
combined_classes['prereqs'] = combined_classes['Prerequisites'].apply(format_prereqs)

# hrs
combined_classes['hrs'] = combined_classes['Minimum Fee Hours'].apply(format_hrs)

In [16]:
output_df = combined_classes[['courseID', 'name', 'college', 'department', 'course_num', 'prereqs', 'hrs', 'Description', 'Link']]
output_df.head()

Unnamed: 0,courseID,name,college,department,course_num,prereqs,hrs,Description,Link
0,EPLS:3000,Foundations of Education,College of Education,EPLS,3000,[],3.0,This course offers an examination of foundatio...,https://myui.uiowa.edu/my-ui/courses/details.p...
1,EPLS:3000,Foundations of Education,College of Education,EPLS,3000,[],3.0,This course offers an examination of foundatio...,https://myui.uiowa.edu/my-ui/courses/details.p...
2,EPLS:3000,Foundations of Education,College of Education,EPLS,3000,[],3.0,This course offers an examination of foundatio...,https://myui.uiowa.edu/my-ui/courses/details.p...
3,EPLS:4180,Human Relations for the Classroom Teachr,College of Education,EPLS,4180,[],,"The focus of this course, which is required fo...",https://myui.uiowa.edu/my-ui/courses/details.p...
4,EPLS:4180,Human Relations for the Classroom Teachr,College of Education,EPLS,4180,[],3.0,"The focus of this course, which is required fo...",https://myui.uiowa.edu/my-ui/courses/details.p...


In [17]:
# get rid of duplicates
print ("OG: {}".format(output_df.shape))

output_df.drop_duplicates(keep='last', inplace=True, subset=['courseID', 'name'])

print ("New: {}".format(output_df.shape))

OG: (7048, 9)
New: (3482, 9)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [18]:
output_df.head()

Unnamed: 0,courseID,name,college,department,course_num,prereqs,hrs,Description,Link
2,EPLS:3000,Foundations of Education,College of Education,EPLS,3000,[],3.0,This course offers an examination of foundatio...,https://myui.uiowa.edu/my-ui/courses/details.p...
6,EPLS:4180,Human Relations for the Classroom Teachr,College of Education,EPLS,4180,[],3.0,"The focus of this course, which is required fo...",https://myui.uiowa.edu/my-ui/courses/details.p...
7,EPLS:4200,Diversity and Inclusion in Athletics,College of Education,EPLS,4200,[],3.0,Certificate for Interscholastic Athletics/Acti...,https://myui.uiowa.edu/my-ui/courses/details.p...
9,EPLS:5090,Instr Coaching for Teaching Excellence,College of Education,EPLS,5090,[],3.0,This section is offered through Distance and O...,https://myui.uiowa.edu/my-ui/courses/details.p...
10,EPLS:5100,Issues and Policies in Higher Education,College of Education,EPLS,5100,[],3.0,"Current selected functions, issues, policies o...",https://myui.uiowa.edu/my-ui/courses/details.p...


In [19]:
base_f = '../data/formatted-courses.json'
output_df.to_json(base_f, orient='records')