In [1]:
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from IPython.display import display
import time

## Organization Strategy: How Can We Store Info on Polyratings?

### Entities & Features
 
 Entities/Objects
 * Professors
 * Reviews 
 * Courses
 
 Attributes/Features
 * Grades earned by reviewers
 * The Year level (Freshman, Sophomore, etc.) of the reviewers
 * Course
 * Department
 * Professor Abilities 
     * Overall Rating
     * Presents Material Clearly
     * Understands Student Difficulties
 * Number of Reporting Evaluations/Reviews

### review_id:
Many of the statistics are based on the individual sample reviews. These are conveniently accompanied by a 'comment id' in the html of Polyratings.com. 

### prof_id: 
The professors each have their own review page and full statistical analysis page. Both of the urls for these pages end with a unique number. Let's use this as a professor id.

In [2]:
def get_prof_id(url): 
    '''
    Finds a Professor's Id, as specified in the url.
    Example: with the url: http://polyratings.com/eval.phtml?profid=2073
                prof_id is: 2073
    
    Args: 
        url (string)
    Returns:
        prof_id (int)
        
    '''
    prof_id = url.split('profid=')[-1]
    return int(prof_id)

# Start by Parsing Aggregate Data

In [3]:
def clean(words, letter_or_year=True, prof_id=0): 
    '''
    Cleans the Statistics from by Year and By Letter Grade
    
    Args: 
        words: (list): contains strings with the statistics inside.
        
        letter_or_year (bool): 
            True if parsing ratings by letter grades
            False if parsing ratings by year level
            
    Returns: 
        grades (dict): a dictionary of dictionaries that hold statistics. 
    
    '''
    
    grade_section = []
    
    if letter_or_year:
        for item in words: 
            #This is what we split on
            item = item.split('Receiving a Grade of \'')
            grade_section.append(item)
        
        
        grades = dict.fromkeys(['A', 'B', 'C', 'Credit','D', 'F', 'No Credit', 'N/A'])
       
    else: 
        for item in words: 
            #This is what we split on
            item = item.split('With a \'')
            grade_section.append(item)
            
        grades = dict.fromkeys(['Freshman', 'Sophomore', 'Junior', \
                                'Senior', '5th Year Senior', 'Graduate Student'])
        
    if len(grade_section) > 0:
        grade_section = grade_section[0] #The rest is negligible.

        #grade_section is now a list of strings
        for item in grade_section: 
            if letter_or_year:
                key=item[0]

                # We were only looking at the first character
                # We need the first 3 for N/A dict key
                if key == 'N':
                    if item[1] == 'o':
                        key = 'No Credit'
                    else:
                        key = 'N/A'

                if key == 'C' and item[1] == 'r':
                    key = 'Credit'

            else: 
                key = item.split('\' ')[0]

            if key in grades.keys(): 

                # Parse out the extra text strings. 
                # Only keep the ones with colons.
                start = 2
                end = 6
                stats = item.replace('\xa0', '').split('\n')[start: end]

                #Make a dict of each statistic and it's description
                d = dict(stat.split(":") for stat in stats)

                #Parse the types correctly
                for k in d.keys():
                    try:
                        d[k] = int(d[k])
                    except ValueError:
                        d[k] = float(d[k])

                #Assign the statistic dictionaries into the larger dictionary.
                grades[key] = d
    #             df.columns = df.columns.str.replace('Overall Grade','Overall Rating')
    else: 
        print('Professor ___ has nothing: ', prof_id)
    
    return grades #a dictionary of dictionaries

This function helps ensure that values are the correct type.

In [4]:
def parse_types(values):
    ''' Converts strings to floats or integers.
    Args: 
        list of strings
    Returns: 
        list containing floats and integers.
    '''
    
    parsed = []
    for value in values: 
        if '.' in value: 
            value = float(value)
        elif value == '': 
            value = 0
        else: 
            value = int(value)
        parsed.append(value)
        
    return parsed

# Scrape Statistics
get_sample_stats extracts the mean, median, mode, and standard deviation for a professor on his/her full statistical analysis page. Here is an example page: http://polyratings.com/stats.phtml?profid=2073

In [5]:
def get_sample_stats(table_tag):
    
    '''
        Extracts mean, median, mode, and standard deviation for each professor. 
        
        Args: 
            table (td tag): holds tags within with data.
        
        Returns: 
            a DataFrame holding the statistics. 
    
    '''
    
    b_tags = table_tag.findAll('b')
    tds = []
    
    #Find td tags that follow b tags
    for b in b_tags: 
        tds.append(b.findNext('td'))
        
    #Just need the last four items.
    tds = tds[-4:]
    
    tds = [stat.text for stat in tds]
    
    stat_d = {}
    stat_d['mean'], stat_d['median'], stat_d['mode'], stat_d['std_dev'] = \
        parse_types(tds) 

    return pd.DataFrame.from_dict(data=stat_d, orient='index')

# Scrape Professor's Overall Statistics: 
## * Overall rating
## * Ability to present material clearly
## * Ability to understand student difficulties

In [12]:
def get_prof_stats(full_stat_url):
    '''Returns a list of DataFrames of statistics scraped from a professor's
        full stat analysis url
    
        Args: 
            full_stat_url (string): the link to the professor's polyratings
        
        Returns:
            list 
            [prof_df (DataFrame): holds general info about professor
            prof_stats DataFrame]: holds statistics about professor
    
    '''
    
    prof_df = {}
    prof_stats = {}
    
    r = requests.get(full_stat_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    
    # Professor Name and Department(s)
    fonts = soup.findAll('font') #each is type <class 'bs4.element.Tag'>
    name_font = fonts[2].text.replace('\n', ' ') 
    dept_font = fonts[3].text.replace('\n', ' ').split(' Department')[0]
    
    prof_df['Name'] = name_font
    prof_df['Dept'] = dept_font
    
    prof_id = get_prof_id(full_stat_url)
    prof_df['prof_id'] = prof_id
    
    # Professor Statistics
    stats = soup.findAll('table') 
    #we'll have to sort through different ones.
    # stats[2] is the name and dept table
    
    #Sections of HTML
    overall_t = stats[3]
    ability_to_present_t = stats[4]
    ability_to_understand_t = stats[5]
    #Total number of evaluations
    prof_df['Reporting Evaluations'] = int(overall_t.findAll('td', width='15%')[0].text)

    #Get Sample Stats for each table
    overall_sample_stats = get_sample_stats(overall_t)
    ability_to_present_sample_stats = get_sample_stats(ability_to_present_t)
    ability_to_understand_sample_stats = get_sample_stats(ability_to_understand_t)

    prof_stats = pd.merge(left=overall_sample_stats, \
                          right=ability_to_present_sample_stats, \
                         left_index=True, right_index=True, how='outer')
    prof_stats = pd.merge(left=prof_stats, right=ability_to_understand_sample_stats, \
                         left_index=True, right_index=True, how='outer')
    
    prof_stats = pd.DataFrame.transpose(prof_stats)
    prof_stats['rating_type'] = ['Overall Rating', 'Presents Material Clearly', \
                                 'Recognizes Student Difficulties']
    prof_stats['prof_id'] = prof_id
    prof_stats.index = ([0, 1, 2])
    
    prof_stats[['median', 'mode']] = prof_stats[['median', 'mode']].astype(int)
    prof_df = pd.DataFrame(prof_df, index=[0]).set_index(['prof_id'])

    # A list of dataframes that describe a professor
    return [prof_df, prof_stats] 

# Scrape data from a full statistical analysis page including:
## * Overall statistics (by calling get_prof_stats), 
## * Mean rating grouped by grade earned, and 
## * Mean rating grouped by reviewers' year level.

In [13]:
def get_ratings(url):
    '''
    Calls functinos to get a list of DataFrames representing statistics for a professor. 
    
    Args: 
        url (string), the professor's statistical analysis page.
    Returns: 
        A list of DataFrames: 
         * prof_df: 
             the Professor DataFrame specifying name, number of evaluations, department
         * prof_stats:
             the Professor's Overall Rating Statistics
         * letter_grade_ratings:
             the Professor's Rating Statistics Based on Reviewer's Letter Grade Earned
         * year_grade_ratings:
             the Professor's Rating Statistics Based on Reviewer's Year Grade Earned
    
    '''
    
    prof_id = get_prof_id(url)
    print('\nProfessor: ', prof_id)
    
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    
    children = soup.getText().split('Polyratings.com')
    letter_grade_evals = children[3:4]
    year_grade_evals = children[4:5]
    
#     print("Letter_grade_evals in get_ratings()", letter_grade_evals)
    
    if len(letter_grade_evals) > 0:
    
        letter_grade_ratings = \
            pd.DataFrame(clean(letter_grade_evals, letter_or_year=True, prof_id=prof_id))
            
        letter_grade_ratings = \
            letter_grade_ratings.rename(index={'Overall Grade': 'Overall Rating'})
            
        letter_grade_ratings['prof_id'] = prof_id

        year_grade_ratings = \
            pd.DataFrame(clean(year_grade_evals, letter_or_year=False, prof_id=prof_id))
            
        year_grade_ratings = \
            year_grade_ratings.rename(index={'Overall Grade': 'Overall Rating'})
            
        year_grade_ratings['prof_id'] = prof_id

        #Get the mean, median, mode, std_dev of all ratings
        prof_df, prof_stats = get_prof_stats(url)

        return [prof_df, prof_stats, letter_grade_ratings, year_grade_ratings]
    else: 
        return ValueError()

# Collect Data on Each Professor
## * Convert professor page urls to professor full statistical analysis page urls. 
## * Loop through and get_ratings on each professor.

In [14]:
def get_full_stat_urls(professor_paths): 
    '''Helper function, gets links to professors' full statistical analysis pages. 
        
       Args: 
           professor_paths (list of strings): 
               each is a link to a prof's polyratings page.
        
       Returns: 
           full_stat_paths (list of strings): each is a link to a prof's 
               full stat analysis page.
    '''
    
    # The trick is that the prof url might be: 
        # http://polyratings.com/eval.phtml?profid=2073
        
    # while the corresponding full stat analysis url is: 
        # http://polyratings.com/stats.phtml?profid=2073
        
    # Replace the word 'stats' with 'stats'
    
    return [p.replace('eval', 'stats') for p in professor_paths]


def get_all_professors_stats(professor_urls):

    '''Helper function: 
       Slowly collects data on each professor on polyratings.
       Calls parser functions to clean the data.
       
       Args: 
           professor_urls (list): links to professors' polyratings pages.
       Returns: 
           info from get_ratings(full_stat_url) for each professor
           all_prof_stats_dfs (list of lists of DataFrames)
               each item in list is a list containing 4 DataFrames
                    [[first_prof_df, 
                    first_prof_stats, 
                    first_prof_stats_by_letter, 
                    first_prof_stats_by_grade]
                    
                    [second_prof_df, 
                    second_prof_stats, 
                    second_prof_stats_by_letter, 
                    second_prof_stats_by_grade]
                    ...]
            
    '''
    full_stat_urls = get_full_stat_urls(professor_urls)
    all_prof_stats_dfs = []
    
    #Every so often, call data scraping functions on each url.
    for url in full_stat_urls:
        time.sleep(1.0)
        
        full_stat_url = url.replace('eval', 'stats')
        
        #Get the basic prof info as well as mean, median, mode, std_dev of all ratings
        all_prof_stats_dfs.append(get_ratings(full_stat_url))
        
    return all_prof_stats_dfs


## Start by scraping a list of professor page urls.

In [55]:
professor_list_url = "http://polyratings.com/list.phtml"

def get_professor_urls():
    '''Gets links to professor polyratings pages. 
    
       Returns: 
           professor_paths (list of strings): each is a link.
    '''
    professor_list_url
    index = requests.get(professor_list_url)
    soup = BeautifulSoup(index.text, 'html.parser')
    professor_paths = []
    
    for a in soup.findAll('a'):
        if 'eval' in a['href']:
            professor_paths.append(a['href'])
      
    return professor_paths

## Test the code with one professor:  Clint Staley

In [16]:
clint = 'http://polyratings.com/stats.phtml?profid=620'
clint_df, clint_stats, clint_stats_by_grade, clint_stats_by_year = get_ratings(clint)


Professor:  620


In [17]:
clint_df

Unnamed: 0_level_0,Dept,Name,Reporting Evaluations
prof_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
620,Computer Science,Clint Staley,179


In [18]:
clint_stats

Unnamed: 0,median,mean,std_dev,mode,rating_type,prof_id
0,0,3.034,1.41,4,Overall Rating,620
1,0,3.251,1.272,4,Presents Material Clearly,620
2,0,2.872,1.446,4,Recognizes Student Difficulties,620


In [19]:
clint_stats_by_grade

Unnamed: 0,A,B,C,Credit,D,F,N/A,No Credit,prof_id
Overall Rating,3.554,2.692,2.647,,0.75,2.267,2.364,3,620
Presents Material Clearly,3.717,2.974,3.059,,1.5,2.2,2.636,4,620
Recognizes Student Difficulties,3.435,2.692,2.294,,0.25,2.0,1.909,2,620
Reporting Evaluations,92.0,39.0,17.0,,4.0,15.0,11.0,1,620


In [20]:
print(clint_stats_by_grade['B'][0], type(clint_stats_by_grade['B'][0]))
assert type(clint_stats_by_grade['B'][0]) == np.float64

2.692 <class 'numpy.float64'>


In [21]:
clint_stats_by_year

Unnamed: 0,5th Year Senior,Freshman,Graduate Student,Junior,Senior,Sophomore,prof_id
Overall Rating,3.125,3.74,2.667,2.511,2.75,2.927,620
Presents Material Clearly,3.75,3.68,3.333,2.894,3.063,3.145,620
Recognizes Student Difficulties,2.75,3.56,2.333,2.426,2.375,2.818,620
Reporting Evaluations,8.0,50.0,3.0,47.0,16.0,55.0,620


In [22]:
assert clint_stats_by_year['Freshman'][0] == 3.740

# Let's Try Two Professors!
### None values occur when Polyratings does not have data for those sections.

In [23]:
#Test first and last urls
professor_urls = get_professor_urls()[1:]
first = professor_urls[0] #first is a string
last = professor_urls[-1]
two_professors = [first, last]

In [24]:
assert first == 'http://polyratings.com/eval.phtml?profid=2073'
assert last == 'http://polyratings.com/eval.phtml?profid=728'

In [25]:
prof_dfs_list = get_all_professors_stats(two_professors)
# profs_dfs_list is a list of lists of dataframes. 

for i in prof_dfs_list: 
    print('\nAnother Professor\'s List of Data Frames: ', i[0]['Name'])
    for j in i:
        display(j)
    print('\n')


Professor:  2073

Professor:  728

Another Professor's List of Data Frames:  prof_id
2073    Christina Abel
Name: Name, dtype: object


Unnamed: 0_level_0,Dept,Name,Reporting Evaluations
prof_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2073,Chemistry and Biochemistry,Christina Abel,32


Unnamed: 0,median,mean,std_dev,mode,rating_type,prof_id
0,4,3.563,0.556,4,Overall Rating,2073
1,4,3.594,0.551,4,Presents Material Clearly,2073
2,4,3.563,0.658,4,Recognizes Student Difficulties,2073


Unnamed: 0,A,B,C,Credit,D,F,N/A,No Credit,prof_id
Overall Rating,3.769,3.222,3.75,,3,,3,,2073
Presents Material Clearly,3.769,3.222,3.75,,3,,4,,2073
Recognizes Student Difficulties,3.692,3.222,3.75,,4,,3,,2073
Reporting Evaluations,13.0,9.0,8.0,,1,,1,,2073


Unnamed: 0,5th Year Senior,Freshman,Graduate Student,Junior,Senior,Sophomore,prof_id
Overall Rating,,3.5,,4.0,3.333,3.583,2073
Presents Material Clearly,,3.714,,3.667,3.333,3.5,2073
Recognizes Student Difficulties,,3.571,,4.0,3.667,3.417,2073
Reporting Evaluations,,14.0,,3.0,3.0,12.0,2073





Another Professor's List of Data Frames:  prof_id
728    Dick Zweifel
Name: Name, dtype: object


Unnamed: 0_level_0,Dept,Name,Reporting Evaluations
prof_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
728,Architecture,Dick Zweifel,16


Unnamed: 0,median,mean,std_dev,mode,rating_type,prof_id
0,1,1.625,1.317,1,Overall Rating,728
1,2,2.375,1.111,2,Presents Material Clearly,728
2,2,1.875,1.166,1,Recognizes Student Difficulties,728


Unnamed: 0,A,B,C,Credit,D,F,N/A,No Credit,prof_id
Overall Rating,,,,1.083,,4,3.0,,728
Presents Material Clearly,,,,1.917,,4,3.667,,728
Recognizes Student Difficulties,,,,1.333,,4,3.333,,728
Reporting Evaluations,,,,12.0,,1,3.0,,728


Unnamed: 0,5th Year Senior,Freshman,Graduate Student,Junior,Senior,Sophomore,prof_id
Overall Rating,4,1.231,4,2,,,728
Presents Material Clearly,4,2.154,4,2,,,728
Recognizes Student Difficulties,4,1.538,4,2,,,728
Reporting Evaluations,1,13.0,1,1,,,728






In [26]:
this_professor = prof_dfs_list[0]
last_professor = prof_dfs_list[1]

#Test the first table
assert type(this_professor[0].index.values[0]) == np.int64
assert type(this_professor[0]['Dept'].values[0]) == str
assert type(this_professor[0]['Reporting Evaluations'].values[0]) == np.int64
assert type(this_professor[0]['Name'].values[0]) == str

#Test the second table
assert type(this_professor[1]['mode'].values[0]) == np.int64
assert type(this_professor[1]['median'].values[0]) == np.int64
assert type(this_professor[1]['mean'].values[0]) == np.float64
assert type(this_professor[1]['rating_type'].values[0]) == str
assert type(this_professor[1]['prof_id'].values[0]) == np.int64

#Test the third table
assert type(this_professor[2]['A'].values[0]) == np.float64
assert type(this_professor[2]['B'].values[0]) == np.float64
assert type(this_professor[2]['C'].values[0]) == np.float64
assert type(this_professor[2]['D'].values[0]) == np.float64
assert type(this_professor[2]['N/A'].values[0]) == np.float64
assert type(this_professor[2]['prof_id'].values[0]) == np.int64

#Test the fourth table
print(type(this_professor[3]['5th Year Senior'].values[0]))
assert type(last_professor[3]['5th Year Senior'].values[0]) == np.float64

assert type(this_professor[3]['Freshman'].values[0]) == np.float64
assert type(this_professor[3]['Sophomore'].values[0]) == np.float64
assert type(this_professor[3]['Junior'].values[0]) == np.float64
assert type(this_professor[3]['prof_id'].values[0]) == np.int64

<class 'NoneType'>


## We Want 4 Total DataFrames that Hold This Info for All Professors
## -> Merge Each Professors' DataFrames Together
## -> The Professor Ids Identify unique professors in DataFrames

In [27]:
# Concatenate dataframes
def merge_professors(profs_dfs_list):
    
    '''
    Takes a list of professors' lists of DataFrames. 
    Joins each DataFrame into 
     Args: 
         profs_dfs_list (list of lists of DataFrames)
               each item in list is a list containing 4 DataFrames
            [[first_prof_df, first_prof_stats, 
            first_prof_stats_by_letter, 
            first_prof_stats_by_grade]
            [second_prof_df, second_prof_stats, 
            second_prof_stats_by_letter, 
            second_prof_stats_by_grade]
            ...]
            
    Returns:
        
        a list of 4 DataFrames with respective info for all professors. 
            
    
    '''
    
    iterprofs = iter(profs_dfs_list)
    
    first_prof = next(iterprofs)
    if type(first_prof[0]) == pd.DataFrame:
        profs_df = first_prof[0]
        print(profs_df)
    if type(first_prof[1]) == pd.DataFrame:
        profs_stats = first_prof[1]
        print(profs_stats)
    
    profs_stats_by_grade = first_prof[2]
    print(profs_stats_by_grade)
    
    profs_stats_by_year = first_prof[3]
    print(profs_stats_by_year)
    
    #Keep track of the profs_stats_by_letters without concatenating them.
    profs_stats_by_grade_list = [first_prof[2]]
    
    #Iterate through each professor
    for professor_list in iterprofs: 
        if type(professor_list) == list:
            profs_df = pd.concat([profs_df, professor_list[0]])
            profs_stats = pd.concat([profs_stats, \
                                     professor_list[1]]).reset_index(drop=True)
            
            profs_stats_by_letter = pd.concat([profs_stats_by_grade, \
                                               professor_list[2]])
            profs_stats_by_grade = pd.concat([profs_stats_by_year, \
                                              professor_list[3]], \
                                             join='outer', axis=0)
        
            profs_stats_by_grade_list.append(professor_list[2])

    return [profs_df, profs_stats, profs_stats_by_letter, \
            profs_stats_by_grade, profs_stats_by_grade_list]


## Merged Professors' Data Frames

In [28]:
profs_df, profs_stats, profs_stats_by_grade, \
    profs_stats_by_year, profs_stats_by_grade_list = merge_professors(prof_dfs_list)

                               Dept            Name  Reporting Evaluations
prof_id                                                                   
2073     Chemistry and Biochemistry  Christina Abel                     32
   median   mean  std_dev  mode                      rating_type  prof_id
0       4  3.563    0.556     4                   Overall Rating     2073
1       4  3.594    0.551     4        Presents Material Clearly     2073
2       4  3.563    0.658     4  Recognizes Student Difficulties     2073
                                      A      B     C Credit  D     F  N/A  \
Overall Rating                    3.769  3.222  3.75   None  3  None    3   
Presents Material Clearly         3.769  3.222  3.75   None  3  None    4   
Recognizes Student Difficulties   3.692  3.222  3.75   None  4  None    3   
Reporting Evaluations            13.000  9.000  8.00   None  1  None    1   

                                No Credit  prof_id  
Overall Rating                       Non

In [29]:
profs_df

Unnamed: 0_level_0,Dept,Name,Reporting Evaluations
prof_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2073,Chemistry and Biochemistry,Christina Abel,32
728,Architecture,Dick Zweifel,16


In [30]:
profs_stats

Unnamed: 0,median,mean,std_dev,mode,rating_type,prof_id
0,4,3.563,0.556,4,Overall Rating,2073
1,4,3.594,0.551,4,Presents Material Clearly,2073
2,4,3.563,0.658,4,Recognizes Student Difficulties,2073
3,1,1.625,1.317,1,Overall Rating,728
4,2,2.375,1.111,2,Presents Material Clearly,728
5,2,1.875,1.166,1,Recognizes Student Difficulties,728


In [31]:
profs_stats_by_grade

Unnamed: 0,A,B,C,Credit,D,F,N/A,No Credit,prof_id
Overall Rating,3.769,3.222,3.75,,3.0,,3.0,,2073
Presents Material Clearly,3.769,3.222,3.75,,3.0,,4.0,,2073
Recognizes Student Difficulties,3.692,3.222,3.75,,4.0,,3.0,,2073
Reporting Evaluations,13.0,9.0,8.0,,1.0,,1.0,,2073
Overall Rating,,,,1.083,,4.0,3.0,,728
Presents Material Clearly,,,,1.917,,4.0,3.667,,728
Recognizes Student Difficulties,,,,1.333,,4.0,3.333,,728
Reporting Evaluations,,,,12.0,,1.0,3.0,,728


In [32]:
profs_stats_by_year

Unnamed: 0,5th Year Senior,Freshman,Graduate Student,Junior,Senior,Sophomore,prof_id
Overall Rating,,3.5,,4.0,3.333,3.583,2073
Presents Material Clearly,,3.714,,3.667,3.333,3.5,2073
Recognizes Student Difficulties,,3.571,,4.0,3.667,3.417,2073
Reporting Evaluations,,14.0,,3.0,3.0,12.0,2073
Overall Rating,4.0,1.231,4.0,2.0,,,728
Presents Material Clearly,4.0,2.154,4.0,2.0,,,728
Recognizes Student Difficulties,4.0,1.538,4.0,2.0,,,728
Reporting Evaluations,1.0,13.0,1.0,1.0,,,728


In [33]:
profs_stats_by_grade_list[0:2]

[                                      A      B     C Credit  D     F  N/A  \
 Overall Rating                    3.769  3.222  3.75   None  3  None    3   
 Presents Material Clearly         3.769  3.222  3.75   None  3  None    4   
 Recognizes Student Difficulties   3.692  3.222  3.75   None  4  None    3   
 Reporting Evaluations            13.000  9.000  8.00   None  1  None    1   
 
                                 No Credit  prof_id  
 Overall Rating                       None     2073  
 Presents Material Clearly            None     2073  
 Recognizes Student Difficulties      None     2073  
 Reporting Evaluations                None     2073  ,
                                     A     B     C  Credit     D  F    N/A  \
 Overall Rating                   None  None  None   1.083  None  4  3.000   
 Presents Material Clearly        None  None  None   1.917  None  4  3.667   
 Recognizes Student Difficulties  None  None  None   1.333  None  4  3.333   
 Reporting Evaluations   

----------------------------------------------------------------------------------------------

# Scrape and Parse The Professors' Individual Reviews

In [34]:
def clean_review(review):
    '''
    Takes in a review bs4.element.Tag, which is a tr tag with review information
    Puts info into dictionaries, appended into a list.
    
    Args: 
        review (bs4.element.Tag): representing a tr tag.
    
    Returns:
        dictionaries (list): list of dictionaries.
    
    '''
    
    review_dict = {}
    
    #Parse the Review Id
    review_id_tag = review.findNext('a')
    review_id = int(str(review_id_tag).strip(' --></a>').split(' ')[-1])
    review_dict['review_id'] = review_id
    
    
    #Parse the Year and Grade of the student who wrote this review
    br_tags = review.find(['br'])
    years = str(br_tags).split('<br>')[1:3]
    
    #If there is year and grade info, parse it.
    if len(years) > 0:
        years = [x.strip('\n   ') for x in years]
        review_dict['Year'] = years[0]

        grade = years[1].rstrip('</b>').split(' <b>')
        review_dict['Grade'] = grade[1]
    
    #If there is a course name, parse it.
    review_year_tags = review.findAll(['b', 'a'])
    if len(review_year_tags) > 0:
        review_dict['Course'] = \
            str(review_year_tags[1]).rstrip('</a></b>').split('>')[-1]
        
        
    #Now parse the review 
    review_text = str(review).split('</td>')
    if len(review_text) > 1: 
        review_text = review_text[-2]
        
        if review_text.strip() != '<tr><td bgcolor="#ffffff" colspan="3">':
            review_text = \
                review_text.strip().lstrip('<td align="left" valign="top" \
                                            width="79%">').strip()
                
            review_dict['Review'] = review_text
    
    df =  pd.DataFrame.from_dict(review_dict, orient='index')
    
    return df.transpose()

def get_prof_reviews(page_url):
    
    '''
        Calls cleaner functions to tidy up reviews scraped from webpages.
        
        Args: 
            page_url (string): a link to professor's polyratings page.
            
        Returns: 
            reviews_df (DataFrame) a reviews DataFrame for that professor.
    
    '''
    
    r = requests.get(page_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    prof_id = get_prof_id(page_url)
    
    table_rows = soup.findAll('tr')
    table_rows = table_rows
    
    reviews = []
    i = 0
    #clean up each row
    for row in table_rows[6:-3]:
        if i%2 == 0:
            row_df = clean_review(row)

            #if it's a review, add it to the review frame
            if len(row_df.columns) > 0: 
                reviews.append(row_df)
        i+=1 
    
    #connect review frames
    print('Professor Id: ', prof_id)
    if len(reviews) > 0:
        reviews_df = pd.concat(reviews)
        #tack on professor id.
        reviews_df['prof_id'] = prof_id
        
        return reviews_df
    else: 
        print('No reviews to concatenate.')


def get_all_profs_reviews(review_page_urls): 
    '''Helper function: 
       Slowly collects reviews on each professor on polyratings.
       Calls parser functions to clean the data.
       
       Args: 
           review_page_urls (list): links to professors' polyratings pages.
           
       Returns: 
           info from get_prof_reviews(page_url) for each professor
           reviews(list of lists of DataFrames)
               each item in list is a list containing 4 DataFrames
                    [[]
                    []
                    ...]
    '''

    professors_reviews = []
    
    #Every so often, call data scraping functions on each url.
    for url in review_page_urls:
        
        time.sleep(1.0)
        p_reviews = get_prof_reviews(url)
        if len(p_reviews) > 0:
            professors_reviews.append(p_reviews)
        
    return professors_reviews


## Let's Test Getting Reviews for Just One Professor

In [35]:
from IPython.display import display

reviews = get_prof_reviews(two_professors[0]).reset_index(drop=True)

assert type(reviews['Grade'].values[0]) == str
assert type(reviews['Course'].values[0]) == str
assert type(reviews['Year'].values[0]) == str
assert type(reviews['Review'].values[0]) == str
assert type(reviews['review_id'].values[0]) == int
assert type(reviews['prof_id'].values[0]) == np.int64

reviews.head()

Professor Id:  2073


Unnamed: 0,Review,Grade,review_id,Course,Year,prof_id
0,anytext or 1=1',B,54423,CHEM 111,Junior,2073
1,Abel is a pretty good chem teacher. Her notes...,B,35061,CHEM 125,Freshman,2073
2,"Dr. Abel, is very helpful and approachable in ...",C,34766,CHEM 125,Senior,2073
3,Talks nonstop for 3/4 of the class period and ...,B,34495,CHEM 124,Freshman,2073
4,"To do well in Abel's class, the key is practic...",A,34359,CHEM 124,Freshman,2073


## Tack on the Overall Professor Ratings by Student Reviewer's Letter Grade Earned

In [36]:
def concat_prof_by_letter(prof_stats_by_letter, my_reviews):
    
    '''
    Adds the professor's overall ratings to each row, 
    depending on the grade earned by that review.
    
    Keeping all of this code in one function eliminates 
    the alternative predicament of passing many parameters 
    to a small functions.
    
    Args: 
        prof_stats_by_letter (DataFrame): 
            Shows the overall ratings for that professor as rated by the 
            groups of students, segregated by grade earned.
        
        my_reviews (DataFrame):
            DataFrame of reviews for the professor. 
    Returns: 
        A ratings DataFrame with all reviews for a professor and overall ratings. 
    
    
    '''
    A_reviews = pd.DataFrame(my_reviews[my_reviews.Grade == 'A'])
    A_reviews['Presents Material Clearly'] = \
        prof_stats_by_letter['A']['Presents Material Clearly']
        
    A_reviews['Overall Rating'] = prof_stats_by_letter['A']['Overall Rating']
    A_reviews['Recognizes Student Difficulties'] = \
        prof_stats_by_letter['A']['Recognizes Student Difficulties']
        
    A_reviews['Reporting Evaluations'] = \
        prof_stats_by_letter['A']['Reporting Evaluations']

    B_reviews = pd.DataFrame(my_reviews[my_reviews.Grade == 'B'])
    B_reviews['Presents Material Clearly'] = \
        prof_stats_by_letter['B']['Presents Material Clearly']
    B_reviews['Overall Rating'] = prof_stats_by_letter['B']['Overall Rating']
    B_reviews['Recognizes Student Difficulties'] = \
        prof_stats_by_letter['B']['Recognizes Student Difficulties']
    B_reviews['Reporting Evaluations'] = \
        prof_stats_by_letter['B']['Reporting Evaluations']

    C_reviews = pd.DataFrame(my_reviews[my_reviews.Grade == 'C'])
    C_reviews['Presents Material Clearly'] = \
        prof_stats_by_letter['C']['Presents Material Clearly']
    C_reviews['Overall Rating'] = prof_stats_by_letter['C']['Overall Rating']
    C_reviews['Recognizes Student Difficulties'] = \
        prof_stats_by_letter['C']['Recognizes Student Difficulties']
    C_reviews['Reporting Evaluations'] = \
        prof_stats_by_letter['C']['Reporting Evaluations']

    D_reviews = pd.DataFrame(my_reviews[my_reviews.Grade == 'D'])
    D_reviews['Presents Material Clearly'] = \
        prof_stats_by_letter['D']['Presents Material Clearly']
    D_reviews['Overall Rating'] = prof_stats_by_letter['D']['Overall Rating']
    D_reviews['Recognizes Student Difficulties'] = \
        prof_stats_by_letter['D']['Recognizes Student Difficulties']
    D_reviews['Reporting Evaluations'] = \
        prof_stats_by_letter['D']['Reporting Evaluations']

    F_reviews = pd.DataFrame(my_reviews[my_reviews.Grade == 'F'])
    F_reviews['Presents Material Clearly'] = \
        prof_stats_by_letter['F']['Presents Material Clearly']
    F_reviews['Overall Rating'] = prof_stats_by_letter['F']['Overall Rating']
    F_reviews['Recognizes Student Difficulties'] = \
        prof_stats_by_letter['F']['Recognizes Student Difficulties']
    F_reviews['Reporting Evaluations'] = \
        prof_stats_by_letter['F']['Reporting Evaluations']

    NA_reviews = pd.DataFrame(my_reviews[my_reviews.Grade == 'N/A'])
    NA_reviews['Presents Material Clearly'] = \
        prof_stats_by_letter['N/A']['Presents Material Clearly']
    NA_reviews['Overall Rating'] = prof_stats_by_letter['N/A']['Overall Rating']
    NA_reviews['Recognizes Student Difficulties'] = \
        prof_stats_by_letter['N/A']['Recognizes Student Difficulties']
    NA_reviews['Reporting Evaluations'] = \
        prof_stats_by_letter['N/A']['Reporting Evaluations']

    NoCredit_reviews = pd.DataFrame(my_reviews[my_reviews.Grade == 'No Credit'])
    NoCredit_reviews['Presents Material Clearly'] = \
        prof_stats_by_letter['No Credit']['Presents Material Clearly']
    NoCredit_reviews['Overall Rating'] = \
        prof_stats_by_letter['No Credit']['Overall Rating']
    NoCredit_reviews['Recognizes Student Difficulties'] = \
        prof_stats_by_letter['No Credit']['Recognizes Student Difficulties']
    NoCredit_reviews['Reporting Evaluations'] = \
        prof_stats_by_letter['No Credit']['Reporting Evaluations']
  
    Credit_reviews = pd.DataFrame(my_reviews[my_reviews.Grade == 'Credit'])
    Credit_reviews['Presents Material Clearly'] = \
        prof_stats_by_letter['Credit']['Presents Material Clearly']
    Credit_reviews['Overall Rating'] = \
        prof_stats_by_letter['Credit']['Overall Rating']
    Credit_reviews['Recognizes Student Difficulties'] = \
        prof_stats_by_letter['Credit']['Recognizes Student Difficulties']
    Credit_reviews['Reporting Evaluations'] = \
        prof_stats_by_letter['Credit']['Reporting Evaluations']

    assert type(A_reviews) == pd.DataFrame
    assert type(B_reviews) == pd.DataFrame
    assert type(C_reviews) == pd.DataFrame
    assert type(D_reviews) == pd.DataFrame
    assert type(F_reviews) == pd.DataFrame
    assert type(Credit_reviews) == pd.DataFrame
    assert type(NA_reviews) == pd.DataFrame
    assert type(NoCredit_reviews) == pd.DataFrame

    dataframes = []
    if len(A_reviews) > 0: 
        dataframes.append(A_reviews)
    if len(A_reviews) > 0: 
        dataframes.append(B_reviews)
    if len(A_reviews) > 0: 
        dataframes.append(C_reviews)
    if len(A_reviews) > 0: 
        dataframes.append(D_reviews)
    if len(A_reviews) > 0: 
        dataframes.append(F_reviews)
    if len(A_reviews) > 0: 
        dataframes.append(NA_reviews)
    if len(A_reviews) > 0: 
        dataframes.append(Credit_reviews)
    if len(A_reviews) > 0: 
        dataframes.append(NoCredit_reviews)
    if len(dataframes) > 0:
        return pd.concat(dataframes)
    else:
        return np.nan

In [37]:
def get_reviews_by_grade(professor_reviews, profs_stats_by_letter_list):
    '''
        Gets a Reviews DataFrame with Overall Professor Ratings 
        by Letter Grade, for all Professors.
        
        Args: 
            professor_reviews (list of DataFrames): 
                Each in the list holds ratings for a professor.
                
            profs_stats_by_letter_list (list of DataFrames):
                Each in the list holds mean ratings for a professor 
                based on Letter Grade earned by reviewer.
                
        Returns: 
            A Reviews DataFrame, indexed by review_ids, 
            showing Professors Overall Ratings based on 
            Grades, Grades, Year, Course, Prof_id, etc
    '''
    
    concatenateds = []
    n = len(profs_stats_by_letter_list)
    
    if n == len(professor_reviews):
        print('n = ', n)
        for letter_df, rev_df in zip(profs_stats_by_letter_list, professor_reviews):
            return_val = concat_prof_by_letter(letter_df, rev_df)
            if return_val is not np.nan:
                concatenateds.append(return_val)
        return pd.concat(concatenateds).reset_index(drop=True).set_index('review_id')
    else: 
        print('Lengths of lists of DataFrames are not compatible.')

## Get Reviews Table For All Professors
## Set the Index to the Review_Id

In [38]:
my_reviews = get_all_profs_reviews(two_professors)
reviews_by_grade = get_reviews_by_grade(my_reviews, profs_stats_by_grade_list)
reviews_by_grade

Professor Id:  2073
Professor Id:  728
n =  2


Unnamed: 0_level_0,Review,Grade,Course,Year,prof_id,Presents Material Clearly,Overall Rating,Recognizes Student Difficulties,Reporting Evaluations
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
34359,"To do well in Abel's class, the key is practic...",A,CHEM 124,Freshman,2073,3.769,3.769,3.692,13
34176,Great professor! I learned a lot in her class....,A,CHEM 127,Freshman,2073,3.769,3.769,3.692,13
34130,good,A,CHEM 124,Senior,2073,3.769,3.769,3.692,13
34073,Foxy.,A,CHEM 125,Sophomore,2073,3.769,3.769,3.692,13
34023,Abel is an amazing teacher. I think anyone sh...,A,CHEM 124,Sophomore,2073,3.769,3.769,3.692,13
33755,Dr. Abel is a fantastic teacher. She presents ...,A,CHEM 124,Freshman,2073,3.769,3.769,3.692,13
33725,Excellent.,A,CHEM 125,Sophomore,2073,3.769,3.769,3.692,13
33448,SMOKING HOT. Great teacher too. No Hw. Just...,A,CHEM 125,Junior,2073,3.769,3.769,3.692,13
33145,"She's a great teacher, seems real nervous at f...",A,CHEM 127,Sophomore,2073,3.769,3.769,3.692,13
31992,Abel is amazing. She is really good at realizi...,A,CHEM 124,Freshman,2073,3.769,3.769,3.692,13


## Use the Index to the Review_Id to Access Reviews

In [39]:
#You can do this for each dataframe in the list my_reviews

reviews_by_grade.loc[35061]

Review                             Abel is a pretty good chem teacher.  Her notes...
Grade                                                                              B
Course                                                                      CHEM 125
Year                                                                        Freshman
prof_id                                                                         2073
Presents Material Clearly                                                      3.222
Overall Rating                                                                 3.222
Recognizes Student Difficulties                                                3.222
Reporting Evaluations                                                              9
Name: 35061, dtype: object

In [40]:
reviews_by_grade.loc[35061]['Review']

"Abel is a pretty good chem teacher.  Her notes are extremely well organized so it's really easy to go back and study before exams.  Unfortunately her class was at 7AM which may be why I found it boring.  Really nice, and super helpful.  Her tests are challenging but not impossible and you can get an A if you study for them.  DO THE WORKBOOKS, a lot of the problems and questions on her exams come straight out of the workbooks.  Also, she likes to give our a lot of supplemental handouts to go with her notes, STAPLE them to the corresponding page in your notes.  They are really helpful for your labs and quizzes but they are also really easy to lose or get mixed up."

----------------------------------------------------------------------------------------------

# Which DataFrames should we write to a JSON?
profs_df, profs_stats, profs_stats_by_grade, profs_stats_by_year, profs_stats_by_letter_list = merge_professors(prof_dfs_list)

Let's include the following:
* profs_df
    Basic Professor info (name, dept, #reviews)
* profs_stats
    Rating Averages, Standard Deviations, Median, Means per professor id.
* profs_stats_by_grade_list
    list of prof_stats_by_grade DataFrames, rather than a concatenated DataFrame
* reviews_by_grade
    profs_stats_by_grade per review id, well organized.
    
Leave out:
* profs_stats_by_year
    Overall ratings for professors based on year level of reviewer
    
* profs_stats_by_grade
    Rating Averages per reviewer's earned grade per review id

#### profs_df json

In [41]:
profs_df

Unnamed: 0_level_0,Dept,Name,Reporting Evaluations
prof_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2073,Chemistry and Biochemistry,Christina Abel,32
728,Architecture,Dick Zweifel,16


In [42]:
p_df_json = profs_df.reset_index().to_json(orient='index') #don't drop index

In [43]:
pd.read_json(p_df_json).transpose()

Unnamed: 0,Dept,Name,Reporting Evaluations,prof_id
0,Chemistry and Biochemistry,Christina Abel,32,2073
1,Architecture,Dick Zweifel,16,728


#### profs_stats json

In [44]:
profs_stats

Unnamed: 0,median,mean,std_dev,mode,rating_type,prof_id
0,4,3.563,0.556,4,Overall Rating,2073
1,4,3.594,0.551,4,Presents Material Clearly,2073
2,4,3.563,0.658,4,Recognizes Student Difficulties,2073
3,1,1.625,1.317,1,Overall Rating,728
4,2,2.375,1.111,2,Presents Material Clearly,728
5,2,1.875,1.166,1,Recognizes Student Difficulties,728


In [45]:
p_stats_json = profs_stats.reset_index(drop=True).to_json(orient='index') # drop index
p_stats_json

'{"0":{"median":4,"mean":3.563,"std_dev":0.556,"mode":4,"rating_type":"Overall Rating","prof_id":2073},"1":{"median":4,"mean":3.594,"std_dev":0.551,"mode":4,"rating_type":"Presents Material Clearly","prof_id":2073},"2":{"median":4,"mean":3.563,"std_dev":0.658,"mode":4,"rating_type":"Recognizes Student Difficulties","prof_id":2073},"3":{"median":1,"mean":1.625,"std_dev":1.317,"mode":1,"rating_type":"Overall Rating","prof_id":728},"4":{"median":2,"mean":2.375,"std_dev":1.111,"mode":2,"rating_type":"Presents Material Clearly","prof_id":728},"5":{"median":2,"mean":1.875,"std_dev":1.166,"mode":1,"rating_type":"Recognizes Student Difficulties","prof_id":728}}'

In [46]:
pd.read_json(p_stats_json).transpose()

Unnamed: 0,mean,median,mode,prof_id,rating_type,std_dev
0,3.563,4,4,2073,Overall Rating,0.556
1,3.594,4,4,2073,Presents Material Clearly,0.551
2,3.563,4,4,2073,Recognizes Student Difficulties,0.658
3,1.625,1,1,728,Overall Rating,1.317
4,2.375,2,2,728,Presents Material Clearly,1.111
5,1.875,2,1,728,Recognizes Student Difficulties,1.166


In [47]:
p_stats_by_grade_list_json = [df.to_json(orient='index') for df in profs_stats_by_grade_list]
p_stats_by_grade_list_json

['{"Overall Rating":{"A":3.769,"B":3.222,"C":3.75,"Credit":null,"D":3.0,"F":null,"N\\/A":3.0,"No Credit":null,"prof_id":2073},"Presents Material Clearly":{"A":3.769,"B":3.222,"C":3.75,"Credit":null,"D":3.0,"F":null,"N\\/A":4.0,"No Credit":null,"prof_id":2073},"Recognizes Student Difficulties":{"A":3.692,"B":3.222,"C":3.75,"Credit":null,"D":4.0,"F":null,"N\\/A":3.0,"No Credit":null,"prof_id":2073},"Reporting Evaluations":{"A":13.0,"B":9.0,"C":8.0,"Credit":null,"D":1.0,"F":null,"N\\/A":1.0,"No Credit":null,"prof_id":2073}}',
 '{"Overall Rating":{"A":null,"B":null,"C":null,"Credit":1.083,"D":null,"F":4.0,"N\\/A":3.0,"No Credit":null,"prof_id":728},"Presents Material Clearly":{"A":null,"B":null,"C":null,"Credit":1.917,"D":null,"F":4.0,"N\\/A":3.667,"No Credit":null,"prof_id":728},"Recognizes Student Difficulties":{"A":null,"B":null,"C":null,"Credit":1.333,"D":null,"F":4.0,"N\\/A":3.333,"No Credit":null,"prof_id":728},"Reporting Evaluations":{"A":null,"B":null,"C":null,"Credit":12.0,"D":nul

In [48]:
stats_by_grade = []
for df_json in p_stats_by_grade_list_json: 
    stats_by_grade.append(pd.read_json(df_json).transpose())
stats_by_grade[0].transpose()

Unnamed: 0,Overall Rating,Presents Material Clearly,Recognizes Student Difficulties,Reporting Evaluations
A,3.769,3.769,3.692,13.0
B,3.222,3.222,3.222,9.0
C,3.75,3.75,3.75,8.0
Credit,,,,
D,3.0,3.0,4.0,1.0
F,,,,
,3.0,4.0,3.0,1.0
No Credit,,,,
prof_id,2073.0,2073.0,2073.0,2073.0


In [49]:
reviews_by_grade_json = reviews_by_grade.reset_index().to_json(orient='index')

In [50]:
pd.read_json(reviews_by_grade_json).transpose().set_index('review_id').head()

Unnamed: 0_level_0,Course,Grade,Overall Rating,Presents Material Clearly,Recognizes Student Difficulties,Reporting Evaluations,Review,Year,prof_id
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
34359,CHEM 124,A,3.769,3.769,3.692,13,"To do well in Abel's class, the key is practic...",Freshman,2073
34176,CHEM 127,A,3.769,3.769,3.692,13,Great professor! I learned a lot in her class....,Freshman,2073
31660,CHEM 124,A,3.769,3.769,3.692,13,Dr. Abel is a great chem prof. I would recomme...,Freshman,2073
30100,CHEM 127,A,3.769,3.769,3.692,13,Abel's class is doable but very tedious. Her t...,Sophomore,2073
29539,CHEM 125,A,3.769,3.769,3.692,13,She is an excellent professor. She really pre...,Freshman,2073


In [51]:
def create_jsons(path, data):
    '''
    Uses json to save Data Frames to the path specified.
    
    Args: 
        path (string): where the data will be stored.
    
    Returns: 
        None.
    
    '''
    
    profs_df, profs_stats, profs_stats_by_grade_list, reviews_by_grade = data
    
    #don't drop index
    p_df_json = profs_df.reset_index().to_json(orient='index') 
    
    # drop index
    p_stats_json = profs_stats.reset_index(drop=True).to_json(orient='index') 
    
    
    p_stats_by_grade_list_json = \
        [df.to_json(orient='index') for df in profs_stats_by_grade_list]
    reviews_by_grade_json = reviews_by_grade.reset_index().to_json(orient='index')
    
    #create a json array
    json_array = \
        [p_df_json, p_stats_json, p_stats_by_grade_list_json, reviews_by_grade_json]
    
    
    with open(path, 'w') as outfile:
        json.dump(json_array, outfile)
        
create_jsons('/data/primitiveDataType/small_data.json', \
             [profs_df, profs_stats, profs_stats_by_grade_list, reviews_by_grade])

----------------------------------------------------------------------------------------------


# Let's Run the code for Every Professor!

# The code below consumes time and RAM.We must avoid overwhelming the polyratings site.In order to do this, there is a time stall between each call to scrape a professor details

Get links to all of the professors. 

In [52]:
'''Get links to all the professors.
   Call parsing methods. 
'''
# Drop the first link to get links only linking to a professor page. 
professor_urls = get_professor_urls()[1:]

Drop professor pages that are incompatible with the code.

In [53]:
good_professor_urls = [p for p in professor_urls if '=1628' not in p]
good_professor_urls = [p for p in good_professor_urls if '=359' not in p]
good_professor_urls = [p for p in good_professor_urls if '=717' not in p]
good_professor_urls = [p for p in good_professor_urls if '=468' not in p]
good_professor_urls = [p for p in good_professor_urls if '=509' not in p]

print([p for p in good_professor_urls if '=717' in p])

[]


# TIME CONSUMING CODE
The following code takes a good hour to run because it gathers all the data for each professor on polyratings.

In [54]:
#This code takes forever to run.
professor_frames = get_all_professors_stats(good_professor_urls)


Professor:  2073

Professor:  2390

Professor:  2429

Professor:  1

Professor:  2

Professor:  746

Professor:  3

Professor:  1252

Professor:  2819

Professor:  1693

Professor:  2236

Professor:  1617

Professor:  4

Professor:  1007

Professor:  5

Professor:  3351

Professor:  4126

Professor:  6

Professor:  908

Professor:  1095

Professor:  4008

Professor:  4167

Professor:  2950

Professor:  1418

Professor:  2375

Professor:  990

Professor:  1748

Professor:  974

Professor:  778

Professor:  7

Professor:  2797

Professor:  3452

Professor:  3894

Professor:  2089

Professor:  8

Professor:  9

Professor:  10

Professor:  891

Professor:  11

Professor:  2820

Professor:  1403

Professor:  2374

Professor:  3035

Professor:  4050

Professor:  12

Professor:  13

Professor:  3500

Professor:  2712

Professor:  2739

Professor:  3126

Professor:  1991

Professor:  14

Professor:  1646

Professor:  4168

Professor:  2126

Professor:  964

Professor:  4172

Professor:  965



In [57]:
assert len(professor_frames) == 2416

## Inspect the Dataframes 

In [58]:
professors, p_stats, p_stats_by_grade, p_stats_by_year, p_stats_by_grade_list = \
    merge_professors(professor_frames)

                               Dept            Name  Reporting Evaluations
prof_id                                                                   
2073     Chemistry and Biochemistry  Christina Abel                     32
   median   mean  std_dev  mode                      rating_type  prof_id
0       4  3.563    0.556     4                   Overall Rating     2073
1       4  3.594    0.551     4        Presents Material Clearly     2073
2       4  3.563    0.658     4  Recognizes Student Difficulties     2073
                                      A      B     C Credit  D     F  N/A  \
Overall Rating                    3.769  3.222  3.75   None  3  None    3   
Presents Material Clearly         3.769  3.222  3.75   None  3  None    4   
Recognizes Student Difficulties   3.692  3.222  3.75   None  4  None    3   
Reporting Evaluations            13.000  9.000  8.00   None  1  None    1   

                                No Credit  prof_id  
Overall Rating                       Non

In [59]:
professors.tail()

Unnamed: 0_level_0,Dept,Name,Reporting Evaluations
prof_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
726,Environmental Horticulture Science,Michael Zohns,8
3027,Philosophy,David Zoller,9
727,Social Sciences,Maliha Zulfacar,87
1758,Chemistry and Biochemistry,Marta Zumwalt,13
728,Architecture,Dick Zweifel,16


In [60]:
p_stats.tail(10)

Unnamed: 0,median,mean,std_dev,mode,rating_type,prof_id
5255,0,2.778,1.133,3,Recognizes Student Difficulties,3027
5256,0,1.989,1.615,4,Overall Rating,727
5257,0,2.023,1.454,4,Presents Material Clearly,727
5258,0,1.839,1.538,0,Recognizes Student Difficulties,727
5259,0,2.769,1.367,4,Overall Rating,1758
5260,0,2.923,1.385,4,Presents Material Clearly,1758
5261,0,2.462,1.278,2,Recognizes Student Difficulties,1758
5262,1,1.625,1.317,1,Overall Rating,728
5263,2,2.375,1.111,2,Presents Material Clearly,728
5264,2,1.875,1.166,1,Recognizes Student Difficulties,728


In [61]:
p_stats_by_grade_list[-3]

Unnamed: 0,A,B,C,Credit,D,F,N/A,No Credit,prof_id
Overall Rating,3.125,1.939,0.889,3.0,0.0,1.0,1.286,,727
Presents Material Clearly,2.958,2.0,1.333,2.5,0.0,1.5,1.357,,727
Recognizes Student Difficulties,3.0,1.697,1.0,2.5,0.5,1.0,1.071,,727
Reporting Evaluations,24.0,33.0,9.0,2.0,2.0,2.0,14.0,,727


In [62]:
#THis takes a long, long time to run. 
#Prints out the professor ID of the professor that it's processing.
professor_reviews = get_all_profs_reviews(good_professor_urls)

Professor Id:  2073
Professor Id:  2390
Professor Id:  2429
Professor Id:  1
Professor Id:  2
Professor Id:  746
Professor Id:  3
Professor Id:  1252
Professor Id:  2819
Professor Id:  1693
Professor Id:  2236
Professor Id:  1617
Professor Id:  4
Professor Id:  1007
Professor Id:  5
Professor Id:  3351
Professor Id:  4126
Professor Id:  6
Professor Id:  908
Professor Id:  1095
Professor Id:  4008
Professor Id:  4167
Professor Id:  2950
Professor Id:  1418
Professor Id:  2375
Professor Id:  990
Professor Id:  1748
Professor Id:  974
Professor Id:  778
Professor Id:  7
Professor Id:  2797
Professor Id:  3452
Professor Id:  3894
Professor Id:  2089
Professor Id:  8
Professor Id:  9
Professor Id:  10
Professor Id:  891
Professor Id:  11
Professor Id:  2820
Professor Id:  1403
Professor Id:  2374
Professor Id:  3035
Professor Id:  4050
Professor Id:  12
Professor Id:  13
Professor Id:  3500
Professor Id:  2712
Professor Id:  2739
Professor Id:  3126
Professor Id:  1991
Professor Id:  14
Pro

In [63]:
professor_reviews[14].head()

Unnamed: 0,Review,Grade,review_id,Course,Year,prof_id
0,Dr. Agronsky is a great professor! His class i...,,47437,MATH 142,Freshman,5
0,he does not know how to teach. after waitlisti...,F,47144,MATH 142,Freshman,5
0,Dr. Agronsky was simply amazing. I was in his ...,A,41076,MATH 241,Freshman,5
0,Agronsky is a wonderful teacher. Unfortunately...,C,40991,MATH 241,Freshman,5
0,One of the best professors here at Cal Poly. T...,,40806,MATH 241,Sophomore,5


# Surprise! Not all professors have a full statistical analysis page.
This code deletes professors that don't have full statistical analysis pages. Otherwise, we would have trouble merging DataFrames later on.

In [69]:
len(professor_reviews)
u_profs_previews = []
for p in professor_reviews:
    u_profs_previews.append(p.prof_id.unique()[0])
    
print(len(u_profs_previews))

u_profs_gl = []
for p in p_stats_by_grade_list:
    u_profs_gl.append(p.prof_id.unique()[0])

print(len(u_profs_gl))

no_full_stat_analyses = list(set(u_profs_previews) - set(u_profs_gl))
print(len(no_full_stat_analyses))
# no_full_stat_analyses[2]

2416
1755
661


We can get remove the professors that don't have full statistical analysis pages from the professor_reviews list of DataFrames. 

This will allow us to merge each professor_reviews DataFrame with a DataFrame from the p_stats_by_grade_list. 

The benefit of doing this is to organize grades and professor reviews into the same DataFrame.

The following code takes care of this.

In [85]:
def remove_bad_entries(): 
     '''
     Remove professors that don't have the full statistical analysis in professor_reviews list. 
     This is usually caused by having less than 5 reviews.
     Args: None
     Returns: removals (int): number of professors removed from the professor_reviews list. 
     '''
    removals = 0
    for index, p in enumerate(professor_reviews):
        if p.prof_id.values[0] in no_full_stat_analyses:
            removals += 1
            del professor_reviews[index]

    print('Removed ', removals, ' professors.')
    return removals

Removed  0  professors.


In [86]:
n = remove_bad_entries()
while n > 0:
    n = remove_bad_entries()

Removed  0  professors.


In [89]:
assert len(professor_reviews) == len(p_stats_by_grade_list)
assert u_profs_gl[-1] == u_profs_previews[-1]

# Below is time-consuming code
More webscraping is involved to get the text reviews for each professor.

In [88]:
reviews_by_grade = get_reviews_by_grade(professor_reviews, p_stats_by_grade_list)

n =  1755


In [90]:
reviews_by_grade.tail()

Unnamed: 0_level_0,Review,Grade,Course,Year,prof_id,Presents Material Clearly,Overall Rating,Recognizes Student Difficulties,Reporting Evaluations
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
29816,Zumwalt's class was extremely difficult for me...,C,CHEM 124,Freshman,1758,3.667,3.667,3,3
29445,Well.. 124 is a hard class but I actually real...,C,CHEM 124,Freshman,1758,3.667,3.667,3,3
31360,"Zumwalt is alright, but she isn't the greatest...",D,CHEM 124,Sophomore,1758,2.0,2.0,2,1
29647,Before I start this is not an angry spiteful e...,,CHEM 124,Freshman,1758,0.5,1.0,1,2
26349,You can tell she is a new teacher. She is ver...,,CHEM 124,Freshman,1758,0.5,1.0,1,2


## Quick! Write it all to json in the /data folder under a personalized directory. Let's have two files just in case the first one ever gets corrupted.

In [91]:
create_jsons('/data/primitiveDataType/data.json', \
             [professors, p_stats, p_stats_by_grade_list, reviews_by_grade])

In [92]:
create_jsons('/data/primitiveDataType/data1.json', \
             [professors, p_stats, p_stats_by_grade_list, reviews_by_grade])