In [1]:
import numpy as np
import pandas as pd
import os
import ast
import nltk
import functools
import re
import json
import statistics
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display

In [30]:
WORD_COUNTS_NAME = 'word_counts'
WORD_COUNTS_LIST_NAME = 'word_counts_list'
TOTAL_NAME = 'total_words'
COMMENTS_COLUMN = 'comments'
FREQUENCY_NAME = 'frequency'
MAGNIFY_LIST = ['too', 'very', 'that', 'so', 'as']
NEGATIVE_LIST = ['not', 'aren\'t', 'isn\'t', 'wasn\'t', 'didn\'t']
with open('constants/word_categories.json', 'r') as f:
    WORD_CATEGORIES = json.load(f)
IMPORTANT_WORDS = reduce(lambda x, y: x + y, WORD_CATEGORIES.values())

In [3]:
# HIGH MEMORY: Don't load unless you have to
with open('data/overall_list.json', 'r') as f:
    ALL_WORKLOAD_LIST = json.load(f)
    
# TODO: Finish this for other metrics
# with open('overall_list.json', 'r') as f:
#     ALL_RECOMMEND_LIST = json.load(f)

In [5]:
'''Fixes the fact that some zeros are encoded as empty list'''
def fix_list(x):
    try:
        return int(x)
    except:
        return 0
    
df = pd.read_csv('raw_data/final.csv')
df.enrollment = df.enrollment.apply(fix_list)

In [19]:
'''Takes a body of text and returns a list of words in that text.'''
def get_words(text):
    return re.compile('\w+').findall(text)

'''See if a word was used in a negative context'''
def probe_if_negative(word1, word2):
    if word1 in NEGATIVE_LIST:
        return True
    elif (word2 in NEGATIVE_LIST) and (word1 in MAGNIFY_LIST):
        return True
    else:
        return False

def get_word_dict(comments, to_return):
    d = {}
    t = 0
    for comment in comments:
        last_word = ''
        two_words_ago = ''
        for word in get_words(comment):
            word = word.lower() # Make lower case
            t += 1
            try:
                if probe_if_negative(last_word, two_words_ago):
                    d['not_' + word] += 1
                else:
                    d[word] += 1
            except: 
                if probe_if_negative(last_word, two_words_ago):
                    d['not_' + word] = 1
                else:
                    d[word] = 1
            two_words_ago = last_word
            last_word = word
    if to_return == 'words':
        return d
    elif to_return == 'count':
        return t

def preprocessing(to_return, row):
    raw_comments = row[COMMENTS_COLUMN]
    return get_word_dict(ast.literal_eval(raw_comments), to_return)

def count_words(in_df):
    in_df = in_df.copy()
    in_df[TOTAL_NAME] = in_df.apply(functools.partial(preprocessing, 'count'), axis=1)
    in_df[WORD_COUNTS_NAME] = in_df.apply(functools.partial(preprocessing, 'words'), axis=1)
    return in_df

'''Calculates the frequency of a word in the comments of row'''
def calculate_frequency(word, row):
    try:
        return float(row[WORD_COUNTS_NAME][word]) / float(row[TOTAL_NAME])
    except:
        return 0
    
'''Weighted mean'''
def weighted_mean(x, w):
    return np.sum(x * w) / np.sum(w)

'''Weighted covariance'''
def weighted_cov(x, y, w):
    return np.sum(w * (x - weighted_mean(x, w)) * (y - weighted_mean(y, w))) / np.sum(w)

'''Weighted correlation'''
def corr(x, y, w):
    return weighted_cov(x, y, w) / np.sqrt(weighted_cov(x, x, w) * weighted_cov(y, y, w))

''' Returns two column dataframe consisting of df[column] and the frequency of the word.
    Weights according to the number of comments. '''
def find_word_correlations(in_df, word, column):
    freq = in_df.apply(functools.partial(calculate_frequency, word), axis=1)
    possibly_nan = pd.concat([freq, in_df[column], in_df[COMMENTS_COLUMN]], 
                             axis=1, 
                             keys=[FREQUENCY_NAME, column, COMMENTS_COLUMN])
    clean_df = possibly_nan.dropna(axis=0, how='any')
    return corr(clean_df[FREQUENCY_NAME], 
                clean_df[column],
                clean_df[COMMENTS_COLUMN].apply(lambda x: len(ast.literal_eval(x))))

In [20]:
get_words("There was a man who wasn't dead")

['There', 'was', 'a', 'man', 'who', 'wasn', 't', 'dead']

In [22]:
def sum_dict_values(some_dict):
    return sum(some_dict.values())

'''Finds overall frequency of words in df by constructing total count and total dict
    in_df: The input df which expects to ??
    add_method: if counter, ??? if list, ???
'''
def find_word_freqs_over_df(in_df, agg_method):
    in_df = in_df.copy()
    if agg_method == 'counter':
        D = Counter({})
        for index, row in in_df.iterrows():
            D += Counter(row[WORD_COUNTS_NAME])
        return dict(D)
    elif agg_method == 'list':
        D = {}
        keys = set({})
        for index, row in in_df.iterrows():
            r = row[WORD_COUNTS_LIST_NAME]
            keys = keys.union(set(r.keys()))
            for key in keys:
                try:
                    Dval = D[key]
                except:
                    Dval = []
                try:
                    rval = r[key]
                except:
                    rval = []
                D[key] = Dval + rval
        return D
    
'''Breaks df into subdfs, then finds frequencies for subdfs.'''
def find_group_word_freqs(whole_df, gb):
    whole_df = whole_df.copy()
    gb = whole_df.groupby(gb)    
    return [(x, find_word_freqs_over_df(gb.get_group(x), "counter")) for x in gb.groups]

def create_group_name(tuple_or_string):
    if type(tuple_or_string) == tuple:
        return tuple_or_string[1] + '_' + str(tuple_or_string[0])
    else:
        return tuple_or_string

'''Truncates to most frequent 50 words in each dict.  Puts dicts in '''
def package_word_freqs(freqs_list_tuples):
    output = {}
    for category_name, freqs_list in freqs_list_tuples:
        for grouping, freqs in freqs_list:
            name = create_group_name(grouping)
            freq_list = straighten_list(freqs, 50)
            try:
                output[category_name][name] = freq_list
            except:
                output[category_name] = {}
                output[category_name][name] = freq_list
    return output

'''Turns dict k, v set into list and truncates to '''
def straighten_list(in_dict, truncate_value):
    return sorted(in_dict.items(), key=lambda word_count: -word_count[1])[:truncate_value]

'''Turns a dict of occurence count form into a dict of target_value_list form'''
def convert_dict(in_dict, value):
    in_dict = dict(in_dict) # Make a copy so nothing bad happens
    new_dict = {}
    for k, v in in_dict.iteritems():
        new_dict[k] = [value] * v
    return new_dict

'''Applies convert dict to each row of the in_df with values according to column'''
def apply_list_conversion(in_df, column):
    in_df = in_df.copy() # Copy so nothing bad happens
    in_df[WORD_COUNTS_LIST_NAME] = (in_df[[WORD_COUNTS_NAME, column]]
                                    .apply(lambda x: convert_dict(x[WORD_COUNTS_NAME], x[column]), axis=1))
    return in_df

'''Finds a specific group of words.  
    aggregate_list: Given some rating, this is the list for every word's data.  
    word_list: the specific words cared about.  '''
def find_specific_word_values(aggregate_list, word_list):
    D = {}
    for word in word_list:
        D[word] = [x for x in aggregate_list[word] if not np.isnan(x)]
    return D

In [9]:
df = pd.read_csv("raw_data/final.csv")
df2 = count_words(df)

In [14]:
df3 = apply_list_conversion(df2, 'Course_Workload_Rating')
df4 = apply_list_conversion(df2, 'Course_Overall_Rating')

In [12]:
# Not implemented in visualization
by_department = find_group_word_freqs(df2, 'department1')
by_year = find_group_word_freqs(df2, ['year', 'term'])
cdf_data = package_word_freqs([('department', by_department), ('year', by_year)])

KeyboardInterrupt: 

In [88]:
with open('cdf_data.json', 'w') as outfile:
        json.dump(cdf_data, outfile)

with open('data/all_workload_list.json', 'w') as outfile:
        json.dump(ALL_WORKLOAD_LIST, outfile)

### Getting Data for Visualization of Frequency Plots

In [17]:
# Will get a dictionary featuring the counts of every word.
overall_workload = find_word_freqs_over_df(df3, 'counter')
overall_rating = find_word_freqs_over_df(df4, 'counter')

In [25]:
# Will get a dictionary featuring 
'''e.g. find_word_freqs_over_df(df3, 'list') = {
    word1: [workload_use1, workload_use2, ...],
    word2: [workload_use1], 
    ...
}'''
occurrences_workload = find_word_freqs_over_df(df3, 'list')
occurrences_rating = find_word_freqs_over_df(df4, 'list')

In [26]:
occurrences_rating

{'biennials': [4.8],
 'homomorphism': [3.8],
 'schlegel': [4.6],
 'ginzburg': [3.4],
 'woods': [4.3, 4.9, 4.8, 5.0, 4.3, 4.3, 4.1, 4.0, 4.5, 3.5, 4.8],
 'spiders': [4.8],
 'hanging': [3.9,
  4.4,
  4.2,
  4.2,
  4.8,
  3.8,
  3.5,
  4.0,
  4.4,
  4.7,
  4.9,
  4.5,
  4.3,
  4.3,
  4.0,
  4.7,
  4.6,
  4.3,
  4.9],
 'ultimatley': [4.6],
 'francesca': [4.1, 4.0, 4.5, 4.5, 4.9],
 'comically': [3.3, 3.5, 3.4],
 'suzane': [4.2],
 'brockton': [4.5],
 'erfectly': [4.1],
 'originality': [4.0, 3.7, 4.3, 3.5, 4.4, 4.7, 3.6, 4.1, 3.0, 3.5, 4.2, 4.0],
 'unnecessarily': [3.8,
  3.8,
  3.2,
  3.8,
  3.9,
  4.3,
  3.8,
  4.1,
  3.6,
  4.0,
  4.0,
  4.0,
  4.0,
  4.1,
  3.3,
  3.3,
  3.6,
  3.2,
  4.7,
  3.9,
  4.3,
  4.6,
  3.4,
  3.4,
  2.9,
  2.9,
  4.2,
  3.4,
  3.6,
  3.8,
  3.4,
  3.0,
  4.4,
  3.5,
  3.5,
  4.1,
  3.6,
  3.7,
  2.0,
  4.1,
  4.0,
  3.7,
  3.7,
  3.6,
  3.2,
  4.0,
  4.0,
  3.7,
  4.3,
  3.9,
  4.4,
  3.7,
  3.6,
  3.3,
  2.6,
  3.2,
  3.5,
  2.9,
  4.4,
  4.2,
  3.5,
  3.5,
  3

In [32]:
vis_workload = find_specific_word_values(occurrences_workload, IMPORTANT_WORDS)
vis_rating = find_specific_word_values(occurrences_rating, IMPORTANT_WORDS)
with open('data/short_workload_list.json', 'w') as outfile:
    json.dump(vis_workload, outfile)
with open('data/short_overall_list.json', 'w') as outfile:
    json.dump(vis_rating, outfile)

### Preparing data for Enrollment Data

In [22]:
# USEFUL FUNCTIONS FOR THIS PART
def filter_nans(val):
    if val == [] or val == '[]':
        return 0
    else:
        return float(val)

In [23]:
df = pd.read_csv('raw_data/final.csv')

In [97]:
of_interest = df[['department1', 'year', 'term', 
                  'enrollment', 'name_key1', 'course_title', 'Course_Workload_Rating', 'Course_Overall_Rating']]
of_interest.loc[:, 'enrollment'] = (of_interest['enrollment']
                                     .apply(lambda x: filter_nans(x)))

In [98]:
top_depts = (of_interest.groupby('department1')['enrollment'].sum())
top_depts_list = top_depts.astype('int').sort_values().index[-30:].tolist()

In [99]:
of_more_interest = (of_interest[of_interest['department1']
                                .isin(top_depts_list)])

In [100]:
of_more_interest.to_csv('enrollment.csv')

In [101]:
of_more_interest.head()

Unnamed: 0,department1,year,term,enrollment,name_key1,course_title,Course_Workload_Rating,Course_Overall_Rating
0,AESTHINT,2011,fall,59.0,AESTHINT 13,Cultural Agents,1.5,3.7
1,AESTHINT,2011,fall,48.0,AESTHINT 15,Elements of Rhetoric,2.3,4.2
2,AESTHINT,2011,fall,111.0,AESTHINT 24,First Nights: Five Performance Premieres,1.5,3.9
3,AESTHINT,2011,fall,99.0,AESTHINT 26,"Race, Gender, and Performance",1.6,3.8
4,AESTHINT,2011,fall,29.0,AESTHINT 30,Love In A Dead Language: Classical Indian Lite...,2.0,4.3


In [102]:
# Enrollment by semester
# [{year: 2011_spring, MATH: 10, ...}]
of_more_interest['year_term'] = of_more_interest['year'].astype(str)+ '_' + of_more_interest['term']
info = of_more_interest.groupby(['year_term', 'department1'])['enrollment'].sum().to_frame().reset_index()
enrollment_history = []
for time in info.year_term.unique():
    d = {}
    d['year'] = time
    for _idx, row in info[info.year_term == time].iterrows():
        d[row['department1']] = row['enrollment']
    enrollment_history = enrollment_history + [d]
with open('data/enrollment_history.json', 'w') as outfile:
        json.dump(enrollment_history, outfile)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### For Computing Median Student

In [107]:
def compute_one_median(vals):
    cutoff = sum(vals) / 2
    start = 0
    lastval = vals[0]
    for val in vals:
        start += val
        if start >= cutoff:
            return (val + lastval) / 2

def compute_median_student_dict(df):
    d = {}
    for department in df['department1'].unique():
        gf = df[df['department1'] == department]
        vals = gf.enrollment.tolist()
        d[department] = compute_one_median(vals)
    return d

def compute_median_dict(df):
    d = {}
    for department in df['department1'].unique():
        gf = df[df['department1'] == department]
        vals = gf.enrollment.tolist()
        d[department] = statistics.median(vals)
    return d

In [124]:
median_student = compute_median_student_dict(of_more_interest)
median = compute_median_dict(of_more_interest)
with open("data/median_student.json", 'w') as outfile:
    json.dump(median_student, outfile)
with open("data/median.json", 'w') as outfile:
    json.dump(median, outfile)

### For Computing Correlations between workload and rating.

In [119]:
def compute_correlation_dict(df):
    d = {}
    df = df.dropna(0)
    for department in df['department1'].unique():
        gf = df[df['department1'] == department]
        w = np.array(gf.enrollment)
        x = np.array(gf.Course_Workload_Rating.map(float))
        y = np.array(gf.Course_Overall_Rating.map(float))
        d[department] = corr(x,y,w)
    return d

In [123]:
correlations = compute_correlation_dict(of_more_interest)
with open("data/correlations.json", 'w') as outfile:
    json.dump(correlations, outfile)

### For difficulty aggregate

In [156]:
relevant_cols = ['Course_Workload_Distribution', 
                 'Course_Overall_Distribution',
                 'Course_Overall_Respondents',
                 'Course_Workload_Rating',
                 'Course_Overall_Rating',
                 'Course_Workload_Respondents',
                 'enrollment', 
                 'name_key1',
                 'course_title', 
                 'year',
                 'term',
                 'department1']

In [157]:
CONVENTIONS = {
    'workload': {
        'distribution': 'Course_Workload_Distribution',
        'mean': 'Course_Workload_Rating',
        'intervals': [{'min': 0, 'max': 3},
                      {'min': 4, 'max': 7},
                      {'min': 8, 'max': 11},
                      {'min': 11, 'max': 14},
                      {'min': 11, 'max': 17}],
        'std_multiplier': 3,
        'respondents': 'Course_Workload_Respondents'
    },
    'overall': {
        'distribution': 'Course_Overall_Distribution',
        'mean': 'Course_Overall_Rating',
        'intervals': [{'min': 1, 'max': 1},
                      {'min': 2, 'max': 2},
                      {'min': 3, 'max': 3},
                      {'min': 4, 'max': 4},
                      {'min': 5, 'max': 5}],
        'std_multiplier': 1,
        'respondents': 'Course_Overall_Respondents'
    }
}

In [210]:
def get_ith_list_val(i, lst):
    if type(lst) == type(np.nan):
        return 0
    else:
        lst = ast.literal_eval(lst)
        if len(lst) == 0:
            return 0
        else:
            return lst[i]
        
def get_list_stat_nice(lst, func, std_multiplier):
    big_list = []
    for i in range(len(lst)):
        big_list += ([i] * lst[i])
    if len(big_list) == 1 and func == statistics.stdev: # Need 2+ data pts for variance
        return -1
    if func == statistics.stdev:
        return func(big_list) * std_multiplier
    else:
        return func(big_list)

def get_list_stat(lst, func, std_multiplier):
    if type(lst) == type(np.nan):
        return -1
    else:
        lst = ast.literal_eval(lst)
        if len(lst) == 0 or reduce(lambda x, y: x+y, lst) == 0:
            return -1
        else:
            return get_list_stat_nice(lst, func, std_multiplier)

def get_stat_score(statistic, desired, lst):
    if desired == 'median':
        intervals = CONVENTIONS[statistic]['intervals']
        m = get_list_stat(lst, statistics.median, 1)
        if m == -1:
            return np.nan
        if float(m).is_integer():
            i = int(m)
            return 0.5 * (intervals[i]['max'] + intervals[i]['min'])
        else:
            m = int(m-0.5)
            return 0.5 * (intervals[m]['max'] + intervals[m + 1]['min'])
    elif desired == 'stdev':
        std_multiplier = CONVENTIONS[statistic]['std_multiplier']
        return get_list_stat(lst, statistics.stdev, std_multiplier)

def add_distribution_data(in_df, statistics):
    in_df = in_df.copy()
    for statistic in statistics:
        column = CONVENTIONS[statistic]['distribution']
        for i in range(5):
            in_df[statistic + '_s' + str(i+1)] = (in_df[column]
                .apply(functools.partial(get_ith_list_val, i)))
    return in_df

def add_mean_data(in_df, statistics):
    in_df = in_df.copy()
    for statistic in statistics:
        in_df[statistic + '_mean'] = in_df[CONVENTIONS[statistic]['mean']]
    return in_df

def add_respondents_data(in_df, statistics):
    in_df = in_df.copy()
    for statistic in statistics:
        in_df[statistic + '_respondents'] = in_df[CONVENTIONS[statistic]['respondents']]
    return in_df

def add_instdev_data(in_df, statistics):
    in_df = in_df.copy()
    for statistic in statistics:
        column = CONVENTIONS[statistic]['distribution']
        in_df[statistic + '_instdev'] = (in_df[column]
                                        .apply(functools.partial(get_stat_score, statistic, 'stdev')))
    return in_df


def add_additional_data(in_df, statistics):
    in_df = in_df.copy()
    in_df = add_respondents_data(in_df, statistics)
    in_df = add_distribution_data(in_df, statistics)
    in_df = add_mean_data(in_df, statistics)
    in_df = add_instdev_data(in_df, statistics)
    return in_df

'''Computes weighted mean'''
def compute_grouped_mean(in_df, val_col, weight_col):
    in_df = in_df.copy()
    return weighted_mean(x=in_df[val_col], w=in_df[weight_col])

'''Computes the inter-group std (std of means weighted according to num respondants)'''
def compute_grouped_outstd(in_df, mean_col, weight_col):
    in_df = in_df.copy()
    return np.sqrt(weighted_cov(w=in_df[weight_col], x=in_df[mean_col], y=in_df[mean_col]))

def add_grouped_data(in_df, gb, statistics):
    in_df = in_df.copy()
    distribution_columns = []
    for statistic in statistics:
        for num in [1, 2, 3, 4, 5]:
            distribution_columns += [statistic + '_s' + str(num)]
    gp_df = in_df.groupby(gb)[distribution_columns].sum()
    for statistic in statistics:
        gp_df = (gp_df.join(in_df.groupby(gb)[[statistic + '_mean', statistic + '_respondents']]
                           .apply(compute_grouped_mean, statistic + '_mean', statistic + '_respondents')
                           .to_frame(statistic + '_mean'))
                 .join(in_df.groupby(gb)[[statistic + '_instdev', statistic + '_respondents']]
                       .apply(compute_grouped_mean, statistic + '_instdev', statistic + '_respondents')
                       .to_frame(statistic + '_instdev'))
                 .join(in_df.groupby(gb)[[statistic + '_mean', statistic + '_respondents']]
                       .apply(compute_grouped_outstd, statistic + '_mean', statistic + '_respondents')
                       .to_frame(statistic + '_outstdev')))
    return gp_df

def make_all_stats_for_year(in_df, year):
    in_df = in_df.copy()
    relevant_df = in_df[in_df.year == year][RELEVANT_COLS]
    basic_stats_df = add_additional_data(relevant_df, ['workload', 'overall'])
    grouped_stats_df = add_grouped_data(basic_stats_df, 'department1', ['workload', 'overall'])
    grouped_stats_df = grouped_stats_df[grouped_stats_df.index.isin(top_depts_list)]
    return grouped_stats_df

In [211]:
basic_stats_df = add_additional_data(relevant_df, ['workload', 'overall'])
grouped_stats_df = add_grouped_data(basic_stats_df, 'department1', ['workload', 'overall'])
grouped_stats_df = grouped_stats_df[grouped_stats_df.index.isin(top_depts_list)]

In [214]:
grouped_stats_df.sort_values('overall_outstdev')

Unnamed: 0_level_0,workload_s1,workload_s2,workload_s3,workload_s4,workload_s5,overall_s1,overall_s2,overall_s3,overall_s4,overall_s5,workload_mean,workload_instdev,workload_outstdev,overall_mean,overall_instdev,overall_outstdev
department1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
SPANSH,1332,4042,999,112,50,51,207,972,2541,2987,3.191905,1.944804,1.579686,4.214117,0.795844,0.282742
ETHRSON,2055,3438,588,46,30,102,408,1379,2583,2024,2.847669,1.888124,1.22474,3.925062,0.903916,0.306533
CULTBLF,3270,4356,666,62,23,102,456,1722,3472,3096,2.468545,1.837499,1.053884,4.015269,0.863717,0.313039
PHYSCI,195,2167,1739,362,153,167,593,1511,1641,825,4.810962,2.368908,2.268167,3.500401,0.973573,0.333174
SOCWORLD,1606,3413,849,112,29,110,358,1245,2379,2204,2.816991,1.951843,1.156378,3.992122,0.895333,0.356579
SCILIVSY,1864,2815,523,56,29,138,378,1300,1980,1690,2.673917,2.025728,1.134484,3.850583,0.941988,0.371696
STAT,1302,5254,3712,956,822,231,847,2782,4712,4199,4.811647,2.269072,3.149624,3.927641,0.900646,0.383429
EXPOS,1015,6102,2275,311,134,134,582,1800,3756,3873,3.725099,2.041996,1.676108,4.04966,0.864011,0.390902
ENGLISH,852,4593,2105,373,150,74,189,926,2343,4896,3.75589,1.983221,1.989326,4.400688,0.701565,0.391545
AESTHINT,1521,2573,428,40,17,107,353,1020,1726,1655,2.628063,1.870728,1.098672,3.922341,0.928985,0.401641


## For Listing Within Department

This will allow the sorting of particular classes.

In [68]:
ranked = df[['term', 'year', 'name_key1', 'department1', 'enrollment', 'course_title', 'Course_Workload_Rating', 'Course_Overall_Rating']]
ranked['Course'] = ranked["name_key1"].map(str) + ': ' + ranked["course_title"]
ranked = ranked.drop(['name_key1', 'course_title'], 1)
ranked = ranked.rename(columns={'Course_Workload_Rating': 'Workload',
                                'Course_Overall_Rating': 'Overall',
                                'enrollment': 'Enrollment',
                                'department1': 'department'}).set_index('Course')
ranked.to_csv('data/ranked.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [69]:
ranked

Unnamed: 0_level_0,term,year,department,Enrollment,Workload,Overall
Course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AESTHINT 13: Cultural Agents,fall,2011,AESTHINT,59,1.5,3.7
AESTHINT 15: Elements of Rhetoric,fall,2011,AESTHINT,48,2.3,4.2
AESTHINT 24: First Nights: Five Performance Premieres,fall,2011,AESTHINT,111,1.5,3.9
"AESTHINT 26: Race, Gender, and Performance",fall,2011,AESTHINT,99,1.6,3.8
AESTHINT 30: Love In A Dead Language: Classical Indian Literature and Its Theorists,fall,2011,AESTHINT,29,2.0,4.3
AESTHINT 33: Ancient Fictions: The Ancient Novel in Context,fall,2011,AESTHINT,81,1.7,4.1
AESTHINT 35: Forms in Korean Cultural History,fall,2011,AESTHINT,9,1.6,3.9
AESTHINT 37: Introduction to the Bible in the Humanities and the Arts,fall,2011,AESTHINT,182,1.8,3.1
AESTHINT 38: The English Language as Literature,fall,2011,AESTHINT,16,1.6,4.1
AESTHINT 40: Monuments of Islamic Architecture,fall,2011,AESTHINT,12,1.8,3.8
