# 4 Years at College Expressed in Data

In [None]:
import datetime

import numpy as np
import pandas as pd
from ics import Calendar

import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
from plotly import tools
import cufflinks as cf

from lib.custom_utils import generate_wordcloud, get_semester_date_range, get_semester_asana, get_semester_via_col

# Initialise plotly's notebook mode so figures render inline in the notebook.
init_notebook_mode(connected=True)
# Put cufflinks into offline mode as well.
cf.set_config_file(offline=True, world_readable=True)

## Constants

In [None]:
# Color for series that span every semester
# (presumably; it is not referenced by the cells visible in this notebook).
all_color = 'rgba(219, 64, 82, 1.0)'

# Semester codes in chronological order: 'f' = fall, 's' = spring, then the
# two-digit year. The dicts below are keyed by these codes in the same order,
# which the subplot cells rely on when they pair titles with traces.
semester_codes = ['f16', 's17', 'f17', 's18', 'f18', 's19', 'f19', 's20']

# Bar/marker color per semester, as 'rgba(r, g, b, a)' strings.
semester_colors = {
    'f16': 'rgba(76, 175, 95, 1.0)',
    's17': 'rgba(0, 150, 143, 1.0)',
    # The comma before the alpha channel was missing here, which made the
    # string an invalid rgba() color.
    'f17': 'rgba(0, 188, 223, 1.0)',
    's18': 'rgba(3, 169, 255, 1.0)',
    'f18': 'rgba(33, 150, 255, 1.0)',
    's19': 'rgba(63, 81, 181, 1.0)',
    'f19': 'rgba(103, 58, 183, 1.0)',
    's20': 'rgba(156, 39, 176, 1.0)',
}

# Human-readable label per semester, used as legend names and subplot titles.
semester_names = {
    'f16': 'Fall 16',
    's17': 'Spring 17',
    'f17': 'Fall 17',
    's18': 'Spring 18',
    'f18': 'Fall 18',
    's19': 'Spring 19',
    'f19': 'Fall 19',
    's20': 'Spring 20',
}

## Load Data

In [None]:
# Older task export (presumably the semesters before Fall 18 — confirm).
# Columns 1, 2, 3, 7 and 8 are parsed as dates.
old_df = pd.read_csv(
    'data/School.csv',
    parse_dates=[1, 2, 3, 7, 8],
)

In [None]:
# Fall 18 export; columns 1, 2, 3, 8 and 9 are parsed as dates.
f18_df = pd.read_csv(
    'data/asana-umass-f18.csv',
    parse_dates=[1, 2, 3, 8, 9],
)
# Loaders for the remaining semesters are left disabled:
#s19_df = pd.read_csv('data/asana-umass-s19.csv', parse_dates=[1, 2, 3, 8, 9])
#f19_df = pd.read_csv('data/asana-umass-f19.csv', parse_dates=[1, 2, 3, 8, 9])
#s20_df = pd.read_csv('data/asana-umass-s20.csv', parse_dates=[1, 2, 3, 8, 9])

In [None]:
# Stack the loaded exports into one frame with a fresh 0..n-1 index;
# sort=True sorts the combined columns.
all_df = pd.concat(
    [old_df, f18_df],
    ignore_index=True,
    sort=True,
    verify_integrity=True,
)
all_df.head()

## Task Creation Day of Week

In [None]:
# Day of week on which each task was created (pandas: Monday=0 ... Sunday=6).
created_at = all_df['Created At']
all_df['Created At DOW'] = created_at.dt.dayofweek

In [None]:
# One bar trace per semester: share of tasks created on each day of the week.
dow_traces = []
for code in semester_codes:
    # Compute the normalized counts once and reuse them for both axes.
    dow_share = get_semester_asana(all_df, code)['Created At DOW'].value_counts(normalize=True)
    dow_traces.append(
        go.Bar(
            x=dow_share.index,
            y=dow_share.values,
            name=semester_names[code],
            marker=dict(color=semester_colors[code]),
        )
    )

# Bars for the same weekday are drawn side by side, one per semester.
fig = go.Figure(data=dow_traces, layout=go.Layout(barmode='group'))
iplot(fig, filename='DOW Comparison')

## Busiest Class per Semester

Note: Data is only available for the last 4 semesters because the way I separated tasks before didn't label the class.

In [None]:
# 2x2 grid of donut charts, one per semester, showing how tasks are split
# across the values of the 'Column' field.
# Domain layout taken from:
# https://community.plot.ly/t/setting-up-pie-charts-subplots-with-an-appropriate-size-and-spacing/5066
domains = [
    {'x': [0, .48], 'y': [.51, 1]},  # cell (1,1)
    {'x': [.52, 1], 'y': [.51, 1]},  # cell (1,2)
    {'x': [0, .48], 'y': [0, .49]},  # cell (2,1)
    {'x': [.52, 1], 'y': [0, .49]},  # cell (2,2)
]

donut_traces = []
donut_labels = []
# Only the last four semester codes are plotted.
for slot, code in enumerate(semester_codes[4:]):
    class_counts = get_semester_asana(all_df, code)['Column'].value_counts()
    # slot 0..3 -> (row, col) in the 2x2 grid.
    grid_row, grid_col = divmod(slot, 2)

    donut_traces.append({
        "values": class_counts.values,
        "labels": class_counts.index,
        'domain': domains[slot],
        "name": semester_names[code],
        "hoverinfo": "label+percent+name",
        "hole": .4,
        "type": "pie",
    })

    # Semester label placed in the hole of the matching donut.
    donut_labels.append({
        "font": {"size": 15},
        "showarrow": False,
        "text": semester_names[code],
        "x": 0.82 if grid_col else 0.20,
        "y": 0.23 if grid_row else 0.78,
    })

fig = {
    'data': donut_traces,
    "layout": {
        "title": "Busiest Class per Semester",
        "annotations": donut_labels,
        'autosize': False,
        'height': 850,
        'width': 900,
    },
}

iplot(fig, filename='donut')

## Completion Time

In [None]:
# Time from task creation to completion, as a timedelta.
all_df['Duration'] = all_df['Completed At'] - all_df['Created At']

In [None]:
# Per-semester distribution of completion time for tasks finished in under
# 30 days: x is the duration in whole days, y is the share of those tasks.
data = []
for sem in semester_codes:
    durations = get_semester_asana(all_df, sem)['Duration']
    # .dt.days is used instead of astype('timedelta64[D]'), which pandas >= 2.0
    # rejects. Rows with a missing duration compare False and are dropped.
    quick_share = durations[durations.dt.days < 30].value_counts(normalize=True)
    data.append(
        go.Bar(
            x=quick_share.index.days,
            y=quick_share.values,
            name=semester_names[sem],
            marker={'color': semester_colors[sem]},
        )
    )

layout = go.Layout(
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='grouped-bar')

In [None]:
# Word clouds of task names, split by whether the task took under 3 days.
# .dt.days is used instead of astype('timedelta64[D]'), which pandas >= 2.0
# rejects. Rows with a missing duration fall into neither group.
duration_days = all_df['Duration'].dt.days
less_text = ' '.join(all_df.loc[duration_days < 3, 'Name'].dropna())
grtr_text = ' '.join(all_df.loc[duration_days >= 3, 'Name'].dropna())

less_wordcloud = generate_wordcloud(less_text)
grtr_wordcloud = generate_wordcloud(grtr_text)

# display wordclouds side by side using matplotlib
f, axes = plt.subplots(1, 2, sharex=True)
f.set_size_inches(18, 10)
panels = zip(axes, (less_wordcloud, grtr_wordcloud), ('<3 days', '>=3 days'))
for ax, cloud, title in panels:
    ax.imshow(cloud, interpolation="bilinear")
    ax.set_title(title, fontsize=36)
    ax.axis("off")

## Due Date Frequency

In [None]:
# One bar trace per semester: number of tasks due on each date.
due_traces = []
for idx, code in enumerate(semester_codes):
    due_counts = get_semester_asana(all_df, code)['Due Date'].dropna().value_counts()
    due_traces.append(
        go.Bar(
            x=due_counts.index,
            y=due_counts.values,
            name=semester_names[code],
            marker=dict(color=semester_colors[code]),
            yaxis='y' + str(idx + 1),
        )
    )

fig = tools.make_subplots(rows=4, cols=2, subplot_titles=list(semester_names.values()))

# Fill the 4x2 grid row by row: trace 0 -> (1, 1), trace 1 -> (1, 2), ...
for idx, trace in enumerate(due_traces):
    fig.append_trace(trace, idx // 2 + 1, idx % 2 + 1)

# Each panel shows its own semester's date range, with a shared y scale.
for idx, code in enumerate(semester_codes):
    axis_no = str(idx + 1)
    fig['layout']['xaxis' + axis_no].update(range=get_semester_date_range(code, unix_time=True))
    fig['layout']['yaxis' + axis_no].update(range=[0, 10])

fig.layout.update(height=1000)
fig.layout.update(title='Due Date Frequency')

iplot(fig, filename='due date freq')

## Overdue Tasks

In [None]:
# Completion time relative to the due date; positive values mean the task
# was completed after it was due.
completed_at = all_df['Completed At']
due_date = all_df['Due Date']
all_df['Overdue'] = completed_at - due_date

In [None]:
# One bar trace per semester: share of tasks by how many days before
# (negative) or after (positive) the due date they were completed.
overdue_traces = []
for code in semester_codes:
    # Compute the normalized counts once and reuse them for both axes.
    overdue_share = get_semester_asana(all_df, code)['Overdue'].value_counts(normalize=True)
    overdue_traces.append(
        go.Bar(
            x=overdue_share.index.days,
            y=overdue_share.values,
            name=semester_names[code],
            marker=dict(color=semester_colors[code]),
        )
    )

fig = go.Figure(data=overdue_traces, layout=go.Layout(barmode='group'))
iplot(fig, filename='grouped-bar')

In [None]:
# Overdue distribution drawn as a 4x2 grid with one panel per semester.
spread_traces = []
for idx, code in enumerate(semester_codes):
    overdue_share = get_semester_asana(all_df, code)['Overdue'].value_counts(normalize=True)
    spread_traces.append(
        go.Bar(
            x=overdue_share.index.days,
            y=overdue_share.values,
            name=semester_names[code],
            marker=dict(color=semester_colors[code]),
            yaxis='y' + str(idx + 1),
        )
    )

fig = tools.make_subplots(rows=4, cols=2, subplot_titles=list(semester_names.values()))

# Fill the grid row by row: trace 0 -> (1, 1), trace 1 -> (1, 2), ...
for idx, trace in enumerate(spread_traces):
    fig.append_trace(trace, idx // 2 + 1, idx % 2 + 1)

# Same axis ranges on every panel so the semesters are directly comparable.
for idx, _ in enumerate(semester_codes):
    axis_no = str(idx + 1)
    fig['layout']['xaxis' + axis_no].update(range=[-28, 28])
    fig['layout']['yaxis' + axis_no].update(range=[0, 0.7])

fig.layout.update(height=1000)
fig.layout.update(title='Overdue Spread')

iplot(fig, filename='overdue spread')

In [None]:
# Word clouds of task names, split by whether the task was completed before,
# on, or after its due date.
# .dt.days is used instead of astype('timedelta64[D]'), which pandas >= 2.0
# rejects. Rows with a missing 'Overdue' value fall into none of the groups.
overdue_days = all_df['Overdue'].dt.days
before_text = ' '.join(all_df.loc[overdue_days < 0, 'Name'].dropna())
sameday_text = ' '.join(all_df.loc[overdue_days == 0, 'Name'].dropna())
overdue_text = ' '.join(all_df.loc[overdue_days > 0, 'Name'].dropna())

before_wordcloud = generate_wordcloud(before_text)
sameday_wordcloud = generate_wordcloud(sameday_text)
overdue_wordcloud = generate_wordcloud(overdue_text)

# display wordclouds using matplotlib (2x2 grid, last cell left blank)
f, axes = plt.subplots(2, 2, sharex=True)
f.set_size_inches(18, 10)
panels = [
    (axes[0, 0], before_wordcloud, 'Completed Before'),
    (axes[0, 1], sameday_wordcloud, 'Completed Same Day'),
    (axes[1, 0], overdue_wordcloud, 'Overdue'),
]
for ax, cloud, title in panels:
    ax.imshow(cloud, interpolation="bilinear")
    ax.set_title(title, fontsize=36)
    ax.axis("off")
# Hide the frame of the unused fourth cell.
axes[1, 1].axis("off")

# Exams

In [None]:
# Parse the exported exams calendar (.ics) into an ics Calendar object.
with open('data/Exams_2alvmakoou6sa9ks0roaq79nic@group.calendar.google.com.ics', 'r') as ics_file:
    exams_cal = Calendar(ics_file.readlines())

In [None]:
# Tally exams per calendar day, keyed by 'YYYY-MM-DD' start date.
exams_per_day = {}
for event in exams_cal.events:
    day_key = event.begin.strftime('%Y-%m-%d')
    exams_per_day.setdefault(day_key, 0)
    exams_per_day[day_key] += 1

# Two-column frame: 'date' (converted to datetime below) and 'num' (exam count).
exam_counts = pd.DataFrame.from_dict({
    'date': list(exams_per_day.keys()),
    'num': list(exams_per_day.values()),
})
exam_counts['date'] = pd.to_datetime(exam_counts['date'])

In [None]:
# One bar trace per semester: number of exams on each date.
exam_traces = []
for idx, code in enumerate(semester_codes):
    sem_exams = get_semester_via_col(exam_counts, 'date', code)
    exam_traces.append(
        go.Bar(
            x=sem_exams['date'],
            y=sem_exams['num'],
            name=semester_names[code],
            marker=dict(color=semester_colors[code]),
            yaxis='y' + str(idx + 1),
        )
    )

fig = tools.make_subplots(rows=4, cols=2, subplot_titles=list(semester_names.values()))

# Fill the 4x2 grid row by row: trace 0 -> (1, 1), trace 1 -> (1, 2), ...
for idx, trace in enumerate(exam_traces):
    fig.append_trace(trace, idx // 2 + 1, idx % 2 + 1)

# Each panel shows its own semester's date range, with a shared y scale.
for idx, code in enumerate(semester_codes):
    axis_no = str(idx + 1)
    fig['layout']['xaxis' + axis_no].update(range=get_semester_date_range(code, unix_time=True))
    fig['layout']['yaxis' + axis_no].update(range=[0, 3])

fig.layout.update(height=1000)
fig.layout.update(title='Exams')

# NOTE(review): this filename matches the one used by the Due Date Frequency
# figure — looks like a copy-paste; confirm whether it should be distinct.
iplot(fig, filename='due date freq')