Before we get started, a couple of reminders to keep in mind when using iPython notebooks:

- Remember that you can see from the left side of a code cell when it was last run if there is a number within the brackets.
- When you start a new notebook session, make sure you run all of the cells up to the point where you last left off. Even if the output is still visible from when you ran the cells in your previous session, the kernel starts in a fresh state so you'll need to reload the data, etc. on a new session.
- The previous point is useful to keep in mind if your answers do not match what is expected in the lesson's quizzes. Try reloading the data and run all of the processing steps one by one in order to make sure that you are working with the same variables and data that are at each quiz stage.


## Load Data from CSVs

In [1]:
import unicodecsv

## Longer version of code (replaced w/ shorter, equivalent version below)

enrollments = []  # list that will hold one dictionary per CSV row

# open the file in read mode, binary format ('rb')
f = open('./enrollments.csv', 'rb')
try:
    # DictReader hands back each row as a dictionary,
    # so we can refer to each column by its name rather than its number
    reader = unicodecsv.DictReader(f)

    # for each row in our dataset, store the dictionary row into our list
    for row in reader:
        enrollments.append(row)
finally:
    # close the file even if reading fails part-way through
    f.close()

# check 1st row
enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', '2014-11-10'),
             ('cancel_date', '2015-01-14'),
             ('days_to_cancel', '65'),
             ('is_udacity', 'True'),
             ('is_canceled', 'True')])

In [2]:
# Shorter version: open the file in read mode, binary format, inside a 'with' statement
#    - 'with' saves us from having to close the file ourselves
#    - everything that reads from the file must be indented under the 'with'
#    - the file is closed automatically when the 'with' block ends

with open('enrollments.csv', 'rb') as f:
    # DictReader gives one dictionary per row; list() collects them all w/out an explicit loop
    enrollments = list(unicodecsv.DictReader(f))

# check 1st row
enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', '2014-11-10'),
             ('cancel_date', '2015-01-14'),
             ('days_to_cancel', '65'),
             ('is_udacity', 'True'),
             ('is_canceled', 'True')])

In [3]:
# Read in data from daily_engagement.csv + project_submissions.csv
# (no need to create empty lists first: list() builds each list in one step)

with open('daily_engagement.csv', 'rb') as f:
    daily_engagement = list(unicodecsv.DictReader(f))

with open('project_submissions.csv', 'rb') as f:
    project_submissions = list(unicodecsv.DictReader(f))

In [4]:
daily_engagement[0]

OrderedDict([('acct', '0'),
             ('utc_date', '2015-01-09'),
             ('num_courses_visited', '1.0'),
             ('total_minutes_visited', '11.6793745'),
             ('lessons_completed', '0.0'),
             ('projects_completed', '0.0')])

In [5]:
project_submissions[0]

OrderedDict([('creation_date', '2015-01-14'),
             ('completion_date', '2015-01-16'),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

In [6]:
# do the same via a function
def read_csv(file_name):
    """Read the CSV file `file_name` and return its rows as a list of dictionaries."""
    with open(file_name, 'rb') as csv_file:
        rows = unicodecsv.DictReader(csv_file)
        return list(rows)

In [7]:
enrollments = read_csv('enrollments.csv')
print(enrollments[1])

daily_engagement = read_csv('daily_engagement.csv')
print(daily_engagement[1])

project_submissions = read_csv('project_submissions.csv')
print(project_submissions[1])

OrderedDict([('account_key', '448'), ('status', 'canceled'), ('join_date', '2014-11-05'), ('cancel_date', '2014-11-10'), ('days_to_cancel', '5'), ('is_udacity', 'True'), ('is_canceled', 'True')])
OrderedDict([('acct', '0'), ('utc_date', '2015-01-10'), ('num_courses_visited', '2.0'), ('total_minutes_visited', '37.2848873333'), ('lessons_completed', '0.0'), ('projects_completed', '0.0')])
OrderedDict([('creation_date', '2015-01-10'), ('completion_date', '2015-01-13'), ('assigned_rating', 'INCOMPLETE'), ('account_key', '256'), ('lesson_key', '3176718735'), ('processing_state', 'EVALUATED')])


## Fixing Data Types

In [8]:
from datetime import datetime as dt

# Currently every value in the dict is a string bc csv library doesn't check the data type
# want to convert data types up front

# Convert a date string in 'YYYY-MM-DD' form into a Python datetime object.
#    - An empty string means no date was given, so None is returned
def parse_date(date):
    if date != '':
        return dt.strptime(date, '%Y-%m-%d')
    return None
    
# Convert a string that is either empty or represents an int: '' gives None, anything else goes through int().
def parse_maybe_int(i):
    return None if i == '' else int(i)

# Clean up the data types in enrollments table.
# Each column is paired with the function that converts its string value;
# the columns are converted in the order listed here.
enrollment_converters = {
    'cancel_date': parse_date,
    'days_to_cancel': parse_maybe_int,
    'is_canceled': lambda value: value == 'True',   # 'True' -> True, anything else -> False
    'is_udacity': lambda value: value == 'True',
    'join_date': parse_date,
}

for enrollment in enrollments:
    for field, convert in enrollment_converters.items():
        enrollment[field] = convert(enrollment[field])

enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', datetime.datetime(2014, 11, 10, 0, 0)),
             ('cancel_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('days_to_cancel', 65),
             ('is_udacity', True),
             ('is_canceled', True)])

In [9]:
# Clean up data types in the engagement table
for engagement_record in daily_engagement:
    # the count columns hold strings like '1.0': int() can't convert those directly,
    # so each one goes through float() first
    for count_field in ('lessons_completed', 'num_courses_visited', 'projects_completed'):
        engagement_record[count_field] = int(float(engagement_record[count_field]))

    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])

daily_engagement[0]

OrderedDict([('acct', '0'),
             ('utc_date', datetime.datetime(2015, 1, 9, 0, 0)),
             ('num_courses_visited', 1),
             ('total_minutes_visited', 11.6793745),
             ('lessons_completed', 0),
             ('projects_completed', 0)])

In [10]:
# Clean up data types in the submissions table: both date columns go through parse_date
for submission in project_submissions:
    for date_field in ('completion_date', 'creation_date'):
        submission[date_field] = parse_date(submission[date_field])

project_submissions[0]

OrderedDict([('creation_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('completion_date', datetime.datetime(2015, 1, 16, 0, 0)),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

Note when running the above cells that we are actively changing the contents of our data variables. If you try to run these cells multiple times in the same session, an error will occur.

## Questions that could be asked and answered with this data

* How long did it take students to submit projects?
* How do students who pass projects differ from those who didn't?
* How much time students spend taking classes
* How time spent taking classes relates to lessons/projects completed
* How engagement changes
* How many times students submit
* And more

## Problems in the Data

In [11]:
# Rename the "acct" column in the daily_engagement table to "account_key".
for a in daily_engagement:
    # the check makes this cell safe to re-run: a row that was already renamed
    # has no 'acct' key any more and is left alone (instead of raising KeyError)
    if 'acct' in a:
        a['account_key'] = a.pop('acct')  # remove acct via .pop() and add its value back w/ new name

daily_engagement[0]['account_key']

'0'

## Investigating the Data

In [12]:
# Find total number of rows and the number of unique students (account keys) in each table.
# A set keeps each distinct 'account_key' only once, so its length is the number of unique students.

enrollment_num_rows = len(enrollments)
enrollment_unique_students = len({row['account_key'] for row in enrollments})

daily_engagement_num_rows = len(daily_engagement)
daily_engagement_unique_students = len({row['account_key'] for row in daily_engagement})

project_submissions_num_rows = len(project_submissions)
project_submissions_unique_students = len({row['account_key'] for row in project_submissions})

In [13]:
# do same as above with a function

def get_unique_students(table):
    """Return the set of distinct 'account_key' values found in the rows of `table`."""
    return {row['account_key'] for row in table}
    
enrollment_unique_students = len(get_unique_students(enrollments))
daily_engagement_unique_students = len(get_unique_students(daily_engagement))
project_submissions_unique_students = len(get_unique_students(project_submissions))

In [14]:
print(enrollment_num_rows)
print(enrollment_unique_students)
print(daily_engagement_num_rows)
print(daily_engagement_unique_students)
print(project_submissions_num_rows)
print(project_submissions_unique_students)

1640
1302
136240
1237
3642
743


*Some students have no engagement data?*

* Should be more in engagement table, which has a row for each day a student is enrolled, even if they didn't engage
* Both tables should have had the same number of unique students

## Missing Engagement Records

In [15]:
#####################################
#                 4                 #
#####################################

# Why are students missing from the daily engagement table?

# Find any 1 student enrollments where the student is missing from daily engagement.

# build the set of engagement account keys ONCE, instead of rebuilding it from the
# whole engagement table for every enrollment row
engaged_students = get_unique_students(daily_engagement)

for s in enrollments:                  # for each student in the enrollment table
    s_key = s['account_key']           # get their account key
    if s_key not in engaged_students:  # if the current student key is NOT in unique keys of engagement
        print(s)                       # show that student's enrollment record
        break                          # one example is enough

OrderedDict([('account_key', '1219'), ('status', 'canceled'), ('join_date', datetime.datetime(2014, 11, 12, 0, 0)), ('cancel_date', datetime.datetime(2014, 11, 12, 0, 0)), ('days_to_cancel', 0), ('is_udacity', False), ('is_canceled', True)])


See that **is_canceled** is True and **status** = 'canceled', but most importantly, **Join Date** = **Cancel Date**.

## Checking for More Problem Records

In [16]:
import time
from datetime import datetime, timedelta

enrollments[5]['cancel_date']  - enrollments[5]['join_date'] 

datetime.timedelta(13)

In [17]:
#print(float(timedelta(1).days))
type((enrollments[5]['cancel_date']  - enrollments[5]['join_date']).days)

int

In [18]:
### Find the number of surprising data points (enrollments missing from the engagement table w/ at least 1 day
# of enrollment) that remain, if any.

# build the set of engagement account keys ONCE, instead of rebuilding it from the
# whole engagement table for every enrollment row
engaged_students = get_unique_students(daily_engagement)

missing_students = []

for s in enrollments:                                   # for each student in the enrollment table
    s_key = s['account_key']                            # get their account key
    if s_key not in engaged_students:                   # if student key is NOT among the unique keys of engagement tbl
        if s['join_date'] != s['cancel_date']:          # and if they didn't cancel the same day they joined
            missing_students.append(s)

print(len(missing_students))

3


So there are 3 "missing" enrollment records (belonging to 2 accounts: 1304 and 1101).

In [19]:
# get the missing students from the enrollment table
for i in missing_students:
    print(i)

OrderedDict([('account_key', '1304'), ('status', 'canceled'), ('join_date', datetime.datetime(2015, 1, 10, 0, 0)), ('cancel_date', datetime.datetime(2015, 3, 10, 0, 0)), ('days_to_cancel', 59), ('is_udacity', True), ('is_canceled', True)])
OrderedDict([('account_key', '1304'), ('status', 'canceled'), ('join_date', datetime.datetime(2015, 3, 10, 0, 0)), ('cancel_date', datetime.datetime(2015, 6, 17, 0, 0)), ('days_to_cancel', 99), ('is_udacity', True), ('is_canceled', True)])
OrderedDict([('account_key', '1101'), ('status', 'current'), ('join_date', datetime.datetime(2015, 2, 25, 0, 0)), ('cancel_date', None), ('days_to_cancel', None), ('is_udacity', True), ('is_canceled', False)])


The "missing" students are test accounts from Udacity

## Tracking Down the Remaining Problems

In [20]:
# Create a set of the account keys for all Udacity test accounts, which causes our 3 odd records from above

# keep the account key of every enrollment flagged as a test acct;
# being a set, each key is stored only once
udacity_test_accounts = {
    enrollment['account_key']
    for enrollment in enrollments
    if enrollment['is_udacity']
}

len(udacity_test_accounts)

6

So there are 6 distinct test accounts in our enrollment table.

In [21]:
# function to remove test accts

def remove_udacity_accounts(data):
    """Return a new list holding the rows of `data` whose 'account_key' is not in udacity_test_accounts."""
    return [dp for dp in data if dp['account_key'] not in udacity_test_accounts]

In [22]:
# Remove Udacity test accounts from all 3 tables
# place into new variables in case we want to look at the old data again in the future

real_enrollments = remove_udacity_accounts(enrollments)
real_engagement = remove_udacity_accounts(daily_engagement)
real_submissions = remove_udacity_accounts(project_submissions)

print(len(real_enrollments))
print(len(real_engagement))
print(len(real_submissions))

1622
135656
3634


## Refining the Question

In [23]:
#####################################
#                 6                 #
#####################################

# Create a dictionary, paid_students, containing all students who either haven't canceled yet or
#     who remained enrolled for more than 7 days.
# The keys should be account keys + the values should be the date the student enrolled.

# create empty dictionary
paid_students = {}

for s in real_enrollments:
    # keep the enrollment if the student has not canceled (cancel_date is None)
    # or canceled MORE than 7 days after joining
    if s['cancel_date'] is None or (s['cancel_date'] - s['join_date']).days > 7:
        # students can enroll more than once: keep only the most recent join date per acct key
        if s['account_key'] not in paid_students or s['join_date'] > paid_students[s['account_key']]:
            paid_students[s['account_key']] = s['join_date']

print(len(paid_students))

995


In [34]:
# check 5 KV pairs from the dictionary
from itertools import islice

def take(n, iterable):
    """Return the first `n` items of `iterable` as a list."""
    first_items = islice(iterable, n)
    return list(first_items)

take(5, paid_students.items())

[('429', datetime.datetime(2015, 3, 10, 0, 0)),
 ('60', datetime.datetime(2015, 4, 1, 0, 0)),
 ('369', datetime.datetime(2014, 11, 10, 0, 0)),
 ('322', datetime.datetime(2015, 3, 10, 0, 0)),
 ('336', datetime.datetime(2014, 11, 10, 0, 0))]

## Getting Data from First Week

In [25]:
# Takes a student's join date and the date of a specific engagement record + returns True if the
#    engagement date is fewer than 7 days after the join date.
# NOTE: there is no lower bound - an engagement date BEFORE the join date gives a negative
#    day count, which is also < 7, so it returns True as well.

def within_one_week(join_date, engagement_date):
    days_since_joining = (engagement_date - join_date).days
    return days_since_joining < 7

In [36]:
real_engagement[:5]

[OrderedDict([('utc_date', datetime.datetime(2015, 1, 9, 0, 0)),
              ('num_courses_visited', 1),
              ('total_minutes_visited', 11.6793745),
              ('lessons_completed', 0),
              ('projects_completed', 0),
              ('account_key', '0')]),
 OrderedDict([('utc_date', datetime.datetime(2015, 1, 10, 0, 0)),
              ('num_courses_visited', 2),
              ('total_minutes_visited', 37.2848873333),
              ('lessons_completed', 0),
              ('projects_completed', 0),
              ('account_key', '0')]),
 OrderedDict([('utc_date', datetime.datetime(2015, 1, 11, 0, 0)),
              ('num_courses_visited', 2),
              ('total_minutes_visited', 53.6337463333),
              ('lessons_completed', 0),
              ('projects_completed', 0),
              ('account_key', '0')]),
 OrderedDict([('utc_date', datetime.datetime(2015, 1, 12, 0, 0)),
              ('num_courses_visited', 1),
              ('total_minutes_visited', 33.4892

In [39]:
#####################################
#                 7                 #
#####################################

## Create a list of rows from the engagement table for students who PAID and whose engagement date is within
##     1 week of the student's join date.

def remove_free_trial_accounts(data):
    """Return a new list holding the rows of `data` whose 'account_key' is a key of paid_students."""
    return [dp for dp in data if dp['account_key'] in paid_students]

In [45]:
paid_enrollments = remove_free_trial_accounts(real_enrollments)
paid_engagement = remove_free_trial_accounts(real_engagement)
paid_submissions = remove_free_trial_accounts(real_submissions)

print(len(paid_enrollments))
print(len(paid_engagement))
print(len(paid_submissions))

1293
134549
3618


In [48]:
paid_engagement[:2]

[OrderedDict([('utc_date', datetime.datetime(2015, 1, 9, 0, 0)),
              ('num_courses_visited', 1),
              ('total_minutes_visited', 11.6793745),
              ('lessons_completed', 0),
              ('projects_completed', 0),
              ('account_key', '0')]),
 OrderedDict([('utc_date', datetime.datetime(2015, 1, 10, 0, 0)),
              ('num_courses_visited', 2),
              ('total_minutes_visited', 37.2848873333),
              ('lessons_completed', 0),
              ('projects_completed', 0),
              ('account_key', '0')])]

In [49]:
# keep only the engagement records within one week of the student's join date
#   - the join date is the VALUE stored in paid_students for the record's acct key
paid_engagement_in_first_week = [
    e for e in paid_engagement
    if within_one_week(paid_students[e['account_key']], e['utc_date'])
]

# how many paid engagements in the 1st week
len(paid_engagement_in_first_week)

21508

## Exploring Student Engagement

* Currently have a bunch of engagement records for a set of particular students (paid students in 1st week)
* Want to separate them into groups w/ all engagement records for particular students in each
* Can use these groups w/ a dictionary w/ keys = acct keys and values = list of engagement records
* Then we'll compute the total # of minutes spent by each student in the classroom by summing the # of minutes in each engagement record for that student
* Then we'll average this out to get average time spent by each student

In [94]:
from collections import defaultdict

# Create a dictionary of engagement grouped by student.
#    - keys = account keys, values = lists of engagement records.

# defaultdict(list) instead of a vanilla dictionary:
#   - looking up an acct key that isn't in the dict yet gives a new empty list instead of an error

account_engagement = defaultdict(list)

for record in paid_engagement_in_first_week:
    # fetch the list for this acct key (created empty on first use) + append the record to it
    records_for_account = account_engagement[record['account_key']]
    records_for_account.append(record)

In [95]:
# Create a dictionary w/ total minutes each student spent in the classroom during the 1st week.
#     keys = account keys, values = numbers (total minutes)

total_minutes_by_account = {}

for key, engagement_values in account_engagement.items():

    # running total for this one account
    #   - deliberately NOT named `total_minutes`: other cells use that name for the list of
    #     every account's total, and re-running this cell would replace that list with a number
    account_minutes = 0

    for e in engagement_values:                       # for each engagement record of this account
        account_minutes += e['total_minutes_visited'] # add the minutes for that record to the total

    total_minutes_by_account[key] = account_minutes

In [104]:
take(5, total_minutes_by_account.values())

[494.88049616599994,
 18.576384666670002,
 0.0,
 2530.6469816678004,
 33.3214046667]

In [112]:
import numpy as np


# Summarize the data about minutes spent in the classroom

# get all minute values and put into a list
total_minutes = list(total_minutes_by_account.values())

# print each statistic rounded to 2 decimals
for label, statistic in (
    ('Mean:', np.mean),
    ('Standard deviation:', np.std),
    ('Minimum:', np.min),
    ('Maximum:', np.max),
):
    print(label, round(statistic(total_minutes), 2))

Mean: 647.59
Standard deviation: 1129.27
Minimum: 0.0
Maximum: 10568.1


See that the standard deviation is quite large, almost twice the mean, and we have an extraordinarily large value for the maximum number of minutes in that first week. It's actually *greater* than the number of minutes in a week (7 × 24 × 60 = 10,080).

There must be something wrong

## Debugging Data Analysis Code

In [138]:
#####################################
#                 8                 #
#####################################

### Find the acct key of the surprising data point (student w/ max engagement value)

# largest first-week total across all accounts
max_minutes = max(total_minutes_by_account.values())

# every acct key whose total equals that maximum
#   - compared against the exact maximum; truncating it with int() and testing `>`
#     finds nothing when the maximum is a whole number, and can also pick up other
#     accounts whose totals lie between int(max) and max
incorrect_time = [k for k, v in total_minutes_by_account.items() if v == max_minutes]

incorrect_time

['108']

In [144]:
## For the above acct key, get all engagement records

for e in paid_engagement_in_first_week:
    if e['account_key'] == incorrect_time[0]:
        print(e)

OrderedDict([('utc_date', datetime.datetime(2015, 1, 7, 0, 0)), ('num_courses_visited', 1), ('total_minutes_visited', 50.9938951667), ('lessons_completed', 0), ('projects_completed', 0), ('account_key', '108')])
OrderedDict([('utc_date', datetime.datetime(2015, 1, 8, 0, 0)), ('num_courses_visited', 2), ('total_minutes_visited', 688.3034385), ('lessons_completed', 5), ('projects_completed', 0), ('account_key', '108')])
OrderedDict([('utc_date', datetime.datetime(2015, 1, 9, 0, 0)), ('num_courses_visited', 2), ('total_minutes_visited', 427.691757667), ('lessons_completed', 1), ('projects_completed', 0), ('account_key', '108')])
OrderedDict([('utc_date', datetime.datetime(2015, 1, 10, 0, 0)), ('num_courses_visited', 3), ('total_minutes_visited', 165.6270925), ('lessons_completed', 0), ('projects_completed', 0), ('account_key', '108')])
OrderedDict([('utc_date', datetime.datetime(2015, 1, 11, 0, 0)), ('num_courses_visited', 0), ('total_minutes_visited', 0.0), ('lessons_completed', 0), ('pr

We're getting way more than 7 entries, which is wrong because we're only looking at the engagement for the *1st week* of enrollment.

So something must be wrong with the **within_one_week** function.

It turns out that we did not check that the engagement date did indeed come *after* the join date

In [154]:
# Takes a student's join date and the date of a specific engagement record + returns True if the
#    engagement date is on or after the join date and fewer than 7 days after it

def within_one_week(join_date, engagement_date):
    days_since_joining = (engagement_date - join_date).days
    return 0 <= days_since_joining < 7

# remove free trial accounts: keep only the rows whose acct key is a key of paid_students
def remove_free_trial_accounts(data):
    return [dp for dp in data if dp['account_key'] in paid_students]

# keep only the records that belong to paid students
paid_enrollments = remove_free_trial_accounts(real_enrollments)
paid_engagement = remove_free_trial_accounts(real_engagement)
paid_submissions = remove_free_trial_accounts(real_submissions)

# of those, keep only the engagement records from each student's 1st week
#   - the join date is looked up in paid_students by the record's acct key
paid_engagement_in_first_week = [
    e for e in paid_engagement
    if within_one_week(paid_students[e['account_key']], e['utc_date'])
]

# group the 1st-week records per student (key = acct key, value = list of that student's records)
account_engagement = defaultdict(list)
for e in paid_engagement_in_first_week:
    account_engagement[e['account_key']].append(e)

# get total minutes by student, adding up record by record
total_minutes_by_account = {}
for key, engagement_values in account_engagement.items():
    minutes_so_far = 0
    for e in engagement_values:
        minutes_so_far += e['total_minutes_visited']
    total_minutes_by_account[key] = minutes_so_far

# get new statistics over every student's total, rounded to 2 decimals
total_minutes = list(total_minutes_by_account.values())

for label, statistic in (
    ('Mean:', np.mean),
    ('Standard deviation:', np.std),
    ('Minimum:', np.min),
    ('Maximum:', np.max),
):
    print(label, round(statistic(total_minutes), 2))

Mean: 306.71
Standard deviation: 413.0
Minimum: 0.0
Maximum: 3564.73


So Standard deviation is still a bit more than the mean, and the max amount of engagement is about 60 hours, but let's check the days of engagement again.

In [155]:
# largest first-week total across all accounts
max_minutes = max(total_minutes_by_account.values())

# every acct key whose total equals that maximum
#   - compared against the exact maximum; truncating it with int() and testing `>`
#     finds nothing when the maximum is a whole number
incorrect_time = [k for k, v in total_minutes_by_account.items() if v == max_minutes]

# print every 1st-week engagement record of that account
for e in paid_engagement_in_first_week:
    if e['account_key'] == incorrect_time[0]:
        print(e)

OrderedDict([('utc_date', datetime.datetime(2015, 7, 9, 0, 0)), ('num_courses_visited', 4), ('total_minutes_visited', 850.519339666), ('lessons_completed', 4), ('projects_completed', 0), ('account_key', '163')])
OrderedDict([('utc_date', datetime.datetime(2015, 7, 10, 0, 0)), ('num_courses_visited', 6), ('total_minutes_visited', 872.633923334), ('lessons_completed', 6), ('projects_completed', 0), ('account_key', '163')])
OrderedDict([('utc_date', datetime.datetime(2015, 7, 11, 0, 0)), ('num_courses_visited', 2), ('total_minutes_visited', 777.018903666), ('lessons_completed', 6), ('projects_completed', 0), ('account_key', '163')])
OrderedDict([('utc_date', datetime.datetime(2015, 7, 12, 0, 0)), ('num_courses_visited', 1), ('total_minutes_visited', 294.568774), ('lessons_completed', 2), ('projects_completed', 0), ('account_key', '163')])
OrderedDict([('utc_date', datetime.datetime(2015, 7, 13, 0, 0)), ('num_courses_visited', 3), ('total_minutes_visited', 471.2139785), ('lessons_completed

So we get 7 days of engagement. Perfect.

## Lessons Completed in First Week

In [165]:
#####################################
#                 9                 #
#####################################

## Adapt the code above to find the mean, standard deviation, minimum, and maximum for
## the number of lessons completed by each student during the first week. Try creating
## one or more functions to re-use the code above.

paid_engagement_in_first_week[:1]

[OrderedDict([('utc_date', datetime.datetime(2015, 1, 9, 0, 0)),
              ('num_courses_visited', 1),
              ('total_minutes_visited', 11.6793745),
              ('lessons_completed', 0),
              ('projects_completed', 0),
              ('account_key', '0')])]

In [180]:
# total lessons completed by each student during the 1st week
#     keys = account keys, values = sum of 'lessons_completed' over that student's records
number_lessons_by_account = {
    key: sum(e['lessons_completed'] for e in engagement_values)
    for key, engagement_values in account_engagement.items()
}

# get all per-student totals and put into a list
total_lessons = list(number_lessons_by_account.values())

# print each statistic rounded to 2 decimals
for label, statistic in (
    ('Mean:', np.mean),
    ('Standard deviation:', np.std),
    ('Minimum:', np.min),
    ('Maximum:', np.max),
):
    print(label, round(statistic(total_lessons), 2))

Mean: 1.64
Standard deviation: 3.0
Minimum: 0
Maximum: 36


## Now do the same but with functions

In [202]:
def group_data(data, key):
    """Group the records of `data` by their value for `key`.

    Returns a defaultdict(list) mapping each distinct value of record[key]
    to the list of records that have it, in their original order.
    """
    buckets = defaultdict(list)
    for record in data:
        bucket = buckets[record[key]]   # created as an empty list on first use
        bucket.append(record)
    return buckets

#***************************************************************************

def find_totals(data, field):
    """Add up `field` over each group of records.

    `data` maps a group key to a list of records; the result is a plain dict
    mapping each group key to the sum of record[field] over that list
    (0 for an empty list).
    """
    totals = {}
    for group_key, records in data.items():
        running_total = 0
        for record in records:
            running_total += record[field]
        totals[group_key] = running_total
    return totals
        
#***************************************************************************    
    
def get_stats(dic):
    """Print the mean, standard deviation, minimum and maximum of the values of `dic`, each rounded to 2 decimals."""
    values = list(dic.values())

    summaries = (
        ('Mean:', np.mean),
        ('Standard deviation:', np.std),
        ('Minimum:', np.min),
        ('Maximum:', np.max),
    )
    for label, statistic in summaries:
        print(label, round(statistic(values), 2))

In [201]:
lessons_by_account = group_data(paid_engagement_in_first_week,'account_key')
total_lessons_by_account = find_totals(lessons_by_account,'lessons_completed')
get_stats(total_lessons_by_account)

Mean: 1.64
Standard deviation: 3.0
Minimum: 0
Maximum: 36


In [203]:
# NOTE: despite the "lessons" in the variable names, this cell sums 'projects_completed',
#   so total_lessons_by_account holds the number of PROJECTS completed per account here
lessons_by_account = group_data(paid_engagement_in_first_week,'account_key')
total_lessons_by_account = find_totals(lessons_by_account,'projects_completed')
get_stats(total_lessons_by_account)

Mean: 0.01
Standard deviation: 0.09
Minimum: 0
Maximum: 2


## Number of Visits in First Week

In [223]:
######################################
#                 10                 #
######################################

## Find stats for  number of days each student visits the classroom during the first week.

# NOTE: list.copy() makes a SHALLOW copy - the new list holds the very same record
#   dictionaries, so adding 'has_visited' below also adds it to the records in
#   paid_engagement_in_first_week (this is not an independent backup)
paid_engagement_in_first_week_test = paid_engagement_in_first_week.copy()

# for each record (day), mark whether the student visited at least one course that day
for record in paid_engagement_in_first_week_test:
    record['has_visited'] = record['num_courses_visited'] > 0

# inspect
paid_engagement_in_first_week_test

[OrderedDict([('utc_date', datetime.datetime(2015, 1, 9, 0, 0)),
              ('num_courses_visited', 1),
              ('total_minutes_visited', 11.6793745),
              ('lessons_completed', 0),
              ('projects_completed', 0),
              ('account_key', '0'),
              ('has_visited', True)]),
 OrderedDict([('utc_date', datetime.datetime(2015, 1, 10, 0, 0)),
              ('num_courses_visited', 2),
              ('total_minutes_visited', 37.2848873333),
              ('lessons_completed', 0),
              ('projects_completed', 0),
              ('account_key', '0'),
              ('has_visited', True)]),
 OrderedDict([('utc_date', datetime.datetime(2015, 1, 11, 0, 0)),
              ('num_courses_visited', 2),
              ('total_minutes_visited', 53.6337463333),
              ('lessons_completed', 0),
              ('projects_completed', 0),
              ('account_key', '0'),
              ('has_visited', True)]),
 OrderedDict([('utc_date', datetime.datetime

In [225]:
# get the stats for days visited in the 1st week
#   - 'has_visited' is True/False per daily record; find_totals adds the values up with +=
#     starting from 0, so every True counts as 1 and the total is the number of days visited
engagement_by_account = group_data(paid_engagement_in_first_week,'account_key')
total_days_by_account = find_totals(engagement_by_account,'has_visited')
get_stats(total_days_by_account)

Mean: 2.87
Standard deviation: 2.26
Minimum: 0
Maximum: 7


## Splitting out Passing Students

In [None]:
######################################
#                 11                 #
######################################

## Create two lists of engagement data for paid students in the first week.
## The first list should contain data for students who eventually pass the
## subway project, and the second list should contain data for students
## who do not.

subway_project_lesson_keys = ['746169184', '3176718735']

passing_engagement =
non_passing_engagement =

## Comparing the Two Student Groups

In [None]:
######################################
#                 12                 #
######################################

## Compute some metrics you're interested in and see how they differ for
## students who pass the subway project vs. students who don't. A good
## starting point would be the metrics we looked at earlier (minutes spent
## in the classroom, lessons completed, and days visited).

## Making Histograms

In [None]:
######################################
#                 13                 #
######################################

## Make histograms of the three metrics we looked at earlier for both
## students who passed the subway project and students who didn't. You
## might also want to make histograms of any other metrics you examined.

## Improving Plots and Sharing Findings

In [None]:
######################################
#                 14                 #
######################################

## Make a more polished version of at least one of your visualizations
## from earlier. Try importing the seaborn library to make the visualization
## look better, adding axis labels and a title, and changing one or more
## arguments to the hist() function.