## Load Data from CSVs

In [135]:
import unicodecsv

## Longer version of code (replaced with shorter, equivalent version below)

# enrollments = []
# f = open('enrollments.csv', 'rb')
# reader = unicodecsv.DictReader(f)
# for row in reader:
#     enrollments.append(row)
# f.close()
def read_csv(filename):
    with open(filename,'rb') as f:
        reader=unicodecsv.DictReader(f)
        return list(reader)
with open('enrollments.csv', 'rb') as f:
    reader = unicodecsv.DictReader(f)
    enrollments = list(reader)

In [136]:
#####################################
#                 1                 #
#####################################

## Read in the data from daily_engagement.csv and project_submissions.csv 
## and store the results in the below variables.
## Then look at the first row of each table.

daily_engagement = read_csv("./daily_engagement.csv")
project_submissions = read_csv("./project_submissions.csv")

## Fixing Data Types

In [137]:
from datetime import datetime as dt

# Takes a date as a string, and returns a Python datetime object. 
# If there is no date given, returns None
def parse_date(date):
    if date == '':
        return None
    else:
        return dt.strptime(date, '%Y-%m-%d')
    
# Takes a string which is either an empty string or represents an integer,
# and returns an int or None.
def parse_maybe_int(i):
    if i == '':
        return None
    else:
        return int(i)

# Clean up the data types in the enrollments table
for enrollment in enrollments:
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'True'
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'True'
    enrollment['join_date'] = parse_date(enrollment['join_date'])
    
enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', datetime.datetime(2014, 11, 10, 0, 0)),
             ('cancel_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('days_to_cancel', 65),
             ('is_udacity', True),
             ('is_canceled', True)])

In [138]:
# Clean up the data types in the engagement table
for engagement_record in daily_engagement:
    
    engagement_record['lessons_completed'] = int(float(engagement_record['lessons_completed']))
    engagement_record['num_courses_visited'] = int(float(engagement_record['num_courses_visited']))
    engagement_record['projects_completed'] = int(float(engagement_record['projects_completed']))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])
    
daily_engagement[0]

OrderedDict([('acct', '0'),
             ('utc_date', datetime.datetime(2015, 1, 9, 0, 0)),
             ('num_courses_visited', 1),
             ('total_minutes_visited', 11.6793745),
             ('lessons_completed', 0),
             ('projects_completed', 0)])

In [139]:
# Clean up the data types in the submissions table
for submission in project_submissions:
    submission['completion_date'] = parse_date(submission['completion_date'])
    submission['creation_date'] = parse_date(submission['creation_date'])

project_submissions[0]

OrderedDict([('creation_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('completion_date', datetime.datetime(2015, 1, 16, 0, 0)),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

## Investigating the Data

In [140]:
#####################################
#                 2                 #
#####################################
def unique(table,column):
    unique_acc=set()
    for row in table:
        unique_acc.add(row[column])
    return(unique_acc)
## Find the total number of rows and the number of unique students (account keys)
## in each table.
enrollment_unique=unique(enrollments,'account_key')
engage_unique=unique(daily_engagement,'acct')

In [141]:
print(len(engage_unique))
print(len(enrollment_unique))

1237
1302


## Problems in the Data

In [142]:
#####################################
#                 3                 #
#####################################
for engage in daily_engagement:
    engage['account_key']=engage['acct']
    del(engage['acct'])
daily_engagement[0]['account_key']

## Rename the "acct" column in the daily_engagement table to "account_key".

'0'

## Missing Engagement Records

In [143]:
#####################################
#                 4                 #
#####################################
enroll_proxy=enrollments
count=0
for enroll in enroll_proxy:
    student=enroll['account_key']
    
    if student not in engage_unique:
        print(enroll['account_key'])
        if enroll['days_to_cancel'] != 0:
            
            print(enroll['days_to_cancel'])
## Find any one student enrollments where the student is missing from the daily engagement table.
## Output that enrollment.

1219
871
1218
654
654
654
964
817
1171
884
875
1120
728
1191
1304
59
1304
99
1010
841
707
717
727
1273
1238
1044
749
1129
1129
926
711
803
799
789
819
819
902
996
1213
733
1237
1069
1086
1190
914
914
968
1155
766
739
1222
1125
1025
750
889
737
878
981
1101
None
1148
1186
725
1145
802
664
997
870
1284
1079
1063
1270
1291
1241


## Checking for More Problem Records

In [144]:
#####################################
#                 5                 #
#####################################

## Find the number of surprising data points (enrollments missing from
## the engagement table) that remain, if any.

## Tracking Down the Remaining Problems

In [145]:
# Create a set of the account keys for all Udacity test accounts
udacity_test_accounts = set()
for enrollment in enrollments:
    if enrollment['is_udacity']:
        udacity_test_accounts.add(enrollment['account_key'])
len(udacity_test_accounts)

6

In [146]:
# Given some data with an account_key field, removes any records corresponding to Udacity test accounts
def remove_udacity_accounts(data):
    non_udacity_data = []
    for data_point in data:
        if data_point['account_key'] not in udacity_test_accounts:
            non_udacity_data.append(data_point)
    return non_udacity_data

In [147]:
# Remove Udacity test accounts from all three tables
non_udacity_enrollments = remove_udacity_accounts(enrollments)
non_udacity_engagement = remove_udacity_accounts(daily_engagement)
non_udacity_submissions = remove_udacity_accounts(project_submissions)

print (len(non_udacity_enrollments))
print (len(non_udacity_engagement))
print (len(non_udacity_submissions))

1622
135656
3634


## Refining the Question

In [170]:
#####################################
#                 6                 #
#####################################

## Create a dictionary named paid_students containing all students who either
## haven't canceled yet or who remained enrolled for more than 7 days. The keys
## should be account keys, and the values should be the date the student enrolled.
paid_students = {}
for enrollment in non_udacity_enrollments:
    if (not enrollment['is_canceled'] or
            enrollment['days_to_cancel'] > 7):
        account_key = enrollment['account_key']
        enrollment_date = enrollment['join_date']
        if (account_key not in paid_students or
                enrollment_date > paid_students[account_key]):
            paid_students[account_key] = enrollment_date
len(paid_students)

995

## Getting Data from First Week

In [171]:
# Takes a student's join date and the date of a specific engagement record,
# and returns True if that engagement record happened within one week
# of the student joining.
def within_one_week(join_date, engagement_date):
    time_delta = engagement_date - join_date
    return time_delta.days >= 0 and time_delta.days < 7

In [172]:
#####################################
#                 7                 #
#####################################

## Create a list of rows from the engagement table including only rows where
## the student is one of the paid students you just found, and the date is within
## one week of the student's join date.

paid_engagement_in_first_week = []
count=0
for engage in daily_engagement:
    if engage['account_key'] in paid_students:
        engagement_date=engage['utc_date']
        joint_date=paid_students[engage['account_key']]
        delta=within_one_week(joint_date,engagement_date)
        if delta is True:
            paid_engagement_in_first_week.append(engage)
        
len(paid_engagement_in_first_week)  

6919

## Exploring Student Engagement

In [174]:
from collections import defaultdict

# Create a dictionary of engagement grouped by student.
# The keys are account keys, and the values are lists of engagement records.
engagement_by_account = defaultdict(list)
for engagement_record in paid_engagement_in_first_week:
    account_key = engagement_record['account_key']
    engagement_by_account[account_key].append(engagement_record)


In [177]:
# Create a dictionary with the total minutes each student spent in the classroom during the first week.
# The keys are account keys, and the values are numbers (total minutes)
total_minutes_by_account = {}
lessons_completed_by_account={}
for account_key, engagement_for_student in engagement_by_account.items():
    total_minutes = 0
    
    for engagement_record in engagement_for_student:
        total_minutes += engagement_record['total_minutes_visited']
    total_minutes_by_account[account_key] = total_minutes
for account_key, engagement_for_student in engagement_by_account.items():
    total_minutes = 0
    
    for engagement_record in engagement_for_student:
        total_minutes += engagement_record['lessons_completed']
    lessons_completed_by_account[account_key] = total_minutes

#print(total_minutes_by_account)

In [178]:
import numpy as np

# Summarize the data about minutes spent in the classroom
total_minutes = list(total_minutes_by_account.values())
max_total_min=max(total_minutes)
print(max_total_min)

print ('Mean',np.mean(total_minutes))
print ('Standard deviation:', np.std(total_minutes))
print ('Minimum:', np.min(total_minutes))
print ('Maximum:', np.max(total_minutes))

3564.7332644989997
Mean 306.708326753
Standard deviation: 412.996933409
Minimum: 0.0
Maximum: 3564.7332645


## Debugging Data Analysis Code

In [156]:
#####################################
#                 8                 #
#####################################

## Go through a similar process as before to see if there is a problem.
## Locate at least one surprising piece of data, output it, and take a look at it.

## Lessons Completed in First Week

In [179]:
#####################################
#                 9                 #
#####################################

def describe_data(data):
    print ('Mean', np.mean(data))
    print ('Standard deviation:', np.std(data))
    print ('Minimum:', np.min(data))
    print ('Maximum:', np.max(data))
describe_data(list(total_minutes_by_accounts.values()))
describe_data(list(lessons_completed_by_account.values()))
#describe_data(list(lessons_completed_by_account.values()))
## Adapt the code above to find the mean, standard deviation, minimum, and maximum for
## the number of lessons completed by each student during the first week. Try creating
## one or more functions to re-use the code above.

Mean 305174.78512
Standard deviation: 0.0
Minimum: 305174.78512
Maximum: 305174.78512
Mean 1.63618090452
Standard deviation: 3.00256129983
Minimum: 0
Maximum: 36


## Number of Visits in First Week

In [188]:
######################################
#                 10                 #
######################################
def grouped_data(data,key):
    variable=defaultdict(list)
    for data_points in data:
        account_key = data_points[key]
        variable[account_key].append(data_points)
    return variable
engage_by_acc=grouped_data(paid_engagement_in_first_week,'account_key')
#print(engage_by_acc.values())
def sum_group_items(data_p,key_p):
    variable={}
    for account_key, datas in data_p.items():
        total = 0
    
        for data_record in datas:
            total += data_record[key_p]
        variable[account_key] = total  
    return variable
tota_min=sum_group_items(engage_by_acc,'total_minutes_visited')
less_ler=sum_group_items(engage_by_acc,'lessons_completed')
print(tota_min)

## Find the mean, standard deviation, minimum, and maximum for the number of
## days each student visits the classroom during the first week.

{'0': 494.88049616599994, '1': 18.576384666670002, '2': 0.0, '3': 0.0, '4': 33.3214046667, '5': 329.7578566663, '6': 780.4545511666701, '7': 104.20388850009999, '8': 989.113641833, '9': 448.471384167, '10': 130.12347833367, '11': 0.0, '12': 179.4719088333, '13': 1013.3833969996999, '14': 65.6221875, '15': 983.375040335, '16': 119.12030049999998, '17': 235.49969150033, '18': 155.1361575, '19': 0.0, '20': 447.93897783336996, '21': 931.1036911666699, '22': 657.2052335000001, '23': 1591.3228143334, '24': 943.188117167, '25': 0.0, '26': 430.801675833, '27': 1579.12122666663, '28': 766.256315667, '29': 556.1906033333, '30': 69.6578351667, '31': 0.0, '32': 123.2915048333, '33': 253.9870258334, '34': 180.413814, '35': 765.6402170004, '36': 809.2138958339, '37': 1378.195091668, '38': 0.0, '39': 1001.5888595, '40': 478.21952616690004, '41': 511.925391, '42': 576.4643026663, '43': 88.822038, '44': 0.0, '45': 0.0, '46': 26.1189351667, '47': 854.8288881656, '48': 1055.6871896667, '49': 0.0, '50': 4

In [189]:
describe_data(list(tota_min.values()))
describe_data(list(less_ler.values()))

Mean 306.708326753
Standard deviation: 412.996933409
Minimum: 0.0
Maximum: 3564.7332645
Mean 1.63618090452
Standard deviation: 3.00256129983
Minimum: 0
Maximum: 36


In [None]:
## Find the mean, standard deviation, minimum, and maximum for the number of
## days each student visits the classroom during the first week.


## Splitting out Passing Students

In [None]:
######################################
#                 11                 #
######################################

## Create two lists of engagement data for paid students in the first week.
## The first list should contain data for students who eventually pass the
## subway project, and the second list should contain data for students
## who do not.

subway_project_lesson_keys = ['746169184', '3176718735']

passing_engagement =
non_passing_engagement =

## Comparing the Two Student Groups

In [None]:
######################################
#                 12                 #
######################################

## Compute some metrics you're interested in and see how they differ for
## students who pass the subway project vs. students who don't. A good
## starting point would be the metrics we looked at earlier (minutes spent
## in the classroom, lessons completed, and days visited).

## Making Histograms

In [None]:
######################################
#                 13                 #
######################################

## Make histograms of the three metrics we looked at earlier for both
## students who passed the subway project and students who didn't. You
## might also want to make histograms of any other metrics you examined.

## Improving Plots and Sharing Findings

In [None]:
######################################
#                 14                 #
######################################

## Make a more polished version of at least one of your visualizations
## from earlier. Try importing the seaborn library to make the visualization
## look better, adding axis labels and a title, and changing one or more
## arguments to the hist() function.