# Intro through CSV Quiz

In [1]:
import unicodecsv

# Read every row of the enrollments CSV into a list of dict-like records
# keyed by the column names in the file's header row.
with open('enrollments.csv', 'rb') as enrollments_file:
    enrollments = list(unicodecsv.DictReader(enrollments_file))

# Display the first record to see which columns are available.
enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', '2014-11-10'),
             ('cancel_date', '2015-01-14'),
             ('days_to_cancel', '65'),
             ('is_udacity', 'True'),
             ('is_canceled', 'True')])

In [2]:
import unicodecsv

# Load the daily engagement and project submission tables the same way as
# the enrollments table: one dict-like record per CSV row.
with open('daily_engagement.csv', 'rb') as engagement_file:
    daily_engagement = list(unicodecsv.DictReader(engagement_file))

with open('project_submissions.csv', 'rb') as submissions_file:
    project_submissions = list(unicodecsv.DictReader(submissions_file))

# Show the first record of each table.
print(daily_engagement[0])
print(project_submissions[0])

OrderedDict([('acct', '0'), ('utc_date', '2015-01-09'), ('num_courses_visited', '1.0'), ('total_minutes_visited', '11.6793745'), ('lessons_completed', '0.0'), ('projects_completed', '0.0')])
OrderedDict([('creation_date', '2015-01-14'), ('completion_date', '2015-01-16'), ('assigned_rating', 'UNGRADED'), ('account_key', '256'), ('lesson_key', '3176718735'), ('processing_state', 'EVALUATED')])


In [3]:
# Read the CSVs again, this time with a reusable function
import unicodecsv

def read_csv(filename):
    """Read the CSV file at `filename` and return its rows as a list.

    The file is opened in binary mode and handed to unicodecsv.DictReader,
    so each row is a dict-like record keyed by the header row's column names.
    """
    with open(filename, 'rb') as csv_file:
        return list(unicodecsv.DictReader(csv_file))

# Load all three tables through the shared helper.
enrollments = read_csv('enrollments.csv')
daily_engagement = read_csv('daily_engagement.csv')
project_submissions = read_csv('project_submissions.csv')

# Print the first record of each table, separated by a spacer line.
for table in (enrollments, daily_engagement):
    print(table[0])
    print(" ")
print(project_submissions[0])

OrderedDict([('account_key', '448'), ('status', 'canceled'), ('join_date', '2014-11-10'), ('cancel_date', '2015-01-14'), ('days_to_cancel', '65'), ('is_udacity', 'True'), ('is_canceled', 'True')])
 
OrderedDict([('acct', '0'), ('utc_date', '2015-01-09'), ('num_courses_visited', '1.0'), ('total_minutes_visited', '11.6793745'), ('lessons_completed', '0.0'), ('projects_completed', '0.0')])
 
OrderedDict([('creation_date', '2015-01-14'), ('completion_date', '2015-01-16'), ('assigned_rating', 'UNGRADED'), ('account_key', '256'), ('lesson_key', '3176718735'), ('processing_state', 'EVALUATED')])


# Fixing Data types

### All the values are read in as strings, so we convert every column to its proper data type up front; that way we don't have to deal with it later

In [4]:
from datetime import datetime as dt

def parse_date(date):
    """Convert a 'YYYY-MM-DD' date string into a Python datetime object.

    An empty string means no date was given, in which case None is returned.
    """
    return None if date == '' else dt.strptime(date, '%Y-%m-%d')
    
def parse_maybe_int(i):
    """Convert a string holding an integer into an int.

    An empty string means the value is missing, in which case None is returned.
    """
    return None if i == '' else int(i)

# Convert the enrollments columns from strings to proper Python types,
# updating each record in place.
for enrollment in enrollments:
    # Date columns become datetime objects (or None when empty).
    for date_column in ('join_date', 'cancel_date'):
        enrollment[date_column] = parse_date(enrollment[date_column])
    # Flag columns hold the text 'True'/'False'; compare to get real booleans.
    for flag_column in ('is_udacity', 'is_canceled'):
        enrollment[flag_column] = enrollment[flag_column] == 'True'
    # Empty for students who have not canceled, hence the "maybe" parser.
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])

enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', datetime.datetime(2014, 11, 10, 0, 0)),
             ('cancel_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('days_to_cancel', 65),
             ('is_udacity', True),
             ('is_canceled', True)])

In [5]:
# Convert the engagement columns from strings to proper Python types,
# updating each record in place.
for engagement_record in daily_engagement:
    # Counts are stored as text such as '1.0', so go through float before int.
    for count_column in ('num_courses_visited', 'lessons_completed', 'projects_completed'):
        engagement_record[count_column] = int(float(engagement_record[count_column]))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])

daily_engagement[0]

OrderedDict([('acct', '0'),
             ('utc_date', datetime.datetime(2015, 1, 9, 0, 0)),
             ('num_courses_visited', 1),
             ('total_minutes_visited', 11.6793745),
             ('lessons_completed', 0),
             ('projects_completed', 0)])

In [6]:
# Convert the date columns of the submissions table from strings to
# datetime objects (or None when empty), updating each record in place.
for submission in project_submissions:
    for date_column in ('creation_date', 'completion_date'):
        submission[date_column] = parse_date(submission[date_column])

project_submissions[0]

OrderedDict([('creation_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('completion_date', datetime.datetime(2015, 1, 16, 0, 0)),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

## Questions I could ask this data
#### Does the number of classes correlate with the project completion rate?
#### How many courses are actually completed?
#### How much longer does it take someone to complete a course than estimated?
#### Does the number of courses looked at correlate with the number completed?

# Investigate data quiz

#### Find the number of rows in each file and the number of unique students in each

### Fix data problems

In [37]:
# Make the account column name consistent with the other two tables by
# renaming 'acct' to 'account_key' in every engagement record.
# The membership check keeps this cell safe to re-run: a record that has
# already been renamed no longer has 'acct', and an unguarded pop('acct')
# would raise KeyError.
for engagement_record in daily_engagement:
    if 'acct' in engagement_record:
        engagement_record['account_key'] = engagement_record.pop('acct')

In [42]:
def get_unique_students(data):
    """Return the set of distinct 'account_key' values found in `data`.

    `data` is an iterable of dict-like records, each of which must have an
    'account_key' entry.
    """
    return {record['account_key'] for record in data}

In [26]:
# Number of rows in the ENROLLMENT file (enrollments is a list, one item per row)
row_count = len(enrollments)

print("Number of rows in ENROLLMENT " + str(row_count))

Number of rows in ENROLLMENT 1640


In [43]:
# Distinct account keys appearing in the ENROLLMENT file
student_keys = get_unique_students(enrollments)

print("Unique number of students in ENROLLMENT " + str(len(student_keys)))

Unique number of students in ENROLLMENT 1302


In [31]:
# Number of rows in the DAILY ENGAGEMENT file (daily_engagement is a list, one item per row)
row_count = len(daily_engagement)

print("Number of rows in DAILY ENGAGEMENT " + str(row_count))

Number of rows in DAILY ENGAGEMENT 136240


In [44]:
# Distinct account keys appearing in the DAILY ENGAGEMENT file
student_keys = get_unique_students(daily_engagement)

print("Unique number of students in DAILY ENGAGEMENT " + str(len(student_keys)))

Unique number of students in DAILY ENGAGEMENT 1237


In [33]:
# Number of rows in the PROJECT SUBMISSIONS file (project_submissions is a list, one item per row)
row_count = len(project_submissions)

print("Number of rows in PROJECT SUBMISSIONS " + str(row_count))

Number of rows in PROJECT SUBMISSIONS 3642


In [45]:
# Distinct account keys appearing in the PROJECT SUBMISSIONS file
student_keys = get_unique_students(project_submissions)

# The label uses the same "PROJECT SUBMISSIONS" spelling as the row-count cell.
print("Unique number of students in PROJECT SUBMISSIONS " + str(len(student_keys)))

Unique number of students in PROJECT SUBMISSIOMS 743


In [None]:
# Missing engagement records