## 1. CSV file integration in the notebook

In [15]:
import unicodecsv
enrollments = []
f = open('enrollments.csv', 'rb')
try:
    # DictReader is a one-shot iterator, not a list of dictionaries:
    # each row can be consumed only once.
    reader = unicodecsv.DictReader(f)

    # Walk the iterator once and keep every row so the data can be reused later.
    for row in reader:
        enrollments.append(row)
finally:
    # Close the handle even if reading fails, so the file is never left open.
    f.close()
print(enrollments[0])



OrderedDict([('account_key', '448'), ('status', 'canceled'), ('join_date', '2014-11-10'), ('cancel_date', '2015-01-14'), ('days_to_cancel', '65'), ('is_udacity', 'True'), ('is_canceled', 'True')])


In [18]:
# optimised code of the above operations
import unicodecsv
# The context manager closes the file automatically once the rows are read.
with open('enrollments.csv', 'rb') as f:
    # Materialise the one-shot DictReader iterator into a reusable list.
    enrollments = [row for row in unicodecsv.DictReader(f)]
enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', '2014-11-10'),
             ('cancel_date', '2015-01-14'),
             ('days_to_cancel', '65'),
             ('is_udacity', 'True'),
             ('is_canceled', 'True')])

In [40]:
import unicodecsv

# Read the daily engagement table into a list of row dictionaries.
with open('daily_engagement.csv', 'rb') as f:
    daily_engagement = list(unicodecsv.DictReader(f))

# Read the project submissions table the same way.
with open('project_submissions.csv', 'rb') as f:
    project_submissions = list(unicodecsv.DictReader(f))

# Show the first row of each table to inspect its columns.
for first_row in (daily_engagement[0], project_submissions[0]):
    print(first_row)

OrderedDict([('acct', '0'), ('utc_date', '2015-01-09'), ('num_courses_visited', '1.0'), ('total_minutes_visited', '11.6793745'), ('lessons_completed', '0.0'), ('projects_completed', '0.0')])
OrderedDict([('creation_date', '2015-01-14'), ('completion_date', '2015-01-16'), ('assigned_rating', 'UNGRADED'), ('account_key', '256'), ('lesson_key', '3176718735'), ('processing_state', 'EVALUATED')])


## 2. Investigating the data

In [41]:
import unicodecsv

def reading_file(filename):
    """Read a CSV file and return its rows as a list of dictionaries.

    Args:
        filename: Path of the CSV file to open (opened in binary mode).

    Returns:
        A list with one dictionary per row, as produced by
        ``unicodecsv.DictReader``.
    """
    with open(filename, 'rb') as csv_file:
        # Consume the reader while the file is still open.
        rows = [row for row in unicodecsv.DictReader(csv_file)]
    return rows
    
enrollments = reading_file('enrollments.csv')
daily_engagements = reading_file('daily_engagement.csv')
project_submissions = reading_file('project_submissions.csv')

# Report the number of rows in each table (rows, not distinct students).
row_counts = (
    ("total_enrollments : ", enrollments),
    ("total_daily_engagements : ", daily_engagements),
    ("total_submissions : ", project_submissions),
)
for label, rows in row_counts:
    print(label, len(rows))

# Peek at the first engagement row to check its column names.
print(daily_engagements[0])

total_enrollments :  1640
total_daily_engagements :  136240
total_submissions :  3642
OrderedDict([('acct', '0'), ('utc_date', '2015-01-09'), ('num_courses_visited', '1.0'), ('total_minutes_visited', '11.6793745'), ('lessons_completed', '0.0'), ('projects_completed', '0.0')])


In [42]:
# Rename the 'acct' column in daily_engagements to 'account_key', the key name
# already used by the enrollments and project_submissions rows.

for engagement in daily_engagements:
    # Only rename rows that still carry 'acct', so re-running this cell is a
    # no-op instead of raising KeyError on the already-removed key.
    if 'acct' in engagement:
        engagement['account_key'] = engagement.pop('acct')

daily_engagements[0]

OrderedDict([('utc_date', '2015-01-09'),
             ('num_courses_visited', '1.0'),
             ('total_minutes_visited', '11.6793745'),
             ('lessons_completed', '0.0'),
             ('projects_completed', '0.0'),
             ('account_key', '0')])

In [43]:
# A shared helper that collects the set of unique account keys in a table

def get_unique_students(data):
    """Return the set of distinct 'account_key' values found in ``data``.

    Args:
        data: Iterable of mappings, each having an 'account_key' entry.

    Returns:
        A set containing every distinct 'account_key' value.
    """
    return {record['account_key'] for record in data}

In [44]:
# Build the set of distinct account keys for each of the three tables.
unique_enrollments, unique_daily_engagements, unique_project_submissions = (
    get_unique_students(table)
    for table in (enrollments, daily_engagements, project_submissions)
)

In [45]:
#printing number of unique students only

# Each set holds distinct account keys, so its size is the number of unique
# students appearing in that table.
print('unique enrollments : ',len(unique_enrollments))
print('unique daily engagement : ', len(unique_daily_engagements))
print('unique project submissions : ', len(unique_project_submissions))

unique enrollments :  1302
unique daily engagement :  1237
unique project submissions :  743


### Missing engagement records

In [46]:
# Print every enrollment whose account key never appears in the engagement table.
for enrollment in enrollments:
    student = enrollment['account_key']
    if student in unique_daily_engagements:
        continue
    print(enrollment)

OrderedDict([('account_key', '1219'), ('status', 'canceled'), ('join_date', '2014-11-12'), ('cancel_date', '2014-11-12'), ('days_to_cancel', '0'), ('is_udacity', 'False'), ('is_canceled', 'True')])
OrderedDict([('account_key', '871'), ('status', 'canceled'), ('join_date', '2014-11-13'), ('cancel_date', '2014-11-13'), ('days_to_cancel', '0'), ('is_udacity', 'False'), ('is_canceled', 'True')])
OrderedDict([('account_key', '1218'), ('status', 'canceled'), ('join_date', '2014-11-15'), ('cancel_date', '2014-11-15'), ('days_to_cancel', '0'), ('is_udacity', 'False'), ('is_canceled', 'True')])
OrderedDict([('account_key', '654'), ('status', 'canceled'), ('join_date', '2014-12-04'), ('cancel_date', '2014-12-04'), ('days_to_cancel', '0'), ('is_udacity', 'False'), ('is_canceled', 'True')])
OrderedDict([('account_key', '654'), ('status', 'canceled'), ('join_date', '2014-12-04'), ('cancel_date', '2014-12-04'), ('days_to_cancel', '0'), ('is_udacity', 'False'), ('is_canceled', 'True')])
OrderedDict([