### 1. Acquiring Data

In [1]:
# Steps for analyzing data
# 1. Get the data
# 2. Review the data
# 3. Ask questions based on what you found (get as creative as you need to)
# 4. Clean your data

In [3]:
import unicodecsv as csv
from statistics import mean

In [3]:
# long way
# enrollment_data = []
# file = open('enrollments.csv', 'rb')
# data = csv.DictReader(file)
# for i in data:
#     enrollment_data.append(i)
# file.close()

In [4]:
# shorter way
# with open('enrollments.csv', 'rb') as file:
#     data = csv.DictReader(file)
#     enrollment_data = list(data)

In [4]:
# best way
def read_from_csv(filename):
    """Load every data row of a CSV file into a list.

    The file is opened in binary mode and handed to ``csv.DictReader``
    (``csv`` here is the ``unicodecsv`` module imported at the top of the
    notebook). The file is closed before the function returns.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per row produced by the reader.
    """
    with open(filename, 'rb') as csv_file:
        rows = list(csv.DictReader(csv_file))
    return rows

In [5]:
enrollment_data = read_from_csv('enrollments.csv')
engagement_data = read_from_csv('engagements.csv')
submission_data = read_from_csv('submissions.csv')

### 2. Cleaning Data

In [6]:
from datetime import datetime as dt

def convert_to_date(date_string, date_format='%Y-%m-%d'):
    """Parse a date string into a ``datetime``.

    Args:
        date_string: Text to parse. An empty string or ``None`` means
            "no date". A value that is already a ``datetime`` is returned
            unchanged, so executing a conversion cell twice does not fail.
        date_format: ``strptime`` format that ``date_string`` follows;
            defaults to ``'%Y-%m-%d'``.

    Returns:
        The parsed ``datetime``, or ``None`` when there is no date.

    Raises:
        ValueError: If ``date_string`` does not match ``date_format``.
    """
    if date_string is None or date_string == '':
        return None
    # Already converted (e.g. the cell was run a second time): nothing to do.
    if isinstance(date_string, dt):
        return date_string
    return dt.strptime(date_string, date_format)
    
def convert_to_int(int_string):
    """Convert a string to an ``int``, mapping missing values to ``None``.

    Args:
        int_string: Text to convert. An empty string or ``None`` is treated
            as a missing value, so re-converting a field that was already
            set to ``None`` does not raise.

    Returns:
        ``int(int_string)``, or ``None`` for a missing value.

    Raises:
        ValueError: If ``int_string`` is not a valid integer literal.
    """
    if int_string is None or int_string == '':
        return None
    return int(int_string)
    
def convert_to_float(float_string):
    """Convert a string to a ``float``, mapping missing values to ``None``.

    Args:
        float_string: Text to convert. An empty string or ``None`` is
            treated as a missing value, so re-converting a field that was
            already set to ``None`` does not raise.

    Returns:
        ``float(float_string)``, or ``None`` for a missing value.

    Raises:
        ValueError: If ``float_string`` is not a valid float literal.
    """
    if float_string is None or float_string == '':
        return None
    return float(float_string)
    
def convert_to_bool(bool_string):
    """Convert a ``'TRUE'`` / ``'FALSE'`` string to a ``bool``.

    Args:
        bool_string: Text to convert. Matching ignores case and surrounding
            whitespace. An empty (or blank) string or ``None`` is treated as
            a missing value. A value that is already a ``bool`` is returned
            unchanged, so executing a conversion cell twice does not turn
            every flag into ``None``.

    Returns:
        ``True``, ``False``, or ``None`` for a missing value.

    Raises:
        ValueError: If the value is neither blank nor a recognised boolean
            word. Previously such values silently became ``None``.
    """
    if bool_string is None or isinstance(bool_string, bool):
        return bool_string
    normalized = str(bool_string).strip().upper()
    if normalized == '':
        return None
    if normalized == 'TRUE':
        return True
    if normalized == 'FALSE':
        return False
    raise ValueError(f"Cannot interpret {bool_string!r} as a boolean")

##### Fix data types

In [7]:
# Replace the raw string fields of every enrollment record with typed values.
# The (field, converter) pairs are applied in the order listed.
for enrollment in enrollment_data:
    for field, converter in (
        ('join_date', convert_to_date),
        ('cancel_date', convert_to_date),
        ('days_to_cancel', convert_to_int),
        ('is_enrolled', convert_to_bool),
        ('is_canceled', convert_to_bool),
    ):
        enrollment[field] = converter(enrollment[field])

In [8]:
# Spot-check the first enrollment record after the type conversion.
enrollment_data[0]

{'account_key': '448',
 'status': 'canceled',
 'join_date': datetime.datetime(2017, 11, 10, 0, 0),
 'cancel_date': datetime.datetime(2018, 1, 14, 0, 0),
 'days_to_cancel': 65,
 'is_enrolled': True,
 'is_canceled': True}

In [9]:
# Converter to apply to each engagement field, in the order listed.
engagement_converters = {
    'utc_date': convert_to_date,
    'num_courses_visited': convert_to_int,
    'total_minutes_visited': convert_to_float,
    'lessons_completed': convert_to_float,
    'projects_completed': convert_to_float,
}

# Replace the raw string fields of every engagement record with typed values.
for engagement in engagement_data:
    for column, convert in engagement_converters.items():
        engagement[column] = convert(engagement[column])

In [10]:
# Spot-check the first engagement record after the type conversion.
engagement_data[0]

{'acct': '0',
 'utc_date': datetime.datetime(2018, 1, 9, 0, 0),
 'num_courses_visited': 1,
 'total_minutes_visited': 11.6793745,
 'lessons_completed': 0.0,
 'projects_completed': 0.0}

In [11]:
# Parse both date columns of every submission record; the remaining fields
# are left as the strings read from the CSV.
for submission in submission_data:
    for date_field in ('creation_date', 'completion_date'):
        submission[date_field] = convert_to_date(submission[date_field])

In [12]:
# Spot-check the first submission record after the date conversion.
submission_data[0]

{'creation_date': datetime.datetime(2018, 1, 14, 0, 0),
 'completion_date': datetime.datetime(2018, 1, 16, 0, 0),
 'assigned_rating': 'UNGRADED',
 'account_key': '256',
 'lesson_key': '3176718735',
 'processing_state': 'EVALUATED'}

In [13]:
# Rename 'acct' to 'account_key' so engagement records use the same key name
# as the enrollment and submission records.
for engagement in engagement_data:
    # Guarded so that running this cell a second time is a no-op instead of
    # raising KeyError on the already-removed 'acct' key.
    if 'acct' in engagement:
        engagement['account_key'] = engagement.pop('acct')

In [14]:
# Rename the 'is_enrolled' field of every enrollment record to 'is_student'.
for enrollment in enrollment_data:
    # Guarded so that running this cell a second time is a no-op instead of
    # raising KeyError on the already-removed 'is_enrolled' key.
    if 'is_enrolled' in enrollment:
        enrollment['is_student'] = enrollment.pop('is_enrolled')

In [15]:
# Confirm the rename: the record should now carry 'account_key' instead of 'acct'.
engagement_data[0]

{'utc_date': datetime.datetime(2018, 1, 9, 0, 0),
 'num_courses_visited': 1,
 'total_minutes_visited': 11.6793745,
 'lessons_completed': 0.0,
 'projects_completed': 0.0,
 'account_key': '0'}

In [16]:
# Total number of rows held in each list, before any de-duplication or
# filtering of accounts.
print(f"Enrollment records: {len(enrollment_data)}")
print(f"Engagement records: {len(engagement_data)}")
print(f"Submissions records: {len(submission_data)}")

def get_unique_records(data, column_name):
    """Collect the distinct values stored under one key across all records.

    Args:
        data: Iterable of dict-like records.
        column_name: Key to look up in every record.

    Returns:
        A set containing each value found under ``column_name`` once.

    Raises:
        KeyError: If a dict record has no ``column_name`` entry.
    """
    return {record[column_name] for record in data}


print()

# Distinct 'account_key' values present in each table. The printed labels say
# "records", but each count is the number of different account keys.
unique_enrollment_data, unique_engagement_data, unique_submission_data = (
    get_unique_records(table, 'account_key')
    for table in (enrollment_data, engagement_data, submission_data)
)

print(f"Unique enrollment records: {len(unique_enrollment_data)}")
print(f"Unique engagement records: {len(unique_engagement_data)}")
print(f"Unique submissions records: {len(unique_submission_data)}")

Enrollment records: 1640
Engagement records: 136240
Submissions records: 3642

Unique enrollment records: 1302
Unique engagement records: 1237
Unique submissions records: 743


In [17]:
# Print and count enrollments whose account key never appears in the
# engagement table and whose join and cancel dates differ.
outliers = 0
for enrollment in enrollment_data:
    never_engaged = enrollment['account_key'] not in unique_engagement_data
    dates_differ = enrollment['join_date'] != enrollment['cancel_date']
    if never_engaged and dates_differ:
        print(enrollment, end='\n\n')
        outliers += 1
print(outliers)

{'account_key': '1304', 'status': 'canceled', 'join_date': datetime.datetime(2018, 1, 10, 0, 0), 'cancel_date': datetime.datetime(2018, 3, 10, 0, 0), 'days_to_cancel': 59, 'is_canceled': True, 'is_student': True}

{'account_key': '1304', 'status': 'canceled', 'join_date': datetime.datetime(2018, 3, 10, 0, 0), 'cancel_date': datetime.datetime(2018, 6, 17, 0, 0), 'days_to_cancel': 99, 'is_canceled': True, 'is_student': True}

{'account_key': '1101', 'status': 'current', 'join_date': datetime.datetime(2018, 2, 25, 0, 0), 'cancel_date': None, 'days_to_cancel': None, 'is_canceled': False, 'is_student': True}

3


In [18]:
# Check for possible test accounts.
# Every account key that has at least one enrollment with a truthy
# 'is_student' value is collected. In this data the flag is True for the
# questionable-looking accounts and False for the legitimate ones.
# NOTE(review): 'is_student' is the renamed 'is_enrolled' column; confirm
# that a True value really marks a test account.
test_accounts = {
    enrollment['account_key']
    for enrollment in enrollment_data
    if enrollment['is_student']
}
print(f"{len(test_accounts)} possible test accounts were found.")

6 possible test accounts were found.


In [19]:
def remove_test_accounts(data, accounts=None):
    """Return the records that do not belong to an excluded account.

    Args:
        data: Iterable of dict-like records, each with an 'account_key' entry.
        accounts: Collection of account keys to exclude. When omitted, the
            notebook-level ``test_accounts`` set is used, which is the
            behaviour callers relied on before this parameter existed.

    Returns:
        A new list holding, in their original order, the records whose
        'account_key' is not in ``accounts``. ``data`` itself is not modified.

    Raises:
        KeyError: If a dict record has no 'account_key' entry.
    """
    if accounts is None:
        # Looked up at call time so the function follows the current value
        # of the notebook-level set.
        accounts = test_accounts
    return [record for record in data if record['account_key'] not in accounts]

In [20]:
# Drop the records of suspected test accounts from all three tables.
true_enrollments, true_engagements, true_submissions = (
    remove_test_accounts(table)
    for table in (enrollment_data, engagement_data, submission_data)
)

print(f"True enrollment records: {len(true_enrollments)}")
print(f"True engagement records: {len(true_engagements)}")
print(f"True submissions records: {len(true_submissions)}")

True enrollment records: 1622
True engagement records: 135656
True submissions records: 3634


In [21]:
# What was the average number of days to cancel?
# Enrollments without a 'days_to_cancel' value are left out of the average.
# NOTE(review): this averages over enrollment_data, which still includes the
# accounts collected in test_accounts; confirm whether true_enrollments was
# intended here.
recorded_days = [
    enrollment['days_to_cancel']
    for enrollment in enrollment_data
    if enrollment['days_to_cancel'] is not None
]
days_to_cancel = mean(recorded_days)
print(f"It took students about {round(days_to_cancel, 1)} days to cancel their subscription.")

It took students about 43.9 days to cancel their subscription.


#### Find all quality accounts where:
##### 1. 'is_canceled' is False
##### 2. 'days_to_cancel' is greater than 7
##### 3. for account keys that show up multiple times, retrieve only the one record with the most recent 'join_date'

### 4. Finalizing/Summarizing Data

### 3. Answering Questions w/ Data