In [2]:
import ast
import datetime
import json
import sqlite3

import numpy as np
import pandas as pd

In [6]:
# Connect to the source SQLite3 database.
# A plain connect() silently creates a brand-new empty database when the file is
# missing, which only surfaces later as a confusing "no such table" error.
# Opening through a URI with mode=rw refuses to create the file, so a wrong path
# fails right here with OperationalError.
con = sqlite3.connect('file:dev/cademycode.db?mode=rw', uri=True)
cur = con.cursor()

# Determine the table names present in the database
table_list = cur.execute("SELECT name FROM sqlite_master WHERE type = 'table'").fetchall()
print(table_list)

OperationalError: unable to open database file

In [12]:
#reading tables as dataframes
cade_students = pd.read_sql_query('SELECT * FROM cademycode_students', con)
cade_courses = pd.read_sql_query('SELECT * FROM cademycode_courses', con)
cade_student_jobs = pd.read_sql_query('SELECT * FROM cademycode_student_jobs', con)

DatabaseError: Execution failed on sql 'SELECT * FROM cademycode_students': no such table: cademycode_students

In [None]:
# Row count of each loaded table; every label reports the length of its own frame
print('cade_students', len(cade_students))
print('cade_courses', len(cade_courses))
print('cade_student_jobs', len(cade_student_jobs))

In [None]:
#working with student table
#preview the first five rows
cade_students.head(5)

In [None]:
#examining columns and checking for any null values
cade_students.info()
#observations from the .info() output:
#contact_info is a dictionary and will need to be exploded into separate columns
#each row has a UUID, which means one student can only be one row
#none of the numerical columns are coming in as floats or integers
#missing data in job_id, num_course_taken, current_career_path_id and time_spent_hrs

In [None]:
def extract_contact_info(contact_info):
    """Split one raw ``contact_info`` value into mailing address and email.

    Parameters
    ----------
    contact_info : str
        Serialized mapping expected to carry the keys ``mailing_address`` and
        ``email``. Strict JSON is tried first; a Python-literal dict
        (e.g. single-quoted keys and values) is accepted as a fallback.

    Returns
    -------
    pandas.Series
        Two positional elements, ``[mailing_address, email]``. Both are
        ``None`` when the value is not a string, cannot be parsed, or does not
        decode to a dict. A key absent from the dict yields ``None`` for that
        element.
    """
    # Missing cells (None / NaN) are not strings and have nothing to parse.
    if not isinstance(contact_info, str):
        return pd.Series([None, None])

    try:
        info = json.loads(contact_info)
    except json.JSONDecodeError:
        # Not strict JSON. Swapping quote characters by hand would corrupt values
        # that contain an apostrophe, so parse it as a Python literal instead.
        # literal_eval only evaluates literals and never executes code.
        try:
            info = ast.literal_eval(contact_info)
        except (ValueError, SyntaxError, TypeError):
            return pd.Series([None, None])

    # A valid document that is not a mapping (a list, a number, ...) has no keys to read.
    if not isinstance(info, dict):
        return pd.Series([None, None])

    return pd.Series([info.get('mailing_address'), info.get('email')])

In [None]:
# Parse every contact_info value, then store the two resulting columns on the frame
contact_columns = cade_students['contact_info'].apply(extract_contact_info)
cade_students[['mailing_address', 'email']] = contact_columns

In [None]:
# The raw contact_info column is no longer needed once its fields are extracted.
# Rebinding the result instead of using inplace=True avoids mutating a frame
# that earlier cells have already displayed.
cade_students = cade_students.drop(columns=['contact_info'])

In [None]:
#preview the first ten rows after the contact_info changes
cade_students.head(10)

In [None]:
# Split mailing_address into address, city, state and zip_code.
# NOTE(review): assumes every address has exactly four comma-separated parts;
# otherwise the column assignment below raises a length mismatch.
split_mailing_address = cade_students['mailing_address'].str.split(',', expand=True)
split_mailing_address.columns = ['address', 'city', 'state', 'zip_code']
# Splitting on ',' alone keeps any blank that follows a comma attached to the
# next piece, so trim surrounding whitespace from every part.
split_mailing_address = split_mailing_address.apply(lambda part: part.str.strip())
cade_students = pd.concat([cade_students.drop('mailing_address', axis=1), split_mailing_address], axis=1)

In [None]:
# Cast the columns that should be numeric to float
numeric_cols = ['job_id', 'num_course_taken', 'current_career_path_id', 'time_spent_hrs']
for col in numeric_cols:
    cade_students[col] = cade_students[col].astype(float)

# Confirm the dtypes changed as intended
cade_students.info()

In [None]:
# Isolate the rows whose num_course_taken is missing
courses_taken_is_null = cade_students['num_course_taken'].isnull()
missing_courses_taken = cade_students[courses_taken_is_null]
display(missing_courses_taken)

In [None]:
# Store the rows with missing num_course_taken in a separate table for future use.
# Copying the frame directly avoids concatenating onto an empty DataFrame, whose
# dtype handling is deprecated in recent pandas versions.
missing_data = missing_courses_taken.copy()
# Keep only the students that have a num_course_taken value
cade_students = cade_students.dropna(subset=['num_course_taken'])

In [None]:
#re-check dtypes and non-null counts after dropping rows with missing num_course_taken
cade_students.info()

In [None]:
# Isolate the rows whose job_id is missing
job_id_is_null = cade_students['job_id'].isnull()
missing_job = cade_students[job_id_is_null]
display(missing_job)

In [None]:
# Add the job-less rows to the missing-data table, then keep only students with a job_id
missing_data = pd.concat(objs=[missing_data, missing_job])
has_job_id = cade_students['job_id'].notnull()
cade_students = cade_students[has_job_id]

In [None]:
# Isolate the rows whose current_career_path_id is missing
career_path_is_null = cade_students['current_career_path_id'].isnull()
missing_career = cade_students[career_path_is_null]
display(missing_career)

In [None]:
#dtypes and non-null counts for the rows without a career path
missing_career.info()

In [None]:
# Use id 0 to indicate "no career path" and treat missing time spent as zero hours
cade_students['current_career_path_id'] = cade_students['current_career_path_id'].fillna(0)
cade_students['time_spent_hrs'] = cade_students['time_spent_hrs'].fillna(0)

In [None]:
#re-check dtypes and non-null counts after filling the career path and time spent columns
cade_students.info()

In [None]:
# Remove duplicate rows from cade_student_jobs.
# drop_duplicates() returns a new frame rather than modifying the original, so
# the result has to be assigned back for the duplicates to actually be removed.
cade_student_jobs = cade_student_jobs.drop_duplicates()
# Last expression of the cell, so the de-duplicated table is displayed
cade_student_jobs.head(15)

In [None]:
#preview the first ten rows of the courses table
cade_courses.head(10)

In [None]:
#count how often each distinct row occurs; a count above 1 means the row is duplicated
cade_student_jobs.value_counts()

In [None]:
#count how often each distinct row occurs; a count above 1 means the row is duplicated
cade_courses.value_counts()

In [None]:
# Add a placeholder course (career_path_id 0) so that students without a chosen
# career path still match a course row when joined, instead of producing nulls.
undecided = {'career_path_id': 0,
             'career_path_name': 'undecided',
             'hours_to_complete': 0}
# Only append when the placeholder is absent: re-running this cell must not add
# a second id-0 row, which would duplicate students in a later join.
if not (cade_courses['career_path_id'] == 0).any():
    cade_courses.loc[len(cade_courses)] = undecided


In [None]:
#show the first eleven rows; presumably enough to include the appended 'undecided' row (confirm table length)
cade_courses.head(11)


In [None]:
# Safety net: fill any nulls still left in the numeric student columns.
# Earlier cells already drop or fill these values, so this normally changes nothing.
cade_students['job_id'] = cade_students['job_id'].fillna(0)
cade_students['num_course_taken'] = cade_students['num_course_taken'].fillna(0)

# current_career_path_id is an identifier, so a median of it is meaningless and
# could assign students to an arbitrary career path. Use 0, the id of the
# 'undecided' placeholder, with zero hours spent, matching the earlier cell that
# maps missing career paths to 0.
cade_students['current_career_path_id'] = cade_students['current_career_path_id'].fillna(0)
cade_students['time_spent_hrs'] = cade_students['time_spent_hrs'].fillna(0)

In [None]:
#inspecting null values after filling: number of nulls remaining in each column
cade_students.isnull().sum()

In [None]:
# Join students to their job details and career-path details.
# A left join emits one output row per matching right-hand row, so a repeated
# row in a lookup table would duplicate students. De-duplicate both lookups
# here so the join is safe regardless of what earlier cells did.
jobs_lookup = cade_student_jobs.drop_duplicates()
courses_lookup = cade_courses.drop_duplicates()
merge_clean_cade = pd.merge(cade_students, jobs_lookup, how='left', on='job_id')
cleaned_cade = pd.merge(merge_clean_cade, courses_lookup, how='left',
                        left_on='current_career_path_id', right_on='career_path_id')


In [None]:
# Make sure all columns were merged and that no values are missing
cleaned_cade.info()
# Every source table is already loaded into a DataFrame, so release the source
# database connection instead of leaving it open for the rest of the session.
con.close()

In [None]:
# creating the output CSV with cleaned data
# Open (or create) the output database and store the cleaned data as table cade_concat
sqlite_con = sqlite3.connect('cleaned_cade.db')
cleaned_cade.to_sql(name='cade_concat', con=sqlite_con, if_exists='replace', index=False)

In [None]:
# Read the stored table back from the output database
cade_data = pd.read_sql_query(sql='SELECT * FROM cade_concat', con=sqlite_con)

In [None]:
#dtypes and non-null counts of the table read back from the output database
cade_data.info()

In [None]:
# Store the set-aside rows as table missing_info in the output database
missing_data.to_sql(name='missing_info', con=sqlite_con, if_exists='replace', index=False)

In [None]:
# Read the missing_info table back from the output database
missing_data = pd.read_sql_query(sql='SELECT * FROM missing_info', con=sqlite_con)

In [None]:
# Both output tables have been written and read back into DataFrames, so release
# the output database connection instead of leaving it open.
sqlite_con.close()

In [None]:
# Export the cleaned data to CSV.
# index=False keeps the DataFrame's positional index out of the file, where it
# would appear as an extra unnamed first column, and matches the to_sql exports.
cade_data.to_csv('cleaned_cade.csv', index=False)