In [11]:
import sqlite3
import pandas as pd
import numpy as np
import json
import datetime

In [12]:
#Connecting to SQLite3 Database
con = sqlite3.connect('cademycode.db')
cur = con.cursor()

#determing table names
table_list = [a for a in cur.execute("SELECT name FROM sqlite_master WHERE type = 'table'")]
print(table_list)

[]


In [9]:
#reading tables as dataframes
cade_students = pd.read_sql_query('SELECT * FROM cademycode_students', con)
cade_courses = pd.read_sql_query('SELECT * FROM cademycode_courses', con)
cade_student_jobs = pd.read_sql_query('SELECT * FROM cademycode_student_jobs', con)

DatabaseError: Execution failed on sql 'SELECT * FROM cademycode_student': no such table: cademycode_student

In [None]:
print('cade_students', len(cade_students))
print('cade_courses', len(cade_students))
print('cade_student_jobs', len(cade_students))

cade_students 5000
cade_courses 5000
cade_student_jobs 5000


In [None]:
#working with student table
cade_students.head(5)

Unnamed: 0,uuid,name,dob,sex,contact_info,job_id,num_course_taken,current_career_path_id,time_spent_hrs
0,1,Annabelle Avery,1943-07-03,F,"{""mailing_address"": ""303 N Timber Key, Irondal...",7.0,6.0,1.0,4.99
1,2,Micah Rubio,1991-02-07,M,"{""mailing_address"": ""767 Crescent Fair, Shoals...",7.0,5.0,8.0,4.4
2,3,Hosea Dale,1989-12-07,M,"{""mailing_address"": ""P.O. Box 41269, St. Bonav...",7.0,8.0,8.0,6.74
3,4,Mariann Kirk,1988-07-31,F,"{""mailing_address"": ""517 SE Wintergreen Isle, ...",6.0,7.0,9.0,12.31
4,5,Lucio Alexander,1963-08-31,M,"{""mailing_address"": ""18 Cinder Cliff, Doyles b...",7.0,14.0,3.0,5.64


In [None]:
#examining columns and checking for any null values
cade_students.info()
#contact info is a dictionary will need to explode into seperate columns
#each row has a UUID whcih means one student can only be one row
#none of the numerical columns are coming in as floats or integers
#missing data in job_id, num_course_taken, current_career path_id and time_spent_hours

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   uuid                    5000 non-null   int64 
 1   name                    5000 non-null   object
 2   dob                     5000 non-null   object
 3   sex                     5000 non-null   object
 4   contact_info            5000 non-null   object
 5   job_id                  4995 non-null   object
 6   num_course_taken        4749 non-null   object
 7   current_career_path_id  4529 non-null   object
 8   time_spent_hrs          4529 non-null   object
dtypes: int64(1), object(8)
memory usage: 351.7+ KB


In [None]:
def extract_contact_info(contact_info):
    try:
        info = json.loads(contact_info.replace("'", "'"))
        return pd.Series([info.get('mailing_address'), info.get('email')])
    except json.JSONDecodeError:
        return pd.Series([None, None])

In [None]:
cade_students[['mailing_address', 'email']] = cade_students['contact_info'].apply(extract_contact_info)

In [None]:
cade_students.drop(columns=['contact_info'], inplace=True)

In [None]:
cade_students.head(10)

Unnamed: 0,uuid,name,dob,sex,job_id,num_course_taken,current_career_path_id,time_spent_hrs,mailing_address,email
0,1,Annabelle Avery,1943-07-03,F,7.0,6.0,1.0,4.99,"303 N Timber Key, Irondale, Wisconsin, 84736",annabelle_avery9376@woohoo.com
1,2,Micah Rubio,1991-02-07,M,7.0,5.0,8.0,4.4,"767 Crescent Fair, Shoals, Indiana, 37439",rubio6772@hmail.com
2,3,Hosea Dale,1989-12-07,M,7.0,8.0,8.0,6.74,"P.O. Box 41269, St. Bonaventure, Virginia, 83637",hosea_dale8084@coldmail.com
3,4,Mariann Kirk,1988-07-31,F,6.0,7.0,9.0,12.31,"517 SE Wintergreen Isle, Lane, Arkansas, 82242",kirk4005@hmail.com
4,5,Lucio Alexander,1963-08-31,M,7.0,14.0,3.0,5.64,"18 Cinder Cliff, Doyles borough, Rhode Island,...",alexander9810@hmail.com
5,6,Shavonda Mcmahon,1989-10-15,F,6.0,10.0,3.0,10.12,"P.O. Box 81591, Tarpon Springs, Montana, 37057",shavonda5863@coldmail.com
6,7,Terrell Bleijenberg,1959-05-05,M,2.0,9.0,8.0,24.17,"P.O. Box 53471, Oskaloosa, Virginia, 85274",bleijenberg188@hmail.com
7,8,Stanford Allan,1997-11-22,M,3.0,3.0,1.0,19.54,"255 Spring Avenue, Point Baker, Texas, 15796",stanford_allan8055@coldmail.com
8,9,Tricia Delacruz,1961-10-20,F,1.0,6.0,9.0,1.75,"997 Dewy Apple, Lake Lindsey, Washington, 78266",tricia_delacruz6622@woohoo.com
9,10,Regenia van der Helm,1999-02-23,N,5.0,7.0,6.0,13.55,"220 Middle Ridge, Falcon Heights, New Mexico, ...",regenia6908@inlook.com


In [None]:
#splitting mailing_address to address, city and state
split_mailing_address = cade_students.mailing_address.str.split(',', expand=True)
split_mailing_address.columns = ['address', 'city', 'state', 'zip_code']
cade_students = pd.concat([cade_students.drop('mailing_address', axis=1), split_mailing_address], axis=1)

In [None]:
#converting datatypes for dob, job_id, num_course_taken, current_career_path, time_spent_hrs
cade_students['job_id'] = cade_students['job_id'].astype(float)
cade_students['num_course_taken'] = cade_students['num_course_taken'].astype(float)
cade_students['current_career_path_id'] = cade_students['current_career_path_id'].astype(float)
cade_students['time_spent_hrs'] = cade_students['time_spent_hrs'].astype(float)

#checking if changes are correct
cade_students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uuid                    5000 non-null   int64  
 1   name                    5000 non-null   object 
 2   dob                     5000 non-null   object 
 3   sex                     5000 non-null   object 
 4   job_id                  4995 non-null   float64
 5   num_course_taken        4749 non-null   float64
 6   current_career_path_id  4529 non-null   float64
 7   time_spent_hrs          4529 non-null   float64
 8   email                   5000 non-null   object 
 9   address                 5000 non-null   object 
 10  city                    5000 non-null   object 
 11  state                   5000 non-null   object 
 12  zip_code                5000 non-null   object 
dtypes: float64(4), int64(1), object(8)
memory usage: 507.9+ KB


In [None]:
#handling missing data for num_course_take 
missing_courses_taken = cade_students[cade_students[['num_course_taken']].isnull().any(axis=1)]
display(missing_courses_taken)

Unnamed: 0,uuid,name,dob,sex,job_id,num_course_taken,current_career_path_id,time_spent_hrs,email,address,city,state,zip_code
25,26,Doug Browning,1970-06-08,M,7.0,,5.0,1.92,doug7761@inlook.com,P.O. Box 15845,Devine,Florida,23097
26,27,Damon Schrauwen,1953-10-31,M,4.0,,10.0,3.73,damon9864@woohoo.com,P.O. Box 84659,Maben,Georgia,66137
51,52,Alisa Neil,1977-05-28,F,5.0,,8.0,22.86,alisa9616@inlook.com,16 View Annex,Mosses,North Dakota,25748
70,71,Chauncey Hooper,1962-04-07,M,3.0,,3.0,3.97,chauncey6352@woohoo.com,955 Dewy Flat,Slaughterville,South Carolina,22167
80,81,Ellyn van Heest,1984-06-28,F,3.0,,10.0,12.39,ellyn_vanheest8375@hmail.com,872 Cider Glade,Chicken,Delaware,42689
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4889,4890,Tegan Cochran,1970-11-08,F,5.0,,8.0,22.75,tegan130@inlook.com,106 Sunny Nook,Vernal,Georgia,10769
4898,4899,Ruthann Oliver,1998-05-22,F,3.0,,7.0,21.27,ruthann1124@woohoo.com,644 Merry Island,Green Valley,Wyoming,91273
4914,4915,Ernest Holmes,1995-03-11,M,7.0,,9.0,26.50,ernest_holmes505@hmail.com,872 Wintergreen Harbor,Gallitzin borough,Maine,50103
4980,4981,Brice Franklin,1946-12-01,M,4.0,,5.0,8.66,brice9741@coldmail.com,947 Panda Way,New Bedford village,Vermont,31232


In [None]:
#will store the missing data in a seperate table for future use
missing_data = pd.DataFrame()
missing_data = pd.concat([missing_data, missing_courses_taken])
cade_students = cade_students.dropna(subset=['num_course_taken'])

In [None]:
cade_students.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4749 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uuid                    4749 non-null   int64  
 1   name                    4749 non-null   object 
 2   dob                     4749 non-null   object 
 3   sex                     4749 non-null   object 
 4   job_id                  4744 non-null   float64
 5   num_course_taken        4749 non-null   float64
 6   current_career_path_id  4298 non-null   float64
 7   time_spent_hrs          4298 non-null   float64
 8   email                   4749 non-null   object 
 9   address                 4749 non-null   object 
 10  city                    4749 non-null   object 
 11  state                   4749 non-null   object 
 12  zip_code                4749 non-null   object 
dtypes: float64(4), int64(1), object(8)
memory usage: 519.4+ KB


In [None]:
missing_job = cade_students[cade_students[['job_id']].isnull().any(axis=1)]
display(missing_job)

Unnamed: 0,uuid,name,dob,sex,job_id,num_course_taken,current_career_path_id,time_spent_hrs,email,address,city,state,zip_code
162,163,Glen Riley,2002-08-22,M,,8.0,3.0,5.7,glen_riley4484@hmail.com,P.O. Box 37267,Cornlea village,Tennessee,19192
757,758,Mercedez Vorberg,2002-03-25,F,,15.0,4.0,4.14,mercedez6297@woohoo.com,284 Cedar Seventh,Virden village,Washington,60489
854,855,Kurt Ho,2002-05-29,M,,0.0,8.0,23.72,ho6107@inlook.com,P.O. Box 27254,Olin,New Hampshire,60067
1029,1030,Penny Gaines,2002-03-01,N,,15.0,4.0,16.25,gaines2897@hmail.com,138 Misty Vale,Stockton borough,West Virginia,53630
1542,1543,Frederick Reilly,2002-11-13,M,,7.0,9.0,21.32,frederick_reilly6971@woohoo.com,P.O. Box 40769,Quakervillage,Maryland,96218


In [None]:
missing_data = pd.concat([missing_data, missing_job])
cade_students = cade_students.dropna(subset=['job_id'])

In [None]:
#current_career_path missing data
missing_career = cade_students[cade_students[['current_career_path_id']].isnull().any(axis=1)]
display(missing_career)

Unnamed: 0,uuid,name,dob,sex,job_id,num_course_taken,current_career_path_id,time_spent_hrs,email,address,city,state,zip_code
15,16,Norene Dalton,1976-04-30,F,6.0,0.0,,,norene_dalton9509@hmail.com,130 Wishing Essex,Branch,Ohio,13616
19,20,Sofia van Steenbergen,1990-02-21,N,7.0,13.0,,,vansteenbergen8482@inlook.com,634 Clear Barn Dell,Beaman,Georgia,33288
30,31,Christoper Warner,1989-12-28,M,2.0,5.0,,,warner5906@coldmail.com,556 Stony Highlands,Drain,Illinois,01973
49,50,Antony Horne,1996-05-29,M,3.0,2.0,,,antony577@coldmail.com,P.O. Box 78685,Lenox,Texas,15516
54,55,Omar Bunk,1955-11-08,M,3.0,14.0,,,omar1245@coldmail.com,445 Dale Hollow,Vermont village,South Carolina,28329
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4904,4905,Eduardo Daniel,2004-06-18,M,8.0,12.0,,,daniel5073@inlook.com,598 Deer Trace,Forest Grove,North Carolina,45038
4922,4923,Francisco van Ede,1961-04-26,M,7.0,5.0,,,vanede7845@coldmail.com,282 Fourth Trace,Carter Lake,Ohio,63511
4948,4949,Dewitt van Malsem,1949-03-08,M,4.0,7.0,,,dewitt4635@inlook.com,423 Course Trail,Wilmot,Hawaii,18996
4956,4957,Todd Stamhuis,1961-06-15,M,7.0,8.0,,,todd8019@woohoo.com,251 Grand Rose Underpass,Niagara,Pennsylvania,03430


In [None]:
missing_career.info()

<class 'pandas.core.frame.DataFrame'>
Index: 451 entries, 15 to 4974
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uuid                    451 non-null    int64  
 1   name                    451 non-null    object 
 2   dob                     451 non-null    object 
 3   sex                     451 non-null    object 
 4   job_id                  451 non-null    float64
 5   num_course_taken        451 non-null    float64
 6   current_career_path_id  0 non-null      float64
 7   time_spent_hrs          0 non-null      float64
 8   email                   451 non-null    object 
 9   address                 451 non-null    object 
 10  city                    451 non-null    object 
 11  state                   451 non-null    object 
 12  zip_code                451 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 49.3+ KB


In [None]:
#creating new id to incicate no career and set its time spent to zero
cade_students['current_career_path_id'] = np.where(
    cade_students['current_career_path_id'].isnull(), 0, cade_students['current_career_path_id'])
cade_students['time_spent_hrs'] = np.where(cade_students['time_spent_hrs'].isnull(), 0, cade_students['time_spent_hrs'])

In [None]:
cade_students.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4744 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uuid                    4744 non-null   int64  
 1   name                    4744 non-null   object 
 2   dob                     4744 non-null   object 
 3   sex                     4744 non-null   object 
 4   job_id                  4744 non-null   float64
 5   num_course_taken        4744 non-null   float64
 6   current_career_path_id  4744 non-null   float64
 7   time_spent_hrs          4744 non-null   float64
 8   email                   4744 non-null   object 
 9   address                 4744 non-null   object 
 10  city                    4744 non-null   object 
 11  state                   4744 non-null   object 
 12  zip_code                4744 non-null   object 
dtypes: float64(4), int64(1), object(8)
memory usage: 518.9+ KB


In [None]:
#removing duplicate rows from cade_student_jobs
cade_student_jobs.head(15)
cade_student_jobs.drop_duplicates()

Unnamed: 0,job_id,job_category,avg_salary
0,1,analytics,86000
1,2,engineer,101000
2,3,software developer,110000
3,4,creative,66000
4,5,financial services,135000
5,6,education,61000
6,7,HR,80000
7,8,student,10000
8,9,healthcare,120000
9,0,other,80000


In [None]:
cade_courses.head(10)

Unnamed: 0,career_path_id,career_path_name,hours_to_complete
0,1,data scientist,20
1,2,data engineer,20
2,3,data analyst,12
3,4,software engineering,25
4,5,backend engineer,18
5,6,frontend engineer,20
6,7,iOS developer,27
7,8,android developer,27
8,9,machine learning engineer,35
9,10,ux/ui designer,15


In [None]:
cade_student_jobs.value_counts()

job_id  job_category        avg_salary
3       software developer  110000        2
4       creative            66000         2
5       financial services  135000        2
0       other               80000         1
1       analytics           86000         1
2       engineer            101000        1
6       education           61000         1
7       HR                  80000         1
8       student             10000         1
9       healthcare          120000        1
Name: count, dtype: int64

In [None]:
cade_courses.value_counts()

career_path_id  career_path_name           hours_to_complete
1               data scientist             20                   1
2               data engineer              20                   1
3               data analyst               12                   1
4               software engineering       25                   1
5               backend engineer           18                   1
6               frontend engineer          20                   1
7               iOS developer              27                   1
8               android developer          27                   1
9               machine learning engineer  35                   1
10              ux/ui designer             15                   1
Name: count, dtype: int64

In [None]:
#adding row to courses for students that havent decided to not have null values
undecided = {'career_path_id': 0,
             'career_path_name': 'undecided',
             'hours_to_complete':0}
cade_courses.loc[len(cade_courses)] = undecided


In [None]:
cade_courses.head(11)


Unnamed: 0,career_path_id,career_path_name,hours_to_complete
0,1,data scientist,20
1,2,data engineer,20
2,3,data analyst,12
3,4,software engineering,25
4,5,backend engineer,18
5,6,frontend engineer,20
6,7,iOS developer,27
7,8,android developer,27
8,9,machine learning engineer,35
9,10,ux/ui designer,15


In [None]:
# #filling in null values for job_id, num_course_taken
cade_students['job_id'] = cade_students['job_id'].fillna(0)
cade_students['num_course_taken'] = cade_students['num_course_taken'].fillna(0)

# #filling in current_career_path_id and time_spent_hrs with median value
cade_students['current_career_path_id'] = cade_students['current_career_path_id'].fillna(cade_students['current_career_path_id'].median())
cade_students['time_spent_hrs'] = cade_students['time_spent_hrs'].fillna(cade_students['time_spent_hrs'].median())

In [None]:
#inspecting null values after filling
cade_students.isnull().sum()

uuid                      0
name                      0
dob                       0
sex                       0
job_id                    0
num_course_taken          0
current_career_path_id    0
time_spent_hrs            0
email                     0
address                   0
city                      0
state                     0
zip_code                  0
dtype: int64

In [None]:
# #joining tables
merge_clean_cade = pd.merge(cade_students, cade_student_jobs, how='left', left_on='job_id', right_on='job_id')
cleaned_cade = pd.merge(merge_clean_cade, cade_courses, how='left', left_on='current_career_path_id', right_on='career_path_id')


In [None]:
# #making sure all columns were merged and there there is no missing
cleaned_cade.info()
con.close()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6648 entries, 0 to 6647
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uuid                    6648 non-null   int64  
 1   name                    6648 non-null   object 
 2   dob                     6648 non-null   object 
 3   sex                     6648 non-null   object 
 4   job_id                  6648 non-null   float64
 5   num_course_taken        6648 non-null   float64
 6   current_career_path_id  6648 non-null   float64
 7   time_spent_hrs          6648 non-null   float64
 8   email                   6648 non-null   object 
 9   address                 6648 non-null   object 
 10  city                    6648 non-null   object 
 11  state                   6648 non-null   object 
 12  zip_code                6648 non-null   object 
 13  job_category            6648 non-null   object 
 14  avg_salary              6648 non-null   

In [None]:
# creating the output CSV with cleaned data
sqlite_con = sqlite3.connect('cleaned_cade.db')
cleaned_cade.to_sql('cade_concat', sqlite_con, if_exists='replace' , index=False)

6648

In [None]:
cade_data = pd.read_sql_query('SELECT * FROM cade_concat', sqlite_con)

In [None]:
cade_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6648 entries, 0 to 6647
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uuid                    6648 non-null   int64  
 1   name                    6648 non-null   object 
 2   dob                     6648 non-null   object 
 3   sex                     6648 non-null   object 
 4   job_id                  6648 non-null   float64
 5   num_course_taken        6648 non-null   float64
 6   current_career_path_id  6648 non-null   float64
 7   time_spent_hrs          6648 non-null   float64
 8   email                   6648 non-null   object 
 9   address                 6648 non-null   object 
 10  city                    6648 non-null   object 
 11  state                   6648 non-null   object 
 12  zip_code                6648 non-null   object 
 13  job_category            6648 non-null   object 
 14  avg_salary              6648 non-null   

In [None]:
missing_data.to_sql('missing_info', sqlite_con, if_exists='replace', index=False)

256

In [None]:
missing_data = pd.read_sql_query('SELECT * FROM missing_info', sqlite_con)

In [None]:
sqlite_con.close()

In [None]:
cade_data.to_csv('cleaned_cade.csv')