# RO2
1. Examining factors that influence working over the required working hours and under the required working hours.

**2. Conducting association rule analysis to identify factors influencing the mismatch between highest qualifications and employment qualifications.**

3. Investigating how individuals address the skill gap when they lack the necessary qualifications in their current profession

In [1]:
import pandas  as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Prepare data

In [2]:
df = pd.read_csv('data_sorted.csv')
codebook = pd.read_excel('PIAAC\Cycle 1\International Codebook_PIAAC Public-use File (PUF) Variables and Values.xlsx')

print(df.shape)
df.head()

  df = pd.read_csv('data_sorted.csv')


(11470, 93)


Unnamed: 0,Skill use work - ICT - Computer - How often - Programming language,ISCO1C,Skill use work - ICT - Computer - How often - Real-time discussions,Skill use work - How often - Organising own time,Skill use everyday life - ICT - Computer - How often - Word,Skill use work - How often - Planning others activities,PVPSL6,About yourself - Learning strategies - Attribute something new,ICTHOME,PVPSL8,...,Education - Highest qualification - Level,PVPSL4,PVPSL10,Skill use everyday life - ICT - Internet - How often - In order to better understand various issues,About yourself - Learning strategies - Get to the bottom of difficult things,WorkHours,WorkHours_Difference,Qualification_Comparision,Qualification_Status,PVPSL_Performance
0,At least once a week but not every day,Service workers and shop and market sales workers,Less than once a month,Every day,Less than once a week but at least once a month,Never,315.18911,To a high extent,2.02194,296.26412,...,"ISCED 5A, bachelor degree",342.29315,318.98045,Less than once a week but at least once a month,To a high extent,Over Average,18,Over by 5 level(s),Over,Moderate performer
1,Less than once a week but at least once a month,Professionals,Never,Every day,Never,Less than once a month,326.64999,To some extent,1.646234,273.53912,...,"ISCED 5A, bachelor degree",325.28718,296.98809,At least once a week but not every day,To a high extent,Over Average,8,Equal,Equal,Moderate performer
2,Never,"Legislators, senior officials and managers",At least once a week but not every day,Every day,At least once a week but not every day,At least once a week but not every day,285.41693,To some extent,3.430657,279.23163,...,"ISCED 5A, bachelor degree",316.26258,249.59748,Every day,Very little,Over Average,18,Under by 2 level(s),Under,Weak performer
3,Never,Service workers and shop and market sales workers,Never,Less than once a month,Less than once a month,Never,332.5534,To a high extent,1.623673,284.82134,...,ISCED 5B,318.41257,325.03424,At least once a week but not every day,To some extent,Under Average,-15,Over by 5 level(s),Over,Moderate performer
4,Never,Technicians and associate professionals,Never,Never,Never,Never,300.41854,To some extent,1.365424,290.70099,...,ISCED 3C 2 years or more,307.69612,307.37447,Less than once a month,Very little,Under Average,-10,Over by 1 level(s),Over,Moderate performer


In [3]:
# variables

# dropped AGE_R, ISCO2C
demographic_var = ['CNTRYID', 'GENDER_R', 'LNG_HOME', 'NATIVELANG', 'NATIVESPEAKER', 'J_Q06b', 'J_Q07b', 'I_Q08', 'J_Q04a']
eduwork_var = ['CNTRYID', 'C_Q07', 'C_D06', 'B_Q01a', 'B_Q01a3', 'B_Q01b', 'D_Q12a', 'B_Q05c', 'B_Q10c', 'D_Q14', 'ISCOSKIL4', 'ISCO1C', 'D_Q10', 'D_Q03', 'D_Q04', 'D_Q07a', 'D_Q07b', 'D_Q08a', 'D_Q08b', 'VET', 'NFEHRS']

worksl_var = ['CNTRYID', 'ICTWORK', 'NFEHRSJR']
worksl_var.extend(codebook.loc[codebook['Name'].str.startswith('F_'), 'Name'].tolist())
worksl_var.extend(codebook.loc[codebook['Name'].str.startswith('G_') & codebook['Label'].str.contains('ICT'), 'Name'].tolist())

lifesl_var = ['CNTRYID', 'ICTHOME', 'NFEHRSNJR']
lifesl_var.extend(codebook.loc[codebook['Name'].str.startswith('H_') & codebook['Label'].str.contains('ICT'), 'Name'].tolist())
lifesl_var.extend(codebook.loc[codebook['Name'].str.startswith('I_') & codebook['Label'].str.contains('Learning'), 'Name'].tolist())

pvpsl = []
pvpsl.extend(codebook.loc[codebook['Name'].str.startswith('PVPSL'), 'Name'].tolist())


vars = list(set(demographic_var + eduwork_var + worksl_var + lifesl_var + pvpsl))
vars_keep = [var for var in vars if not (var[0].isalpha() and var[1] == '_')]
vars_change = [var for var in vars if var[0].isalpha() and var[1] == '_']

vars_dict = {}

for i in vars_change:
    match = codebook.loc[codebook['Name'] == i, 'Label'].tolist()
    if match:
        vars_dict[i] = match[0]


demographic_mapped = [vars_dict.get(value, value) if value is not None else None for value in demographic_var]
eduwork_mapped = [vars_dict.get(value, value) if value is not None else None for value in eduwork_var]
worksl_mapped = [vars_dict.get(value, value) if value is not None else None for value in worksl_var]
lifesl_mapped = [vars_dict.get(value, value) if value is not None else None for value in lifesl_var]

hours_var = ['WorkHours', 'WorkHours_Difference']
match_var = ['Qualification_Comparision', 'Qualification_Status']
performance_var = 'PVPSL_Performance'

print(len(demographic_mapped) + len(eduwork_mapped) + len(worksl_mapped) + len(lifesl_mapped) + len(hours_var) + len(match_var) + len(performance_var))

100


# Data Setup

In [4]:
sns.set_palette("dark")
sns.set_style('darkgrid')
sns.set(font_scale=1.1)

In [5]:
print(df.shape)
df['Qualification_Status'].value_counts()

(11470, 93)


Qualification_Status
Equal    4966
Over     4834
Under    1670
Name: count, dtype: int64

In [6]:
drop = pvpsl
drop.append('Qualification_Comparision')


# df2 = df[df['Qualification_Status'] != 'Equal']
df2 = df.drop(columns=drop)

display(df2['Qualification_Status'].value_counts())
display(df2.shape)

Qualification_Status
Equal    4966
Over     4834
Under    1670
Name: count, dtype: int64

(11470, 82)

In [7]:
over_df = df2[df2['Qualification_Status'] == 'Over']
under_df = df2[df2['Qualification_Status'] == 'Under']
equal_df = df2[df2['Qualification_Status'] == 'Equal']

print(over_df.shape)
print(under_df.shape)
print(equal_df.shape)

(4834, 82)
(1670, 82)
(4966, 82)


# ARM with MlExtend

In [8]:
from apyori import apriori
import mlxtend.frequent_patterns as fp

results_path = "RQ2/"

In [9]:
target = 'Qualification_Status'

# get only the 1 cons

def refineRules(type, assoc_rules, support, threshold):
    filename_con_all = results_path + type + '/ARM_consequents_' + str(support) + '_' + str(threshold) + '.csv'
    filename_con = results_path + type + '/ARM_consequents_ONE_' + str(support) + '_' + str(threshold) + '.csv'

    con_all_condition = assoc_rules['consequents'].apply(lambda x: any(target in str(item) for item in x))
    con_one_condition = assoc_rules['consequents'].apply(lambda x: any(target in str(item) for item in x) and len(x) == 1)

    consequents_all = assoc_rules[con_all_condition]
    consequents_one = assoc_rules[con_one_condition]
    
    consequents_all.to_csv(filename_con_all, index=False)
    consequents_one.to_csv(filename_con, index=False)

In [10]:
def makeAssocRules(type, data, support, threshold):
    # filename_growth = results_path + type + '/ARM_Growth_' + str(support) + '.csv'

    fp_apriori = fp.apriori(data,min_support=support,use_colnames=True)
    # fp_fpgrowth = fp.fpgrowth(data, min_support=support, use_colnames=True)
    
    assoc_rules = fp.association_rules(fp_apriori,metric='confidence', min_threshold=threshold,support_only=False)
    
    # fp_fpgrowth.to_csv(filename_growth, index=False)
    
    refineRules(type, assoc_rules, support, threshold)

In [11]:
# Over
over_onehot = pd.get_dummies(over_df, columns=over_df.columns)
display(over_onehot.head(5))
display(over_onehot.shape)

# Under
under_onehot = pd.get_dummies(under_df, columns=under_df.columns)
display(under_onehot.head(5))
display(under_onehot.shape)

# Equal
equal_onehot = pd.get_dummies(equal_df, columns=equal_df.columns)
display(equal_onehot.head(5))
display(equal_onehot.shape)


Unnamed: 0,Skill use work - ICT - Computer - How often - Programming language_At least once a week but not every day,Skill use work - ICT - Computer - How often - Programming language_Every day,Skill use work - ICT - Computer - How often - Programming language_Less than once a month,Skill use work - ICT - Computer - How often - Programming language_Less than once a week but at least once a month,Skill use work - ICT - Computer - How often - Programming language_Never,Skill use work - ICT - Computer - How often - Programming language_Not stated or inferred,ISCO1C_Armed forces,ISCO1C_Clerks,ISCO1C_Craft and related trades workers,ISCO1C_Don't know,...,WorkHours_Difference_52,WorkHours_Difference_58,WorkHours_Difference_63,WorkHours_Difference_64,WorkHours_Difference_70,Qualification_Status_Over,PVPSL_Performance_At risk,PVPSL_Performance_Moderate performer,PVPSL_Performance_Strong performer,PVPSL_Performance_Weak performer
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
3,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
4,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
6,False,False,False,False,True,False,False,False,True,False,...,False,False,False,False,False,True,True,False,False,False
9,False,False,False,False,True,False,False,True,False,False,...,False,False,False,False,False,True,False,False,True,False


(4834, 5067)

Unnamed: 0,Skill use work - ICT - Computer - How often - Programming language_At least once a week but not every day,Skill use work - ICT - Computer - How often - Programming language_Every day,Skill use work - ICT - Computer - How often - Programming language_Less than once a month,Skill use work - ICT - Computer - How often - Programming language_Less than once a week but at least once a month,Skill use work - ICT - Computer - How often - Programming language_Never,Skill use work - ICT - Computer - How often - Programming language_Not stated or inferred,ISCO1C_Armed forces,ISCO1C_Clerks,ISCO1C_Craft and related trades workers,ISCO1C_Don't know,...,WorkHours_Difference_29,WorkHours_Difference_32,WorkHours_Difference_36,WorkHours_Difference_42,WorkHours_Difference_58,Qualification_Status_Under,PVPSL_Performance_At risk,PVPSL_Performance_Moderate performer,PVPSL_Performance_Strong performer,PVPSL_Performance_Weak performer
2,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
12,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
13,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,True,True,False,False,False
18,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
20,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False


(1670, 2835)

Unnamed: 0,Skill use work - ICT - Computer - How often - Programming language_At least once a week but not every day,Skill use work - ICT - Computer - How often - Programming language_Every day,Skill use work - ICT - Computer - How often - Programming language_Less than once a month,Skill use work - ICT - Computer - How often - Programming language_Less than once a week but at least once a month,Skill use work - ICT - Computer - How often - Programming language_Never,Skill use work - ICT - Computer - How often - Programming language_Not stated or inferred,ISCO1C_Armed forces,ISCO1C_Clerks,ISCO1C_Craft and related trades workers,ISCO1C_Don't know,...,WorkHours_Difference_47,WorkHours_Difference_48,WorkHours_Difference_52,WorkHours_Difference_53,WorkHours_Difference_58,Qualification_Status_Equal,PVPSL_Performance_At risk,PVPSL_Performance_Moderate performer,PVPSL_Performance_Strong performer,PVPSL_Performance_Weak performer
1,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
5,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
7,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
8,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
11,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False


(4966, 5206)

In [12]:
support = 0.4
thresh_range = 1
threshold = 0.5

In [13]:
makeAssocRules('Over', over_onehot, support, threshold)
print('Over = COMPLETED') # 85 mins

makeAssocRules('Under', under_onehot, support, threshold)
print('Under = COMPLETED')

makeAssocRules('Equal', equal_onehot, support, threshold)
print('Equal = COMPLETED')

Over = COMPLETED
Under = COMPLETED
Equal = COMPLETED


In [14]:
# for i in range(supp_range, 10, 1):
#     support = i / 10
    
#     # for j in range(thresh_range, 10, 1):
#         # threshold = j / 10

#     makeAssocRules('Over', over_onehot, support, threshold)

In [15]:
# for i in range(supp_range, 10, 1):
#     support = i / 10
    
#     # for j in range(thresh_range, 10, 1):
#     #     threshold = j / 10
        
#     makeAssocRules('Under', under_onehot, support, threshold)
#     print('Support : ', support, ' [COMPLETED]')

In [16]:
# for i in range(supp_range, 10, 1):
#     support = i / 10
    
#     # for j in range(thresh_range, 10, 1):
#     #     threshold = j / 10
        
#     makeAssocRules('Equal', equal_onehot, support, threshold)
#     print('Support : ', support, ' [COMPLETED]')