In [5]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Load the Excel file
xls = pd.ExcelFile('https://github.com/saifrahmania/Data36118/raw/refs/heads/main/Assignment1/Data/ASCDataset/Australian%20Skills%20Classification%20-%20December%202023.xlsx')

# Dictionary to hold all DataFrames, one for each sheet
sheets_dict = {}

for sheet_name in xls.sheet_names:
    # Load each sheet into a DataFrame
    sheets_dict[sheet_name] = pd.read_excel(xls, sheet_name=sheet_name)

xls.sheet_names

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['Index',
 'Glossary',
 'Occupation descriptions',
 'Core competency descriptions',
 'Specialist tasks hierarchy',
 'Tech tools heirarchy',
 'Core competencies',
 'Specialist tasks data',
 'Technology tools',
 'Appendix - tech tool examples',
 'Appendix - common tech tools']

In [6]:
for sheet_name in xls.sheet_names:
    print(f"Columns in sheet '{sheet_name}':")
    print(sheets_dict[sheet_name].columns.tolist())
    print("-" * 20)

Columns in sheet 'Index':
['Unnamed: 0', 'Unnamed: 1']
--------------------
Columns in sheet 'Glossary':
['Glossary of key terms', 'Unnamed: 1']
--------------------
Columns in sheet 'Occupation descriptions':
['Occupation Type', 'ANZSCO Code', 'Sub-Profile Code', 'ANZSCO Title', 'ANZSCO Description']
--------------------
Columns in sheet 'Core competency descriptions':
['Core Competency', 'Core Competency Description', 'Score', 'Proficiency Level', 'Anchor Value']
--------------------
Columns in sheet 'Specialist tasks hierarchy':
['Specialist Task', 'Specialist Cluster', 'Cluster Family', 'Skill Statement']
--------------------
Columns in sheet 'Tech tools heirarchy':
['Technology Tool Category', 'Technology Tool Category Description', 'Technology Tool', 'Technology Tool Description', 'Technology Tool Extended Description']
--------------------
Columns in sheet 'Core competencies':
['Occupation Type', 'ANZSCO Code', 'Sub-Profile Code', 'ANZSCO Title', 'Core Competency', 'Score', 'Pro

In [7]:
sheets_dict = {}

for sheet_name in xls.sheet_names:
    # Load each sheet into a DataFrame
    sheets_dict[sheet_name] = pd.read_excel(xls, sheet_name=sheet_name)

# Accessing specific sheets and their data
index_df = sheets_dict['Index']
glossary_df = sheets_dict['Glossary']
occupation_descriptions_df = sheets_dict['Occupation descriptions']
core_competency_descriptions_df = sheets_dict['Core competency descriptions']
specialist_tasks_hierarchy_df = sheets_dict['Specialist tasks hierarchy']
tech_tools_hierarchy_df = sheets_dict['Tech tools heirarchy']
core_competencies_df = sheets_dict['Core competencies']
specialist_tasks_data_df = sheets_dict['Specialist tasks data']
technology_tools_df = sheets_dict['Technology tools']
appendix_tech_tool_examples_df = sheets_dict['Appendix - tech tool examples']
appendix_common_tech_tools_df = sheets_dict['Appendix - common tech tools']

# Now you can work with each DataFrame individually
# Example: Print the first 5 rows of the 'Occupation descriptions' sheet
index_df.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1
0,Australian Skills Classification,
1,Version 3.0 - current at December 2023 - updat...,
2,Index,
3,Glossary,Explanation of key terms used in this dataset.
4,Occupation descriptions,ANZSCO Occupation codes and descriptions.\nSub...


In [8]:
# prompt: column names of core_competiencies_df

print(core_competencies_df.columns.tolist())


['Occupation Type', 'ANZSCO Code', 'Sub-Profile Code', 'ANZSCO Title', 'Core Competency', 'Score', 'Proficiency Level', 'Anchor Value']


In [9]:
# prompt: drop 'Sub-Profile Code' from 'specialist_tasks_data_df' and 'core_competencies_df'

try:
  core_competencies_df = core_competencies_df.drop(columns=['Sub-Profile Code'])
except KeyError:
  print("Column 'Sub-Profile Code' not found in 'core_competencies_df'")

try:
  specialist_tasks_data_df = specialist_tasks_data_df.drop(columns=['Sub-Profile Code'])
except KeyError:
  print("Column 'Sub-Profile Code' not found in 'specialist_tasks_data_df'")


In [10]:
import pandas as pd

# Assuming specialist_tasks_data_df and core_competencies_df are loaded DataFrames

# First, let's ensure there are no duplicates within core_competencies_df that could cause multiple matches
core_competencies_df = core_competencies_df.drop_duplicates(subset=['Occupation Type', 'ANZSCO Title'], keep='first')

# Perform a left join with core_competencies_df to append matching data
merged_df = pd.merge(specialist_tasks_data_df, core_competencies_df,
                     on=['Occupation Type', 'ANZSCO Title'],
                     how='left')

# Check and print the number of rows and structure to ensure it matches expectations
print("After merge, DataFrame size: ", merged_df.shape)
print(merged_df.head())

# Optionally, check for any rows that might still have missing data indicating no match was found
unmatched_indicator = merged_df.isna().any(axis=1)
print("Number of unmatched rows: ", unmatched_indicator.sum())

# This approach ensures we do not inadvertently increase the number of rows in specialist_tasks_data_df.



After merge, DataFrame size:  (30450, 16)
  Occupation Type  ANZSCO Code_x                             ANZSCO Title  \
0        ANZSCO 4           1111  Chief Executives and Managing Directors   
1        ANZSCO 4           1111  Chief Executives and Managing Directors   
2        ANZSCO 4           1111  Chief Executives and Managing Directors   
3        ANZSCO 4           1111  Chief Executives and Managing Directors   
4        ANZSCO 4           1111  Chief Executives and Managing Directors   

                                     Specialist Task  % of time spent on task  \
0  Direct or manage financial activities or opera...                   0.1302   
1     Direct department or organisational activities                   0.1117   
2  Direct sales, marketing or customer service ac...                   0.0808   
3  Communicate with others to arrange, coordinate...                   0.0665   
4  Analyse data to assess operational or project ...                   0.0651   

  Emergi

In [11]:
merged_df.shape



(30450, 16)

In [16]:
merged_df.head()

Unnamed: 0,Occupation Type,ANZSCO Code_x,ANZSCO Title,Specialist Task,% of time spent on task,Emerging/\nTrending Flag,Specialist Cluster,% of time spent on cluster,Cluster Family,% of time spent on family,Skills Statement,ANZSCO Code_y,Core Competency,Score,Proficiency Level,Anchor Value
0,ANZSCO 4,1111,Chief Executives and Managing Directors,Direct or manage financial activities or opera...,0.1302,,"Manage, monitor and undertake financial activi...",0.1644,Business operations and financial activities,0.5322,Direct and oversee the financial operations of...,1111.0,Digital engagement,6.0,Intermediate,Use software on a portable device to document ...
1,ANZSCO 4,1111,Chief Executives and Managing Directors,Direct department or organisational activities,0.1117,Trending,"Manage services, staff or activities",0.2128,Business operations and financial activities,0.5322,Direct and oversee the activities of a work un...,1111.0,Digital engagement,6.0,Intermediate,Use software on a portable device to document ...
2,ANZSCO 4,1111,Chief Executives and Managing Directors,"Direct sales, marketing or customer service ac...",0.0808,,"Manage services, staff or activities",0.2128,Business operations and financial activities,0.5322,"Direct and oversee the sales, marketing, or cu...",1111.0,Digital engagement,6.0,Intermediate,Use software on a portable device to document ...
3,ANZSCO 4,1111,Chief Executives and Managing Directors,"Communicate with others to arrange, coordinate...",0.0665,,Communicate or collaborate with others,0.075,Communication and collaboration,0.089,"Coordinate with others in order to plan, organ...",1111.0,Digital engagement,6.0,Intermediate,Use software on a portable device to document ...
4,ANZSCO 4,1111,Chief Executives and Managing Directors,Analyse data to assess operational or project ...,0.0651,,Use data to inform operational decisions,0.1009,"Data, analytics, and databases",0.137,Analyse qualitative and quantitative data aris...,1111.0,Digital engagement,6.0,Intermediate,Use software on a portable device to document ...


In [12]:
# prompt: show the name of c0lumns and the number of empty rows

# Count empty rows in each column of merged_df
empty_rows_per_column = merged_df.isnull().sum()

# Print the column names and the number of empty rows for each
for column, empty_count in empty_rows_per_column.items():
    print(f"Column '{column}': {empty_count} empty rows")


Column 'Occupation Type': 0 empty rows
Column 'ANZSCO Code_x': 0 empty rows
Column 'ANZSCO Title': 0 empty rows
Column 'Specialist Task': 0 empty rows
Column '% of time spent on task': 6516 empty rows
Column 'Emerging/
Trending Flag': 29048 empty rows
Column 'Specialist Cluster': 0 empty rows
Column ' % of time spent on cluster': 6516 empty rows
Column 'Cluster Family': 0 empty rows
Column '% of time spent on family': 6516 empty rows
Column 'Skills Statement': 0 empty rows
Column 'ANZSCO Code_y': 6568 empty rows
Column 'Core Competency': 6568 empty rows
Column 'Score': 6568 empty rows
Column 'Proficiency Level': 6568 empty rows
Column 'Anchor Value': 6568 empty rows


In [21]:
# prompt: drop
# Column 'Emerging/Trending Flag': 29048 empty
# Column 'ANZSCO Code_y': 6568 empty rows
# Rename Column 'ANZSCO Code_x' to 'ANZSCO Code'

# Drop columns with a large number of empty rows
try:
    merged_df = merged_df.drop(columns=['Emerging/\nTrending Flag'])
except KeyError:
    print("Column 'Emerging/Trending Flag' not found in 'merged_df'")

try:
    merged_df = merged_df.drop(columns=['ANZSCO Code_y'])
except KeyError:
    print("Column 'ANZSCO Code_y' not found in 'merged_df'")

# Rename 'ANZSCO Code_x' to 'ANZSCO Code'
try:
    merged_df = merged_df.rename(columns={'ANZSCO Code_x': 'ANZSCO Code'})
except KeyError:
    print("Column 'ANZSCO Code_x' not found in 'merged_df'")


Column 'Emerging/Trending Flag' not found in 'merged_df'
Column 'ANZSCO Code_y' not found in 'merged_df'


In [22]:
# prompt: show the datattype of all column

# Display data types of all columns in the merged_df DataFrame
merged_df.dtypes


Unnamed: 0,0
Occupation Type,object
ANZSCO Code,int64
ANZSCO Title,object
Specialist Task,object
% of time spent on task,float64
Specialist Cluster,object
% of time spent on cluster,float64
Cluster Family,object
% of time spent on family,float64
Skills Statement,object


In [24]:
# prompt: check for Score and Proficiency Level has empty value/null/NaN, where you will find the number of rows where both columns have  empty value/null/NaN, or where one has empty value/null/NaN, either in Score has empty value/null/NaN but Proficiency Level has valueor  or  Score has value but Proficiency Level has empty value/null/NaNand show the values separately

# Check for empty/null/NaN values in 'Score' and 'Proficiency Level' columns

# Count rows where both 'Score' and 'Proficiency Level' are empty/null/NaN
both_empty = merged_df[merged_df['Score'].isnull() & merged_df['Proficiency Level'].isnull()].shape[0]
print(f"Number of rows with both 'Score' and 'Proficiency Level' empty: {both_empty}")

# Count rows where 'Score' is empty/null/NaN but 'Proficiency Level' has a value
score_empty_proficiency_not = merged_df[merged_df['Score'].isnull() & merged_df['Proficiency Level'].notnull()].shape[0]
print(f"Number of rows with 'Score' empty but 'Proficiency Level' not empty: {score_empty_proficiency_not}")

# Count rows where 'Score' has a value but 'Proficiency Level' is empty/null/NaN
score_not_empty_proficiency_empty = merged_df[merged_df['Score'].notnull() & merged_df['Proficiency Level'].isnull()].shape[0]
print(f"Number of rows with 'Score' not empty but 'Proficiency Level' empty: {score_not_empty_proficiency_empty}")

# Count rows where either 'Score' or 'Proficiency Level' is empty/null/NaN
either_empty = merged_df[merged_df['Score'].isnull() | merged_df['Proficiency Level'].isnull()].shape[0]
print(f"Number of rows with either 'Score' or 'Proficiency Level' empty: {either_empty}")


Number of rows with both 'Score' and 'Proficiency Level' empty: 6568
Number of rows with 'Score' empty but 'Proficiency Level' not empty: 0
Number of rows with 'Score' not empty but 'Proficiency Level' empty: 0
Number of rows with either 'Score' or 'Proficiency Level' empty: 6568


In [27]:
# prompt: calculate the mean value of '% of time spent on task', '% of time spent on family', and 'Score' and feel up the empty space or the null/NaN values

# Calculate the mean of '% of time spent on task', '% of time spent on family', and 'Score', ignoring NaN values
mean_task = merged_df['% of time spent on task'].mean()
mean_family = merged_df['% of time spent on family'].mean()
mean_score = merged_df['Score'].mean()

# Fill NaN values in the respective columns with their means
merged_df['% of time spent on task'].fillna(mean_task, inplace=True)
merged_df['% of time spent on family'].fillna(mean_family, inplace=True)
merged_df['Score'].fillna(mean_score, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['% of time spent on task'].fillna(mean_task, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['% of time spent on family'].fillna(mean_family, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the in

In [28]:
empty_rows_per_column = merged_df.isnull().sum()

# Print the column names and the number of empty rows for each
for column, empty_count in empty_rows_per_column.items():
    print(f"Column '{column}': {empty_count} empty rows")

Column 'Occupation Type': 0 empty rows
Column 'ANZSCO Code': 0 empty rows
Column 'ANZSCO Title': 0 empty rows
Column 'Specialist Task': 0 empty rows
Column '% of time spent on task': 0 empty rows
Column 'Specialist Cluster': 0 empty rows
Column ' % of time spent on cluster': 6516 empty rows
Column 'Cluster Family': 0 empty rows
Column '% of time spent on family': 0 empty rows
Column 'Skills Statement': 0 empty rows
Column 'Core Competency': 6568 empty rows
Column 'Score': 0 empty rows
Column 'Proficiency Level': 6568 empty rows
Column 'Anchor Value': 6568 empty rows


In [33]:
# prompt: print 10 rows where core competency has empty value, also print 10 rows where core competency is not null

# Display first 10 rows where 'Core Competency' is empty
empty = merged_df[merged_df['Core Competency'].isnull()].head(10)
empty.head(10)
# Display first 10 rows where 'Core Competency' is not empty
#notempty = merged_df[merged_df['Core Competency'].notnull()].head(10)
#notempty.head(10)


Unnamed: 0,Occupation Type,ANZSCO Code,ANZSCO Title,Specialist Task,% of time spent on task,Specialist Cluster,% of time spent on cluster,Cluster Family,% of time spent on family,Skills Statement,Core Competency,Score,Proficiency Level,Anchor Value
2967,ANZSCO 4,3242,Vehicle Body Builders and Trimmers,Install vehicle parts or accessories,0.09929,"Fabricate, assemble or install components",0.117188,Production processes and machinery,0.484652,"Mount, attach, install, or integrate parts or ...",,5.377146,,
2968,ANZSCO 4,3242,Vehicle Body Builders and Trimmers,Install machine or equipment replacement parts,0.087191,Repair equipment and electronics,0.087191,Production processes and machinery,0.484652,Install or replace damaged or faulty machine o...,,5.377146,,
2969,ANZSCO 4,3242,Vehicle Body Builders and Trimmers,Replace or repair non-engine automotive or veh...,0.077492,Repair mechanical parts or equipment,0.077492,Production processes and machinery,0.484652,Repair or replace worn or defective vehicle co...,,5.377146,,
2970,ANZSCO 4,3242,Vehicle Body Builders and Trimmers,"Remove dents from equipment, materials tools o...",0.067793,Repair parts or components,0.067793,Production processes and machinery,0.484652,Remove dents or other deformations from equipm...,,5.377146,,
2971,ANZSCO 4,3242,Vehicle Body Builders and Trimmers,"Trim excess materials from work pieces, struct...",0.061694,Inspect or prepare finished products,0.116188,Quality control and inspections,0.176282,"Trim excess materials from work pieces, struct...",,5.377146,,
2972,ANZSCO 4,3242,Vehicle Body Builders and Trimmers,"Smooth surfaces of objects, structures, or equ...",0.054495,Inspect or prepare finished products,0.116188,Quality control and inspections,0.176282,"File, smooth or sand the surfaces of objects, ...",,5.377146,,
2973,ANZSCO 4,3242,Vehicle Body Builders and Trimmers,Repair or replace vehicle glass,0.047595,Cut or replace glass,0.047595,Construction,0.072893,Repair or replace vehicle glass by assessing d...,,5.377146,,
2974,ANZSCO 4,3242,Vehicle Body Builders and Trimmers,Remove parts or components from vehicles,0.045395,Remove or dismantle objects and equipment,0.045395,Material transportation,0.045395,Manually or with the assistance of power tools...,,5.377146,,
2975,ANZSCO 4,3242,Vehicle Body Builders and Trimmers,"Plan operational activities, procedures or seq...",0.039096,Plan and coordinate work operations,0.039096,Business operations and financial activities,0.039096,"Plan operational activities, procedures, or se...",,5.377146,,
2976,ANZSCO 4,3242,Vehicle Body Builders and Trimmers,Measure distances or dimensions,0.037496,"Measure or calculate dimensions, distance, vol...",0.037496,Science and mathematics,0.060194,Measure the distances or dimensions between ob...,,5.377146,,


In [34]:
# prompt: drop those rows where 'Core Competency','Anchor Value' and 'Core Competency' are null/NaN

# Drop rows where 'Core Competency', 'Anchor Value', and 'Core Competency' are null/NaN
merged_df = merged_df.dropna(subset=['Core Competency', 'Anchor Value', how='any')


In [35]:
# prompt: now show me the name of all columns and their number of NaN/ null values

empty_rows_per_column = merged_df.isnull().sum()

# Print the column names and the number of empty rows for each
for column, empty_count in empty_rows_per_column.items():
    print(f"Column '{column}': {empty_count} empty rows")


Column 'Occupation Type': 0 empty rows
Column 'ANZSCO Code': 0 empty rows
Column 'ANZSCO Title': 0 empty rows
Column 'Specialist Task': 0 empty rows
Column '% of time spent on task': 0 empty rows
Column 'Specialist Cluster': 0 empty rows
Column ' % of time spent on cluster': 0 empty rows
Column 'Cluster Family': 0 empty rows
Column '% of time spent on family': 0 empty rows
Column 'Skills Statement': 0 empty rows
Column 'Core Competency': 0 empty rows
Column 'Score': 0 empty rows
Column 'Proficiency Level': 0 empty rows
Column 'Anchor Value': 0 empty rows


In [36]:
merged_df.shape

(23882, 14)

In [3]:
# import pandas as pd

# # Perform initial merge
# merged_df = pd.merge(specialist_tasks_data_df, core_competencies_df,
#                     left_on=['Occupation Type', 'ANZSCO Title'],
#                     right_on=['Occupation Type', 'ANZSCO Title'],
#                     how='left', indicator=True)

# # Identify unmatched rows in specialist_tasks_data_df based on the original index
# unmatched_indices = specialist_tasks_data_df[merged_df['_merge'] == 'left_only'].index

# # Remove the indicator column
# merged_df = merged_df.drop(columns=['_merge'])

# # Create a list to store rows to be inserted
# rows_to_insert = []

# # Iterate through unmatched indices in specialist_tasks_data_df
# for index in unmatched_indices:
#     # Find matching rows in core_competencies_df based on 'Occupation Type' and 'ANZSCO Title'
#     matching_rows = core_competencies_df[(core_competencies_df['Occupation Type'] == specialist_tasks_data_df.loc[index, 'Occupation Type']) &
#                                          (core_competencies_df['ANZSCO Title'] == specialist_tasks_data_df.loc[index, 'ANZSCO Title'])]

#     # If a match is found, append the matching row to rows_to_insert
#     if not matching_rows.empty:
#         rows_to_insert.append(matching_rows.iloc[0]) # Take the first matching row if multiple exist

# # Create a DataFrame from rows_to_insert
# insert_df = pd.DataFrame(rows_to_insert)

# # Concatenate the original and insert DataFrames
# final_df = pd.concat([specialist_tasks_data_df, insert_df], ignore_index=True)

# final_df.head()

NameError: name 'specialist_tasks_data_df' is not defined

In [None]:
final_df.shape

In [None]:
# # prompt: show me name of the columns of final_df and their number of empty value

# # Display the number of empty values for each column in final_df
# for col in final_df.columns:
#     print(f"Column '{col}': {final_df[col].isnull().sum()} empty values")


In [37]:
# # prompt: for
# # Column '% of time spent on family': 6516 empty values
# # Column ' % of time spent on cluster': 6516 empty values
# # Column '% of time spent on task': 6516 empty values
# # show me the value type. and the percentage of empty value in the column

# # Display the number of empty values for each column in final_df
# # and the data type of each column
# for col in final_df.columns:
#     empty_count = final_df[col].isnull().sum()
#     empty_percentage = (empty_count / len(final_df)) * 100
#     print(f"Column '{col}': {empty_count} empty values ({empty_percentage:.2f}%),  data type: {final_df[col].dtype}")


NameError: name 'final_df' is not defined

In [38]:
# prompt: for these three column
# # Column '% of time spent on family': 6516 empty values
# # Column ' % of time spent on cluster': 6516 empty values
# # Column '% of time spent on task': 6516 empty values
# find out the median value and insert in the empty cells

# Calculate the median for each column, excluding NaN values
median_family_time = merged_df['% of time spent on family'].median()
median_cluster_time = merged_df[' % of time spent on cluster'].median()
median_task_time = merged_df['% of time spent on task'].median()

# Fill NaN values with the calculated medians
merged_df['% of time spent on family'] = merged_df['% of time spent on family'].fillna(median_family_time)
merged_df[' % of time spent on cluster'] = merged_df[' % of time spent on cluster'].fillna(median_cluster_time)
merged_df['% of time spent on task'] = merged_df['% of time spent on task'].fillna(median_task_time)

# Verify that there are no more empty values in these columns
print(f"Column '% of time spent on family': {merged_df['% of time spent on family'].isnull().sum()} empty values")
print(f"Column ' % of time spent on cluster': {merged_df[' % of time spent on cluster'].isnull().sum()} empty values")
print(f"Column '% of time spent on task': {merged_df['% of time spent on task'].isnull().sum()} empty values")


Column '% of time spent on family': 0 empty values
Column ' % of time spent on cluster': 0 empty values
Column '% of time spent on task': 0 empty values


In [39]:
merged_df.head()

Unnamed: 0,Occupation Type,ANZSCO Code,ANZSCO Title,Specialist Task,% of time spent on task,Specialist Cluster,% of time spent on cluster,Cluster Family,% of time spent on family,Skills Statement,Core Competency,Score,Proficiency Level,Anchor Value
0,ANZSCO 4,1111,Chief Executives and Managing Directors,Direct or manage financial activities or opera...,0.1302,"Manage, monitor and undertake financial activi...",0.1644,Business operations and financial activities,0.5322,Direct and oversee the financial operations of...,Digital engagement,6.0,Intermediate,Use software on a portable device to document ...
1,ANZSCO 4,1111,Chief Executives and Managing Directors,Direct department or organisational activities,0.1117,"Manage services, staff or activities",0.2128,Business operations and financial activities,0.5322,Direct and oversee the activities of a work un...,Digital engagement,6.0,Intermediate,Use software on a portable device to document ...
2,ANZSCO 4,1111,Chief Executives and Managing Directors,"Direct sales, marketing or customer service ac...",0.0808,"Manage services, staff or activities",0.2128,Business operations and financial activities,0.5322,"Direct and oversee the sales, marketing, or cu...",Digital engagement,6.0,Intermediate,Use software on a portable device to document ...
3,ANZSCO 4,1111,Chief Executives and Managing Directors,"Communicate with others to arrange, coordinate...",0.0665,Communicate or collaborate with others,0.075,Communication and collaboration,0.089,"Coordinate with others in order to plan, organ...",Digital engagement,6.0,Intermediate,Use software on a portable device to document ...
4,ANZSCO 4,1111,Chief Executives and Managing Directors,Analyse data to assess operational or project ...,0.0651,Use data to inform operational decisions,0.1009,"Data, analytics, and databases",0.137,Analyse qualitative and quantitative data aris...,Digital engagement,6.0,Intermediate,Use software on a portable device to document ...


In [None]:
# # prompt: drop 'Sub-Profile Code' and 'Emerging/Trending Flag' columns

# # Assuming 'final_df' is the DataFrame from the previous code

# # Drop the specified columns
# final_df = final_df.drop(columns=['Sub-Profile Code', 'Emerging/Trending Flag'], errors='ignore')

# # Display the updated DataFrame (optional)
# final_df.head()


In [None]:
# # prompt: number of unique 'ANZSCO Title' in 'final_df'

# num_unique_anzsco_titles = final_df['ANZSCO Title'].nunique()
# print(f"Number of unique ANZSCO Titles in 'final_df': {num_unique_anzsco_titles}")


In [None]:
# # prompt: show me number of rows along with the empty values and number of unique rows

# # Assuming 'final_df' is the DataFrame from the previous code

# # Display the number of rows in the DataFrame
# num_rows = len(final_df)
# print(f"Number of rows in the DataFrame: {num_rows}")

# # Display the number of empty values for each column
# for col in final_df.columns:
#     empty_count = final_df[col].isnull().sum()
#     print(f"Column '{col}': {empty_count} empty values")

# # Calculate and display the number of unique rows
# num_unique_rows = len(final_df.drop_duplicates())
# print(f"Number of unique rows in the DataFrame: {num_unique_rows}")


In [None]:
# # prompt: drop 'Emerging/\nTrending Flag' column

# # Assuming 'final_df' is the DataFrame from the previous code

# # Drop the specified column
# final_df = final_df.drop(columns=['Emerging/\nTrending Flag'], errors='ignore')

# # Display the updated DataFrame (optional)
# final_df.head()


In [40]:
merged_df.shape

(23882, 14)

In [41]:
num_rows = len(merged_df)
print(f"Number of rows in the DataFrame: {num_rows}")

# Display the number of empty values for each column
for col in merged_df.columns:
    empty_count = merged_df[col].isnull().sum()
    print(f"Column '{col}': {empty_count} empty values")

# Calculate and display the number of unique rows
num_unique_rows = len(merged_df.drop_duplicates())
print(f"Number of unique rows in the DataFrame: {num_unique_rows}")


Number of rows in the DataFrame: 23882
Column 'Occupation Type': 0 empty values
Column 'ANZSCO Code': 0 empty values
Column 'ANZSCO Title': 0 empty values
Column 'Specialist Task': 0 empty values
Column '% of time spent on task': 0 empty values
Column 'Specialist Cluster': 0 empty values
Column ' % of time spent on cluster': 0 empty values
Column 'Cluster Family': 0 empty values
Column '% of time spent on family': 0 empty values
Column 'Skills Statement': 0 empty values
Column 'Core Competency': 0 empty values
Column 'Score': 0 empty values
Column 'Proficiency Level': 0 empty values
Column 'Anchor Value': 0 empty values
Number of unique rows in the DataFrame: 23882


In [42]:
# prompt: show me the columns name and their amount of empty value of 'technology_tools_data_df

# Assuming 'technology_tools_data_df' is defined in your previous code.
# If not, replace it with the actual name of the DataFrame.

# Display the number of empty values for each column in technology_tools_df
for col in technology_tools_df.columns:
    empty_count = technology_tools_df[col].isnull().sum()
    print(f"Column '{col}': {empty_count} empty values")


Column 'Occupation Type': 0 empty values
Column 'ANZSCO Code': 0 empty values
Column 'Sub-Profile Code': 4461 empty values
Column 'ANZSCO Title': 0 empty values
Column 'Technology Tool': 0 empty values
Column 'Emerging/Trending Flag': 5320 empty values


In [None]:
# prompt: drop the 'Emerging/Trending Flag'  and 'Sub-Profile Code' from 'technology_tools_df'

# Drop the specified columns from technology_tools_df
technology_tools_df = technology_tools_df.drop(columns=['Emerging/Trending Flag', 'Sub-Profile Code'], errors='ignore')

# Display the updated DataFrame (optional)
technology_tools_df.head()


In [53]:
# prompt: show number of rows in technology_tools_df

print(len(technology_tools_df))


5761


In [47]:
import pandas as pd

# Load the dataframes if not already loaded
# Assuming final_df and technology_tools_df are defined with the provided structure

# Aggregate technology_tools_df by concatenating 'Technology Tool' entries for each unique key pair
technology_tools_aggregated = technology_tools_df.groupby(['Occupation Type', 'ANZSCO Title'])['Technology Tool'].apply(', '.join).reset_index()

# Merge final_df with this aggregated dataframe
merged_df = pd.merge(merged_df, technology_tools_aggregated, on=['Occupation Type', 'ANZSCO Title'], how='left')

# Now, merged_df will have an additional column 'Technology Tool' from technology_tools_df
# This column contains concatenated strings of tools, ensuring no increase in row count

# Display the head of the merged dataframe to verify
merged_df.head()


Unnamed: 0,Occupation Type,ANZSCO Code,ANZSCO Title,Specialist Task,% of time spent on task,Specialist Cluster,% of time spent on cluster,Cluster Family,% of time spent on family,Skills Statement,Core Competency,Score,Proficiency Level,Anchor Value,ANZSCO Description,Technology Tool_x,Technology Tool_y,Technology Tool
0,ANZSCO 4,1111,Chief Executives and Managing Directors,Direct or manage financial activities or opera...,0.1302,"Manage, monitor and undertake financial activi...",0.1644,Business operations and financial activities,0.5322,Direct and oversee the financial operations of...,Digital engagement,6.0,Intermediate,Use software on a portable device to document ...,Chief Executives and Managing Directors determ...,"Accounting and financial management systems, A...","Accounting and financial management systems, A...","Accounting and financial management systems, A..."
1,ANZSCO 4,1111,Chief Executives and Managing Directors,Direct department or organisational activities,0.1117,"Manage services, staff or activities",0.2128,Business operations and financial activities,0.5322,Direct and oversee the activities of a work un...,Digital engagement,6.0,Intermediate,Use software on a portable device to document ...,Chief Executives and Managing Directors determ...,"Accounting and financial management systems, A...","Accounting and financial management systems, A...","Accounting and financial management systems, A..."
2,ANZSCO 4,1111,Chief Executives and Managing Directors,"Direct sales, marketing or customer service ac...",0.0808,"Manage services, staff or activities",0.2128,Business operations and financial activities,0.5322,"Direct and oversee the sales, marketing, or cu...",Digital engagement,6.0,Intermediate,Use software on a portable device to document ...,Chief Executives and Managing Directors determ...,"Accounting and financial management systems, A...","Accounting and financial management systems, A...","Accounting and financial management systems, A..."
3,ANZSCO 4,1111,Chief Executives and Managing Directors,"Communicate with others to arrange, coordinate...",0.0665,Communicate or collaborate with others,0.075,Communication and collaboration,0.089,"Coordinate with others in order to plan, organ...",Digital engagement,6.0,Intermediate,Use software on a portable device to document ...,Chief Executives and Managing Directors determ...,"Accounting and financial management systems, A...","Accounting and financial management systems, A...","Accounting and financial management systems, A..."
4,ANZSCO 4,1111,Chief Executives and Managing Directors,Analyse data to assess operational or project ...,0.0651,Use data to inform operational decisions,0.1009,"Data, analytics, and databases",0.137,Analyse qualitative and quantitative data aris...,Digital engagement,6.0,Intermediate,Use software on a portable device to document ...,Chief Executives and Managing Directors determ...,"Accounting and financial management systems, A...","Accounting and financial management systems, A...","Accounting and financial management systems, A..."


In [54]:
# prompt: drop 'Technology Tool_y', 'Technology Tool_x'  from merged_df

try:
    merged_df = merged_df.drop(columns=['Technology Tool_y', 'Technology Tool_x'])
except KeyError:
    print("Columns 'Technology Tool_y' or 'Technology Tool_x' not found in 'merged_df'")


In [55]:
merged_df.shape
merged_df.head()

Unnamed: 0,Occupation Type,ANZSCO Code,ANZSCO Title,Specialist Task,% of time spent on task,Specialist Cluster,% of time spent on cluster,Cluster Family,% of time spent on family,Skills Statement,Core Competency,Score,Proficiency Level,Anchor Value,ANZSCO Description_x,Technology Tool,ANZSCO Description_y,ANZSCO Description
0,ANZSCO 4,1111,Chief Executives and Managing Directors,Direct or manage financial activities or opera...,0.1302,"Manage, monitor and undertake financial activi...",0.1644,Business operations and financial activities,0.5322,Direct and oversee the financial operations of...,Digital engagement,6.0,Intermediate,Use software on a portable device to document ...,Chief Executives and Managing Directors determ...,"Accounting and financial management systems, A...",Chief Executives and Managing Directors determ...,Chief Executives and Managing Directors determ...
1,ANZSCO 4,1111,Chief Executives and Managing Directors,Direct department or organisational activities,0.1117,"Manage services, staff or activities",0.2128,Business operations and financial activities,0.5322,Direct and oversee the activities of a work un...,Digital engagement,6.0,Intermediate,Use software on a portable device to document ...,Chief Executives and Managing Directors determ...,"Accounting and financial management systems, A...",Chief Executives and Managing Directors determ...,Chief Executives and Managing Directors determ...
2,ANZSCO 4,1111,Chief Executives and Managing Directors,"Direct sales, marketing or customer service ac...",0.0808,"Manage services, staff or activities",0.2128,Business operations and financial activities,0.5322,"Direct and oversee the sales, marketing, or cu...",Digital engagement,6.0,Intermediate,Use software on a portable device to document ...,Chief Executives and Managing Directors determ...,"Accounting and financial management systems, A...",Chief Executives and Managing Directors determ...,Chief Executives and Managing Directors determ...
3,ANZSCO 4,1111,Chief Executives and Managing Directors,"Communicate with others to arrange, coordinate...",0.0665,Communicate or collaborate with others,0.075,Communication and collaboration,0.089,"Coordinate with others in order to plan, organ...",Digital engagement,6.0,Intermediate,Use software on a portable device to document ...,Chief Executives and Managing Directors determ...,"Accounting and financial management systems, A...",Chief Executives and Managing Directors determ...,Chief Executives and Managing Directors determ...
4,ANZSCO 4,1111,Chief Executives and Managing Directors,Analyse data to assess operational or project ...,0.0651,Use data to inform operational decisions,0.1009,"Data, analytics, and databases",0.137,Analyse qualitative and quantitative data aris...,Digital engagement,6.0,Intermediate,Use software on a portable device to document ...,Chief Executives and Managing Directors determ...,"Accounting and financial management systems, A...",Chief Executives and Managing Directors determ...,Chief Executives and Managing Directors determ...


In [None]:
# prompt: drop ANZSCO Description_y and ANZSCO Description_x from merged_df

try:
    merged_df = merged_df.drop(columns=['ANZSCO Description_y', 'ANZSCO Description_x'])
except KeyError:
    print("Columns 'ANZSCO Description_y' or 'ANZSCO Description_x' not found in 'merged_df'")


In [57]:
merged_df = merged_df.drop(columns=['ANZSCO Description_y', 'ANZSCO Description_x'])
merged_df.head()



Unnamed: 0,Occupation Type,ANZSCO Code,ANZSCO Title,Specialist Task,% of time spent on task,Specialist Cluster,% of time spent on cluster,Cluster Family,% of time spent on family,Skills Statement,Core Competency,Score,Proficiency Level,Anchor Value,Technology Tool,ANZSCO Description
0,ANZSCO 4,1111,Chief Executives and Managing Directors,Direct or manage financial activities or opera...,0.1302,"Manage, monitor and undertake financial activi...",0.1644,Business operations and financial activities,0.5322,Direct and oversee the financial operations of...,Digital engagement,6.0,Intermediate,Use software on a portable device to document ...,"Accounting and financial management systems, A...",Chief Executives and Managing Directors determ...
1,ANZSCO 4,1111,Chief Executives and Managing Directors,Direct department or organisational activities,0.1117,"Manage services, staff or activities",0.2128,Business operations and financial activities,0.5322,Direct and oversee the activities of a work un...,Digital engagement,6.0,Intermediate,Use software on a portable device to document ...,"Accounting and financial management systems, A...",Chief Executives and Managing Directors determ...
2,ANZSCO 4,1111,Chief Executives and Managing Directors,"Direct sales, marketing or customer service ac...",0.0808,"Manage services, staff or activities",0.2128,Business operations and financial activities,0.5322,"Direct and oversee the sales, marketing, or cu...",Digital engagement,6.0,Intermediate,Use software on a portable device to document ...,"Accounting and financial management systems, A...",Chief Executives and Managing Directors determ...
3,ANZSCO 4,1111,Chief Executives and Managing Directors,"Communicate with others to arrange, coordinate...",0.0665,Communicate or collaborate with others,0.075,Communication and collaboration,0.089,"Coordinate with others in order to plan, organ...",Digital engagement,6.0,Intermediate,Use software on a portable device to document ...,"Accounting and financial management systems, A...",Chief Executives and Managing Directors determ...
4,ANZSCO 4,1111,Chief Executives and Managing Directors,Analyse data to assess operational or project ...,0.0651,Use data to inform operational decisions,0.1009,"Data, analytics, and databases",0.137,Analyse qualitative and quantitative data aris...,Digital engagement,6.0,Intermediate,Use software on a portable device to document ...,"Accounting and financial management systems, A...",Chief Executives and Managing Directors determ...


In [None]:
merged_df.shape

In [None]:
print(tech_tools_hierarchy_df.head())


In [None]:
tech_tools_hierarchy_df.info()


In [None]:
tech_tools_hierarchy_df.shape

In [None]:
# Assuming merged_df and tech_tools_hierarchy_df are defined and loaded as per your descriptions

# Optional: Clean/prepare the 'Technology Tool' columns if necessary (e.g., strip spaces, convert to same case)
merged_df['Technology Tool'] = merged_df['Technology Tool'].str.strip().str.lower()
tech_tools_hierarchy_df['Technology Tool'] = tech_tools_hierarchy_df['Technology Tool'].str.strip().str.lower()

# Perform the merge operation
merged_df = pd.merge(merged_df, tech_tools_hierarchy_df[['Technology Tool', 'Technology Tool Description', 'Technology Tool Extended Description', 'Technology Tool Category', 'Technology Tool Category Description']],
                     on='Technology Tool',
                     how='left')

# Print the first few rows to verify the new structure and content
print(merged_df.head())

# Check the number of rows to ensure they haven't increased
print("Number of rows in the merged DataFrame:", len(merged_df))

In [None]:
# prompt: show number of unique of rows

# Calculate and display the number of unique rows in merged_df
num_unique_rows_merged = len(merged_df.drop_duplicates())
print(f"Number of unique rows in 'merged_df': {num_unique_rows_merged}")


In [None]:
# prompt: data size of merged_df in mb or kb

import pandas as pd

# Assuming merged_df is already defined from the previous code

# Calculate memory usage of merged_df in MB
memory_usage_mb = merged_df.memory_usage(deep=True).sum() / (1024 * 1024)

print(f"Memory usage of merged_df: {memory_usage_mb:.2f} MB")


In [None]:
# prompt: show name of all columns for core_competencies_df

print(core_competencies_df.columns.tolist())


In [None]:
print(core_competency_descriptions_df.columns.tolist())

In [None]:
merged_df.columns.tolist()