# Association Rule Mining - FP Growth

In [1]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load Dataset
# The CSV is fetched straight from the public GitHub repository on every run.

DATASET_URL = 'https://raw.githubusercontent.com/renatomaaliw3/public_files/refs/heads/master/Data%20Sets/software_developer.csv'

df = pd.read_csv(DATASET_URL)
df

Unnamed: 0,Age,Years_of_Experience,Degree,Coding_Test_Score,Interview_Score,Portfolio_Project_Count,Fullstack_Knowledge,Preferred_WE,Certifications,Previous_Job_Level,Leadership_Experience,Hackathon_Participation,Attitude_Evaluation,Status
0,41-50,3-5 years,No Degree,Code_Medium,Interview_Medium,Many Projects,Full_No,On-site,GCP,Prev_Lead,Lead_Yes,Hack_No,ATT_Positive,STATUS_Hired
1,20-30,3-5 years,Master's,Code_Medium,Interview_Low,No Projects,Full_No,Hybrid,GCP,Prev_Mid,Lead_Yes,Hack_No,ATT_Neutral,STATUS_Hired
2,20-30,10+ years,Bachelor's,Code_Medium,Interview_Medium,Few Projects,Full_Yes,On-site,AWS,Prev_Lead,Lead_Yes,Hack_Yes,ATT_Positive,STATUS_Hired
3,20-30,10+ years,No Degree,Code_Medium,Interview_High,Few Projects,Full_No,On-site,Azure,Prev_Mid,Lead_Yes,Hack_Yes,ATT_Positive,STATUS_Hired
4,41-50,10+ years,PhD,Code_Medium,Interview_Low,Few Projects,Full_Yes,Hybrid,Azure,Prev_Mid,Lead_No,Hack_No,ATT_Positive,STATUS_Hired
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,41-50,10+ years,No Degree,Code_High,Interview_Medium,Few Projects,Full_Yes,Remote,GCP,Prev_Junior,Lead_Yes,Hack_Yes,ATT_Positive,STATUS_Not_Hired
996,41-50,3-5 years,PhD,Code_High,Interview_Medium,Few Projects,Full_No,On-site,GCP,Prev_Junior,Lead_Yes,Hack_No,ATT_Negative,STATUS_Not_Hired
997,31-40,0-2 years,PhD,Code_High,Interview_Medium,Many Projects,Full_No,On-site,AWS,Prev_Lead,Lead_Yes,Hack_Yes,ATT_Neutral,STATUS_Not_Hired
998,41-50,0-2 years,Master's,Code_Low,Interview_Medium,Few Projects,Full_Yes,Remote,No Certifications,Prev_Junior,Lead_Yes,Hack_Yes,ATT_Neutral,STATUS_Not_Hired


In [3]:
# Data Preprocessing
# FP-Growth works on a one-hot (boolean) item matrix, so each row of `df` is
# first flattened into a "transaction": the list of category labels it holds.
# TransactionEncoder then does the one-hot step (similar in spirit to get_dummies).

from mlxtend.preprocessing import TransactionEncoder

# The status labels are the outcome: they select which rows to mine, and are
# removed afterwards so they do not show up as items in the mined itemsets.
STATUS_COLUMNS = ['STATUS_Hired', 'STATUS_Not_Hired']

# Group to analyse: 'STATUS_Hired' or 'STATUS_Not_Hired'.
TARGET_STATUS = 'STATUS_Hired'


def select_status_group(encoded, status):
    """Return the one-hot rows flagged with `status`, without the status columns.

    Parameters
    ----------
    encoded : pandas.DataFrame
        Boolean item matrix containing every column listed in STATUS_COLUMNS.
    status : str
        Name of the status column whose True rows are kept,
        e.g. 'STATUS_Hired' or 'STATUS_Not_Hired'.

    Returns
    -------
    pandas.DataFrame
        The rows where `encoded[status]` is True, with all STATUS_COLUMNS dropped.

    Raises
    ------
    KeyError
        If `status` or any of STATUS_COLUMNS is not a column of `encoded`.
    """
    is_in_group = encoded[status].astype(bool)
    # Dropping the status columns by name replaces the earlier label slice
    # `.loc[:, :'Remote']`, which only worked because the encoder's columns are
    # in alphabetical order and 'Remote' happened to be the last non-status one.
    return encoded.loc[is_in_group].drop(columns = STATUS_COLUMNS)


# Consolidate each transaction into a single list of items, removing NaN values
transactions = df.apply(lambda row: row.dropna().tolist(), axis = 1).tolist()

# Learn the item vocabulary and build the boolean item matrix
encoder = TransactionEncoder()
transaction_matrix = encoder.fit_transform(transactions)

# Convert to DataFrame (one column per distinct item label)
transaction_df = pd.DataFrame(transaction_matrix, columns = encoder.columns_)

# Keep only the candidates of the chosen status group
status_employees = select_status_group(transaction_df, TARGET_STATUS)
status_employees

Unnamed: 0,0-2 years,10+ years,20-30,3-5 years,31-40,41-50,6-10 years,ATT_Negative,ATT_Neutral,ATT_Positive,...,No Certifications,No Degree,No Projects,On-site,PhD,Prev_Junior,Prev_Lead,Prev_Mid,Prev_Senior,Remote
0,False,False,False,True,False,True,False,False,False,True,...,False,True,False,True,False,False,True,False,False,False
1,False,False,True,True,False,False,False,False,True,False,...,False,False,True,False,False,False,False,True,False,False
2,False,True,True,False,False,False,False,False,False,True,...,False,False,False,True,False,False,True,False,False,False
3,False,True,True,False,False,False,False,False,False,True,...,False,True,False,True,False,False,False,True,False,False
4,False,True,False,False,False,True,False,False,False,True,...,False,False,False,False,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,False,False,True,True,False,False,False,False,False,True,...,False,False,False,False,False,False,False,True,False,True
496,False,False,False,True,False,True,False,False,False,True,...,True,False,False,True,False,True,False,False,False,False
497,False,False,False,True,False,True,False,False,False,True,...,False,False,False,False,True,False,True,False,False,False
498,True,False,False,False,False,True,False,False,False,True,...,True,False,False,False,True,False,True,False,False,True


In [4]:
# Applying the FPGrowth Algorithm
# The one-hot matrix prepared above is the input for frequent-itemset mining.

from mlxtend.frequent_patterns import fpgrowth, association_rules

# min_support  - minimum support threshold; only itemsets whose support is
#                greater than or equal to it are returned.
# use_colnames - True makes the output show item names instead of column indices.
frequent_itemsets = fpgrowth(
    status_employees,
    min_support = 0.3,
    use_colnames = True,
)

In [5]:
# View Frequent Itemsets
# Leaving the DataFrame as the cell's last expression renders it as a rich
# table instead of the plain-text dump produced by print().

frequent_itemsets

    support                          itemsets
0     0.746                    (ATT_Positive)
1     0.532                         (Full_No)
2     0.506                        (Lead_Yes)
3     0.490                         (Hack_No)
4     0.424                   (Many Projects)
5     0.416                     (Code_Medium)
6     0.392                (Interview_Medium)
7     0.364                           (41-50)
8     0.346                         (On-site)
9     0.326                          (Hybrid)
10    0.326                           (20-30)
11    0.510                        (Hack_Yes)
12    0.468                        (Full_Yes)
13    0.468                    (Few Projects)
14    0.352                       (10+ years)
15    0.402                  (Interview_High)
16    0.494                         (Lead_No)
17    0.400                       (Code_High)
18    0.310                           (31-40)
19    0.328                          (Remote)
20    0.406           (ATT_Positiv

  and should_run_async(code)


In [6]:
# Generate Association Rules
# Rules are filtered on confidence (threshold 0.3); num_itemsets is given the
# row count of the matrix the itemsets were mined from.

rules = association_rules(
    frequent_itemsets,
    num_itemsets = len(status_employees),
    metric = "confidence",
    min_threshold = 0.3,
)

# Show only the columns up to and including 'lift'
rules.loc[:, :'lift']

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
0,(ATT_Positive),(Full_No),0.746,0.532,0.406,0.544236,1.023
1,(Full_No),(ATT_Positive),0.532,0.746,0.406,0.763158,1.023
2,(ATT_Positive),(Lead_Yes),0.746,0.506,0.348,0.466488,0.921913
3,(Lead_Yes),(ATT_Positive),0.506,0.746,0.348,0.687747,0.921913
4,(Hack_No),(ATT_Positive),0.49,0.746,0.364,0.742857,0.995787
5,(ATT_Positive),(Hack_No),0.746,0.49,0.364,0.487936,0.995787
6,(Many Projects),(ATT_Positive),0.424,0.746,0.34,0.801887,1.074915
7,(ATT_Positive),(Many Projects),0.746,0.424,0.34,0.455764,1.074915
8,(Interview_Medium),(ATT_Positive),0.392,0.746,0.312,0.795918,1.066915
9,(ATT_Positive),(Interview_Medium),0.746,0.392,0.312,0.418231,1.066915


### Hired


Rule 0: (ATT_Positive) → (Full_No)

Support: 40.6% of transactions contain both ATT_Positive and Full_No.
Confidence: 54.42% of cases with ATT_Positive lead to Full_No.
Lift: 1.023 indicates a very slight positive association between ATT_Positive and Full_No.
Rule 2: (ATT_Positive) → (Lead_Yes)

Support: 34.8% contain both ATT_Positive and Lead_Yes.
Confidence: 46.65% of ATT_Positive cases lead to Lead_Yes.
Lift: 0.922 suggests a slightly negative association, meaning ATT_Positive slightly decreases the likelihood of Lead_Yes.
Rule 5: (ATT_Positive) → (Hack_No)

Support: 36.4% contain both ATT_Positive and Hack_No.
Confidence: 48.79%, and Lift of 0.996 suggests a nearly neutral relationship.
Rule 7: (ATT_Positive) → (Many Projects)

Support: 34% contain both.
Confidence: 45.58%, with Lift 1.075 suggesting a moderate positive association.
Rule 8: (Interview_Medium) → (ATT_Positive)

Support: 31.2% of transactions involve both Interview_Medium and ATT_Positive.
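Confidence: 79.59% of Interview_Medium cases also have ATT_Positive. Because 74.6% of all transactions already contain ATT_Positive, this is only a modest gain over the baseline rather than a strong predictor.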
Lift: 1.067 shows a moderately positive relationship.
Rule 12: (Hack_Yes) → (Full_No)

Support: 32%.
Confidence: 62.75% of cases with Hack_Yes lead to Full_No.
Lift: 1.179 suggests a stronger positive association between Hack_Yes and Full_No.
Rule 18: (ATT_Positive) → (Lead_No)

Support: 39.8%.
Confidence: 53.35%, and Lift of 1.08 indicates a moderate positive association.


In [7]:
# Conclusion

# The patterns indicate that a positive attitude is not solely dependent on technical qualifications.
# Both strong performers and average programmers can receive favorable evaluations if they demonstrate
# desirable traits like practical project experience or a balanced skill set without full-stack knowledge.
# The findings also emphasize that participation in hackathons, coding scores, and attitude evaluations
# are positively linked, suggesting that a focus on personal projects and coding ability may predict a
# positive attitude, making candidates attractive to top companies even with average technical qualifications.

  and should_run_async(code)


In [8]:
# Conclusion

# Leadership and Technical Skill Misalignment: Candidates with leadership experience may
# need to excel in specific technical evaluations or projects to compensate for perceived gaps in other areas.

# Insufficient Project-Based Experience: High coding skills without a breadth of project
# experience seem less compelling to hiring managers, suggesting the importance of applied,
# project-based evidence of skills.

# Lack of Key Skills and Experience: Candidates lacking both full-stack knowledge and leadership experience
# are at a disadvantage, even with other positive attributes like hackathon participation.

  and should_run_async(code)
