In [1]:
import pandas as pd
import time
import numpy as np
from itertools import combinations
import random

In [2]:
df = pd.read_excel('adult_census.xlsx')
'''It appears that 'fnlwgt' column is something similar to an ID, hence we can drop it.
   Moreover, I feel that the 'education-num' column too can be discarded.
'''
df.drop(['fnlwgt', 'education-num'], axis=1, inplace=True)
df.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
''' The data set has many missing values represented by '?' this might
    create erroneous patterns when we find association rules. Hence, let
    us drop all the rows containing ' ?'
'''
df = df.replace(' ?', np.NaN)
df.dropna(axis = 0, inplace = True)
df.reset_index(drop= True, inplace= True)
df.head(50)

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [5]:
'''To apply Apriori algorithm we have to transform our cross-sectional data into a transactional one.
   In order to achieve this we can treat each tuple or row as a transaction but this mandates that all
   the columns must have cateogorical data for we are interested in generating association rules from a
   limited pool of items appearing in the transaction. To achieve this I have grouped continuous data into
   discrete bins.
'''

df['age'] = pd.cut(df['age'], bins = [0, 25, 50, 100],
                   labels = ['Young', 'Middle_age', 'Old'],
                   right = True, include_lowest = True)

df['hours-per-week'] = pd.cut(df['hours-per-week'], [0, 30, 50, 100],
                              labels = ['Part_time', 'Full_time', 'Extra_time'],
                              right = True, include_lowest = True)

df['capital-gain'] = pd.cut(df['capital-gain'], [0, 1, 50000, 100000],
                            labels = ['No_Gain', 'Medium_Gain', 'High_Gain'],
                            right = True, include_lowest = True)

df['capital-loss'] = pd.cut(df['capital-loss'], [0, 1, 50000, 100000],
                            labels = ['No_Loss', 'Medium_loss', 'High_Loss'],
                            right = True, include_lowest = True)
df.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,Middle_age,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,Medium_Gain,No_Loss,Full_time,United-States,<=50K
1,Middle_age,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,No_Gain,No_Loss,Part_time,United-States,<=50K
2,Middle_age,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,No_Gain,No_Loss,Full_time,United-States,<=50K
3,Old,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,No_Gain,No_Loss,Full_time,United-States,<=50K
4,Middle_age,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,No_Gain,No_Loss,Full_time,Cuba,<=50K


In [6]:
def get_candidates_1(datafrm):

    temp_candidates = {}

    for index, row in datafrm.iterrows():
        # Converting each row of adult census data into a transaction
        transaction = list(set(row))
        ''' Each element in a transaction is an item, the count of each item is maintained
            as key-value pair in a dictionary
        '''
        for item in transaction:
            if (item in temp_candidates.keys()):
                temp_candidates[item] += 1
            else:
                temp_candidates[item] = 1
    return temp_candidates

In [7]:
def generate_L1(d, c, minSupport):

    for key,value in c.items():
        if (value >= minSupport):
            d[key] = value
    return d

In [8]:
def join(frequentItemsets, k):
    cand_sets = []
    total_items = list(frequentItemsets.keys())
    singleUniqueElements = set()
    if (k>2):
        for item in set(total_items):
            singleUniqueElements = singleUniqueElements.union(item)
    else:
        singleUniqueElements = set(total_items)

    cand_sets = list(combinations(singleUniqueElements, k))

    return cand_sets

In [9]:
def has_infrequent_subset(item, prevFreqSet, k):
    
    candSubsets = set(combinations(item,(k-1)))
    
    if (not(candSubsets.issubset(prevFreqSet))):
        return True

    return False

def compute_support_count(candidateItemsets,dataFrm):
    
    candidateSets = {key:0 for key in candidateItemsets}
    
    for key,value in candidateSets.items():
        for index,row in dataFrm.iterrows():
            if(set(key).issubset(set(row))):
                candidateSets[key] += 1
                
    return candidateSets

def prune_itemsets(k, candidateItemsets, previousFrequentItemsets, minSupport, dataFrm):
    
    candidates ={}
    prevFreqSet = set(previousFrequentItemsets)

    for item in candidateItemsets:
        if(has_infrequent_subset(item, prevFreqSet, k)):
            candidateItemsets.remove(item)

    candidates = compute_support_count(candidateItemsets,dataFrm)
    print("Candidates after pruning:\n", candidates)
    
    return candidates

In [10]:
def generate_lk(candidateItemsets, minSupport):
    
    frequentItemsets = {}
    
    for item,count in candidateItemsets.items():
        if (count >= minSupport):
            frequentItemsets[item] = count

    return frequentItemsets

In [11]:
def perform_apriori(dataFrm, minSupportPercentage):

    minSupport = round((minSupportPercentage / 100.0) * len(dataFrm.index))
    print ('Minimum Support Count = ', minSupport)
    
    # Generating the 1st set of candidates C1
    candidateItemsets = get_candidates_1(dataFrm)
    
    # Generating 1st set of frequent itemsets L1
    frequentItemsets = {}
    frequentItemsets = generate_L1(frequentItemsets, candidateItemsets, minSupport)

    #totalFrequentItemsets = len(frequentItemsets)
    iteration = 1
    
    finalFrequentItemsets = {}
    finalFrequentItemsets.update(frequentItemsets)

    print('Set of frequent itemsets L1 : \n')
    for key,value in frequentItemsets.items():
        print(key, ' : ', value)
    
    while True:
        iteration += 1
        candidateItemsets = join(frequentItemsets, iteration)
        candidateItemsets = prune_itemsets(iteration, candidateItemsets,
                                          frequentItemsets, minSupport, dataFrm)
        frequentItemsets = generate_lk(candidateItemsets, minSupport)
        finalFrequentItemsets.update(frequentItemsets)
        
        if (bool(frequentItemsets)):
            print('Set of frequent itemsets L', iteration, ': \n')
            for key,value in frequentItemsets.items():
                print(key, ' : ', value, '\n')


        if len(frequentItemsets) == 0:
#             print('All frequent patterns: \n')
#             for key,value in finalFrequentItemsets.items():
#                 print(key, ' : ', value)
            break
    return

In [18]:
startTime = time.time()
perform_apriori(df 60)
totalRunTime = time.time() - startTime
print ('\n Total Execution Time: ', totalRunTime)

Minimum Support Count =  3
Set of frequent itemsets L1 : 

E  :  4
K  :  5
Y  :  3
M  :  3
O  :  3
Candidates after pruning:
 {('E', 'Y'): 2, ('E', 'O'): 3, ('K', 'M'): 3, ('Y', 'M'): 2, ('M', 'O'): 1}
Set of frequent itemsets L 2 : 

('E', 'O')  :  3 

('K', 'M')  :  3 

Candidates after pruning:
 {('O', 'K', 'E'): 3, ('K', 'M', 'E'): 2}
Set of frequent itemsets L 3 : 

('O', 'K', 'E')  :  3 

Candidates after pruning:
 {}

 Total Execution Time:  0.003990650177001953


In [14]:
#Improved Apriori
def sampled_apriori(df, minSup, samplingFactor):
    
    #Perform a random sample of the dataframe
    df = df.sample(frac = samplingFactor).reset_index(drop = True)
    
    # We lower the minimum support in hope of capturing all global frequent patterns
    minSup = minSup*0.9
    
    startTime = time.time()
    perform_apriori(df, minSup)
    totalRunTime = time.time() - startTime
    print ('\n\n Total Execution Time: ', totalRunTime)

In [15]:
sampled_apriori(df,70,0.5)

Minimum Support Count =  9501
Set of frequent itemsets L1 : 

No_Gain  :  13830
 Private  :  11148
 United-States  :  13749
 <=50K  :  11320
 Male  :  10127
No_Loss  :  14351
Full_time  :  11118
 White  :  12983
Candidates after pruning:
 {('No_Gain', ' Private'): 10299, ('No_Gain', ' Male'): 9142, ('No_Gain', 'Full_time'): 10185, (' United-States', ' Private'): 10064, (' United-States', ' Male'): 9233, (' United-States', 'Full_time'): 10086, (' Private', ' <=50K'): 8671, (' Private', 'No_Loss'): 10647, (' Private', ' White'): 9610, (' <=50K', 'No_Loss'): 10957, (' <=50K', ' White'): 9587, (' Male', 'Full_time'): 7601, ('No_Loss', 'Full_time'): 10556, ('Full_time', ' White'): 9493}
Set of frequent itemsets L 2 : 

('No_Gain', ' Private')  :  10299 

('No_Gain', 'Full_time')  :  10185 

(' United-States', ' Private')  :  10064 

(' United-States', 'Full_time')  :  10086 

(' Private', 'No_Loss')  :  10647 

(' Private', ' White')  :  9610 

(' <=50K', 'No_Loss')  :  10957 

(' <=50K', '

In [16]:
'''
################TESTING BLOCK##############
'''
dic_2 = { 1: ['M', 'O', 'N', 'K', 'E', 'Y'],
        2: ['D', 'O', 'N', 'K', 'E', 'Y'],
        3: ['M', 'A', 'K', 'E', '#', '$'],
        4: ['M', 'U', 'C', 'K', 'Y', '%'],
        5: ['C', 'O', 'O', 'K', 'I', 'E']
}
data_test = pd.DataFrame(dic_2)
data_test= data_test.T
data_test.reset_index(drop = True)
'''
###########################################
'''

'\n###########################################\n'