## Implementing Apriori Algorithm for Census Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as mlt
from apyori import apriori

In [2]:
census_data = pd.read_csv("D:\Data Science\Internship\Assignment\Problem\Problem1\census.csv", header = None)
num_records = len(census_data)
print(num_records)

30162


We can transform it into the right format i.e. a DF

In [3]:
records = []
for i in range(0, num_records):
    records.append([str(census_data.values[i,j]) for j in range(0, 12)])

In [4]:
records[0]

['age=Middle-aged',
 'sex=Male',
 'education=Bachelors',
 'native-country=United-States',
 'race=White',
 'marital-status=Never-married',
 'workclass=State-gov',
 'occupation=Adm-clerical',
 'hours-per-week=Full-time',
 'income=Small',
 'capital-gain=Low',
 'capital-loss=None']

In [5]:
association_rules = apriori(records, min_support = 0.3, min_confidence = 0.2, min_lift = 1, min_length = 2, max_length = 2)
association_results = list(association_rules)

In [6]:
print(len(association_results))

37


In [7]:
print(association_results[36])

RelationRecord(items=frozenset({'sex=Male', 'race=White'}), support=0.5980372654333267, ordered_statistics=[OrderedStatistic(items_base=frozenset({'race=White'}), items_add=frozenset({'sex=Male'}), confidence=0.6955616396097636, lift=1.0294175747747638), OrderedStatistic(items_base=frozenset({'sex=Male'}), items_add=frozenset({'race=White'}), confidence=0.8850834151128557, lift=1.029417574774764)])


In [8]:
results = []

for item in association_results[13:36]:
    pair = item[0]
    items = [x for x in pair]
    
    value0 = str(items[0])
    value1 = str(items[1])
    
    value2 = str(item[1]) #to convert into object
    
    value3 = str(item[2][0][2])
    value4 = str(item[2][0][3])
    
    rows = (value0, value1, value2, value3, value4)
    results.append(rows)
    
labels = ['Title 1', 'Title 2', 'Support', 'Confidence', 'Lift']

census = pd.DataFrame.from_records(results, columns = labels)

In [9]:
census

Unnamed: 0,Title 1,Title 2,Support,Confidence,Lift
0,age=Middle-aged,hours-per-week=Full-time,0.3100590146542006,0.5935893367185021,1.011287933467209
1,age=Middle-aged,sex=Male,0.3638684437371527,0.6966042526182165,1.0309606215638196
2,age=Middle-aged,workclass=Private,0.3887673231218089,0.7442716597905427,1.007301525738237
3,education=HS-grad,capital-gain=None,0.3049864067369537,0.3330075296843324,1.0207492998311825
4,hours-per-week=Full-time,capital-gain=None,0.5445925336516146,0.5946278598320301,1.0130572474160469
5,income=Small,capital-gain=None,0.7198130097473643,0.7859470026064292,1.0464259509408986
6,marital-status=Never-married,capital-gain=None,0.3083349910483389,0.3366637706342311,1.0440522979508202
7,capital-gain=None,sex=Female,0.3057158013394337,0.3338039386041123,1.0292572476157469
8,workclass=Private,capital-gain=None,0.6824149592202109,0.745112945264987,1.0084401263161866
9,education=HS-grad,capital-loss=None,0.3136396790663749,0.3292152427353402,1.0091250153844848


<b>Sorting dataframe by Confidence

In [10]:
census.sort_values('Confidence', ascending = False, inplace = True)

In [11]:
census[['Lift', 'Confidence', 'Support']] = census[['Lift', 'Confidence', 'Support']].astype(float)

In [12]:
census

Unnamed: 0,Title 1,Title 2,Support,Confidence,Lift
15,native-country=United-States,education=HS-grad,0.305318,0.935874,1.026317
20,race=White,marital-status=Married-civ-spouse,0.419601,0.899822,1.04656
21,sex=Male,marital-status=Married-civ-spouse,0.417247,0.894774,1.324248
22,native-country=United-States,race=White,0.802931,0.880526,1.024118
16,income=Small,hours-per-week=Full-time,0.469001,0.799028,1.063843
5,income=Small,capital-gain=None,0.719813,0.785947,1.046426
19,income=Small,workclass=Private,0.577216,0.768518,1.040116
11,income=Small,capital-loss=None,0.728201,0.764364,1.01769
17,hours-per-week=Full-time,workclass=Private,0.447152,0.761805,1.031032
8,workclass=Private,capital-gain=None,0.682415,0.745113,1.00844


In [13]:
census.dtypes

Title 1        object
Title 2        object
Support       float64
Confidence    float64
Lift          float64
dtype: object

<b>Resetting index

In [14]:
census.reset_index(drop = True, inplace = True)
census

Unnamed: 0,Title 1,Title 2,Support,Confidence,Lift
0,native-country=United-States,education=HS-grad,0.305318,0.935874,1.026317
1,race=White,marital-status=Married-civ-spouse,0.419601,0.899822,1.04656
2,sex=Male,marital-status=Married-civ-spouse,0.417247,0.894774,1.324248
3,native-country=United-States,race=White,0.802931,0.880526,1.024118
4,income=Small,hours-per-week=Full-time,0.469001,0.799028,1.063843
5,income=Small,capital-gain=None,0.719813,0.785947,1.046426
6,income=Small,workclass=Private,0.577216,0.768518,1.040116
7,income=Small,capital-loss=None,0.728201,0.764364,1.01769
8,hours-per-week=Full-time,workclass=Private,0.447152,0.761805,1.031032
9,workclass=Private,capital-gain=None,0.682415,0.745113,1.00844


In [37]:
def arrangingRules(rules):
    for i in rules.index:
        val = rules.loc[i,'Confidence']
        if val >= 0.70:
            print(val)

In [38]:
arrangingRules(census)

0.9358739837398374
0.8998222538215428
0.8947742623533593
0.8805264688772542
0.7990284681427926
0.7859470026064292
0.7685177010682439
0.7643640160083522
0.7618052417532761
0.745112945264987
0.7442716597905427
0.742161127544806
