In [1]:
# This cell contains all the imports and needed methods

"""
DSCI-663-03 Project: Data Mining File
This file is used for our data mining tasks, currently attempting to implement apriori association rule mining

:language:      Python with pandas
:author:        Stephen Cook
:date created:  10/26/21
:last edit:     11/27/21
"""

#TODO: Implement tasks 5 and 6

import pandas as pd
import math
from itertools import combinations, permutations
import numpy as np
import matplotlib.pyplot as plt

def init_records(df):
    """
    Convert a dataframe of categorical survey responses into apriori transactions.

    Responses are categorical rather than binary, so a bare value is ambiguous
    between columns. Every cell therefore becomes one item string of the form
    "<column>=<value>". Values go through str(), which means a missing value read
    as NaN shows up as "<column>=nan".

    :param:     df:         the dataframe of the file we are running
    :returns:   records:    a list of lists, one inner list per dataframe row, holding
                            that row's "<column>=<value>" strings in column order
    """
    records = []
    # Rows are fetched by position (iloc), not by label (loc): the loop counts
    # positions 0..n-1, and label lookup fails or returns the wrong rows as soon as
    # the index is not the default 0..n-1 (filtered, re-indexed or duplicated labels).
    for position in range(len(df)):
        row = df.iloc[position]
        records.append([str(column) + "=" + str(value) for column, value in row.items()])
    return records

def apriori(iteration, records, min_sup, large_set):
    """
    Run one level of the apriori algorithm and return the counted item sets.

    Level 0 counts single items; level k (k >= 1) counts item sets of size k + 1
    built from the items that survived the previous level.

    :param:     iteration:  the current iteration of the apriori algorithm, also known as depth
    :param:     records:    the list of records, each a list of "<column>=<value>" item strings
    :param:     min_sup:    the minimum support count an item set needs to be kept
    :param:     large_set:  the large set returned by the previous level (ignored at level 0)
    :return:    count_set:          every candidate of this level mapped to its count
    :return:    large_return_set:   the candidates whose count is at least min_sup
    """
    count_set = {}  # every candidate of this level -> number of supporting records

    # if first iteration
    if iteration == 0:
        # Missing answers arrive as "<column>=nan" (str() of a NaN cell), so the
        # bare 'nan' comparison alone never filters anything; drop both spellings.
        items = sorted(
            item
            for record in records
            for item in record
            if not (isinstance(item, str) and (item == 'nan' or item.endswith('=nan')))
        )
        # Single pass tally; iterating the sorted list keeps the keys in sorted order.
        for item in items:
            count_set[item] = count_set.get(item, 0) + 1

    # all other iterations
    else:
        # Keys of the previous level are plain items after level 0 and tuples of
        # items afterwards. Candidates must be combinations of *items*, so tuple
        # keys are flattened first; combining the tuples themselves would produce
        # tuples-of-tuples that can never be a subset of a record.
        frequent_items = set()
        for key in large_set:
            if isinstance(key, tuple):
                frequent_items.update(key)
            else:
                frequent_items.add(key)

        # Build each record's set once instead of once per candidate.
        record_sets = [set(record) for record in records]

        for candidate in combinations(sorted(frequent_items), iteration + 1):
            members = set(candidate)
            count_set[candidate] = sum(1 for record_set in record_sets if members <= record_set)

    # keep only the candidates that reach the minimum support count
    large_return_set = {key: count for key, count in count_set.items() if count >= min_sup}

    return count_set, large_return_set


def print_rules(final_set):
    """
    Print every mined rule followed by its metrics.

    Each rule takes three lines: the rule text, then tab-indented support and
    confidence values.

    :param:     final_set:  The final set of rules mined, mapping the rule text to a
                            dict with 'support' and 'confidence' entries
    """
    for rule, metrics in final_set.items():
        report_lines = [
            rule,
            "Support: " + str(metrics['support']),
            "Confidence: " + str(metrics['confidence']),
        ]
        print("\n\t".join(report_lines))

def mine_association_rules(rule_set, confidence, total_transactions):
    """
    Performs the association rule mining over all item sets found by apriori.

    For every item set of two or more items, each way of splitting it into a
    non-empty antecedent A and a non-empty consequent B is scored:

        support(A -> B)    = count(A and B) / total_transactions
        confidence(A -> B) = count(A and B) / count(A)

    Lift is not computed.

    :param:     rule_set                All the item sets found by apriori which are above a
                                        support level, mapped to their counts; single items
                                        are plain keys, larger sets are tuples
    :param:     confidence              the minimum confidence, as a fraction
    :param:     total_transactions      total number of records
    :return:    final_set               dict mapping "A -> B" to
                                        {'support': ..., 'confidence': ...}
    """
    final_set = {}          # the final set of rules

    for rule_tuple, rule_count in rule_set.items():
        # a non-tuple key is a single item and cannot be split into a rule
        if not isinstance(rule_tuple, tuple):
            continue

        # Support = transactions containing both A and B / total transactions
        rule_support = float(rule_count) / float(total_transactions)

        positions = range(len(rule_tuple))
        # Every proper non-empty subset is tried as the antecedent. Contiguous
        # slices alone would skip splits such as (a, c) -> b for a three-item set.
        for antecedent_size in range(1, len(rule_tuple)):
            for chosen in combinations(positions, antecedent_size):
                A = tuple(rule_tuple[i] for i in chosen)
                B = tuple(rule_tuple[i] for i in positions if i not in chosen)

                # single items are stored under the bare item, not a 1-tuple
                antecedent_count = rule_set.get(A[0] if len(A) == 1 else A)
                # Skip antecedents that were never counted or have a zero count:
                # confidence is undefined there and would otherwise raise.
                if not antecedent_count:
                    continue

                # Confidence(A -> B) = count(A and B) / count(A)
                rule_confidence = float(rule_count) / float(antecedent_count)
                if rule_confidence >= confidence:
                    rule_str = str(A if len(A) != 1 else A[0]) + ' -> ' + str(B if len(B) != 1 else B[0])
                    final_set[rule_str] = {'support': rule_support, 'confidence': rule_confidence}

    return final_set

In [2]:
# Ask for the CSV to analyse and load it into a dataframe
file_name = input("Insert File Name: ")
mental_health_data_frame = pd.read_csv(file_name)

# One dataframe row is one transaction, so the row count is the transaction total
total_transactions = len(mental_health_data_frame)

In [3]:
min_support_percent = input("Input Minimum Support Percentage (0-100): ")

# Parse with float() so fractional percentages such as "2.5" are accepted
min_support_float = float(min_support_percent) / 100
if not 0 <= min_support_float <= 1:
    raise ValueError("Minimum support percentage must be between 0 and 100")

# Round the threshold UP: an item set meets the requested support only when its
# count is at least ceil(rows * fraction). Truncating would let sets below the
# requested percentage through. The inner round() removes float noise
# (e.g. 7.000000000000001) so that an exact threshold is not bumped to the next integer.
min_support_count = math.ceil(round(len(mental_health_data_frame) * min_support_float, 9))

In [4]:
# Turn every dataframe row into a list of "column=value" item strings for apriori
records = init_records(mental_health_data_frame)

In [5]:
# Ask how many apriori levels to run; level k counts item sets of size k + 1
# TODO: add in other metrics like confidence and lift
iterations = int(input("How many iterations of Apriori do you wish to run (HIGHER VALUES REQUIRE MORE TIME AND RESOURCES LEVEL 1 IS RECOMMENDED):"))

rule_set = {}       # every item set that met the support count, across all levels
large_set = {}      # item sets kept by the most recent level, fed into the next one

iteration_count = 0     # number of levels completed so far
for depth in range(iterations):
    count_set, large_set = apriori(depth, records, min_support_count, large_set)
    rule_set.update(large_set)
    iteration_count = depth + 1

In [6]:
# The minimum confidence is entered as a percentage and used as a fraction
confidence = float(input("Input the Minimum Confidence Percentage (0-100): ")) / 100

# Derive the rules from the item sets collected by apriori
final_set = mine_association_rules(
    rule_set=rule_set,
    confidence=confidence,
    total_transactions=total_transactions,
)

In [8]:
# Print each mined rule followed by its support and confidence
print_rules(final_set)