In [53]:
# This cell contains all the imports and needed methods

"""
DSCI-663-03 Project: Data Mining File
This file is used for our data mining tasks, currently attempting to implement apriori association rule mining

:language:      Python with pandas
:author:        Stephen Cook
:date created:  10/26/21
:last edit:     11/05/21
"""

#TODO: Implement tasks 5 and 6

import pandas as pd
import math
from itertools import combinations, permutations
import numpy as np
import matplotlib.pyplot as plt

"""
For our implementation of apriori, the dataframe needs to be converted into a list of records.
Our dataset has an additional challenge in that record responses are not binary; they are categorical.
As such, each item needs to be associated with its column in addition to being added to a list.
:param:     df: the dataframe of the file we are running
:returns:   records:    an array of arrays, with each array representing a survey response
                        (the items are the "column=value" strings inside each record; no separate items array is returned)
"""
def init_records(df):
    """
    Convert a dataframe into the list-of-records form used by apriori.

    Every cell becomes the string "<column>=<value>" so that equal answers in
    different columns stay distinct items. Missing values are not dropped here;
    they come out as "<column>=nan".

    :param:     df: the dataframe to convert
    :returns:   records:    a list with one entry per row, each a list of "column=value" strings
    """
    records = []
    # Walk the rows by position (iloc) rather than by label (loc) so the function also
    # works when the index is not 0..n-1, e.g. after filtering or dropping rows.
    for position in range(len(df)):
        raw_record = df.iloc[position]
        records.append(
            [str(column) + "=" + str(value) for column, value in raw_record.items()]
        )
    return records
"""
A general method for the implementation of the apriori algorithm used to find item sets
:param:     iteration:  the current iteration of the apriori algorithm, also known as depth (0 = single items)
:param:     records:    the list of records
:param:     min_sup:    the minimum support count (an absolute number of records, not a percentage)
:param:     large_set:  the large set (frequent item sets) produced by the previous iteration
:return:    count_set:  the set of all counts (Note this might not be needed)
:return:    large_return_set:   The newly created large set
"""
def apriori(iteration, records, min_sup, large_set):
    """
    Run one level of the apriori algorithm: count candidate item sets and keep the frequent ones.

    :param:     iteration:  zero-based level; level 0 counts single items, level k counts
                            item sets of size k + 1
    :param:     records:    list of records, each a list of items
    :param:     min_sup:    minimum support count an item set needs in order to be kept
    :param:     large_set:  dict of the previous level's frequent item sets (ignored at level 0);
                            keys are single items or tuples of items
    :return:    count_set:  dict mapping every counted candidate to its support count
    :return:    large_return_set:   dict with the candidates whose count is >= min_sup
    """
    count_set = {}

    if iteration == 0:
        # Missing answers appear as the text 'nan', either bare or as 'column=nan'
        # (the form produced by init_records); they are not real items.
        items = sorted(
            item
            for record in records
            for item in record
            if not (isinstance(item, str) and (item == 'nan' or item.endswith('=nan')))
        )
        # Single pass instead of calling list.count() once per element.
        for item in items:
            count_set[item] = count_set.get(item, 0) + 1
    else:
        # Normalise the previous level: keys are plain items after level 0 and tuples
        # afterwards. Candidates must be built from the individual items, not from the tuples.
        frequent_itemsets = set()
        frequent_items = set()
        for key in large_set:
            itemset = tuple(sorted(key)) if isinstance(key, tuple) else (key,)
            frequent_itemsets.add(itemset)
            frequent_items.update(itemset)

        # Apriori property: every subset of a frequent item set is frequent. Prune with
        # subsets of the previous level's size when that size is unambiguous.
        sizes = {len(itemset) for itemset in frequent_itemsets}
        prune_size = sizes.pop() if len(sizes) == 1 else None

        # Build the record sets once rather than once per candidate.
        record_sets = [set(record) for record in records]

        for candidate in combinations(sorted(frequent_items), iteration + 1):
            if prune_size is not None and any(
                subset not in frequent_itemsets
                for subset in combinations(candidate, prune_size)
            ):
                continue
            candidate_items = set(candidate)
            count_set[candidate] = sum(
                1 for record_set in record_sets if candidate_items <= record_set
            )

    # Keep only the candidates that reach the minimum support count.
    large_return_set = {
        key: value for key, value in count_set.items() if value >= min_sup
    }
    return count_set, large_return_set


In [49]:
# Ask for the path of the CSV file and load it into a pandas dataframe.
file_name = input("Insert File Name: ")
mental_health_data_frame = pd.read_csv(file_name)

In [50]:
# Read the support threshold as a percentage; decimals such as "2.5" are accepted.
min_support_percent = input("Input Minimum Support Percentage (0-100): ")
min_support_float = float(min_support_percent) / 100
if not 0 <= min_support_float <= 1:
    raise ValueError("Minimum support percentage must be between 0 and 100")
# Smallest record count whose share of the data reaches the threshold. Taking the ceiling
# (rather than truncating) keeps out item sets that fall just short of the percentage;
# round() first absorbs binary floating-point noise such as 10.000000000000002.
min_support_count = math.ceil(
    round(len(mental_health_data_frame) * float(min_support_percent) / 100, 9)
)

In [51]:
# Turn every dataframe row into a list of "column=value" strings for apriori.
records = init_records(mental_health_data_frame)

In [52]:
# Ask how many apriori levels to run, then feed each level's large set into the next level.
# TODO: add in other metrics like confidence and lift
iterations = int(input("How many iterations of Apriori do you wish to run:"))
large_set = {}
for iteration_count in range(iterations):
    count_set, large_set = apriori(iteration_count, records, min_support_count, large_set)
    print(large_set)
# Leave the counter at the number of levels that were run (0 when none were requested).
iteration_count = max(iterations, 0)

{'AA=I was aware of some': 321, 'AA=N/A (not currently aware)': 457, 'AAA=False': 488, 'AAA=True': 655, 'B=100-500': 247, 'B=26-100': 292, 'B=6-25': 209, 'B=More than 1000': 255, 'BB=None did': 681, 'BB=Some did': 229, 'BBB=Not applicable to me': 454, 'BBB=Rarely': 257, 'BBB=Sometimes': 283, 'C=False': 263, 'C=True': 880, 'CC=None did': 643, 'CC=Some did': 321, 'CCC=Not applicable to me': 383, 'CCC=Often': 421, 'CCC=Sometimes': 286, "DD=I don't know": 670, 'DDD=25-35': 612, 'DDD=36-45': 305, "E=I don't know": 319, 'E=No': 213, 'E=Yes': 528, "EE=I don't know": 244, 'EE=Some of them': 500, 'EE=Yes, all of them': 171, 'EEE=F': 280, 'EEE=M': 845, 'F=I am not sure': 351, 'F=No': 352, 'F=Yes': 307, 'FF=None of them': 453, 'FF=Some of them': 497, 'FFF=United States of America': 707, 'G=No': 812, 'G=Yes': 229, 'GG=No, at none of my previous employers': 359, 'GG=Some of my previous employers': 578, "H=I don't know": 318, 'H=No': 531, 'H=Yes': 294, 'HH=No, at none of my previous employers': 344,