# Association analysis

This notebook focuses on creating and analysing assocation rules found in posts.

We first load all the libraries we may use throughout the project

In [4]:
#Import graphing utilities
%matplotlib inline
import matplotlib.pyplot as plt

# Import useful mathematical libraries
import numpy as np
import pandas as pd

# Import useful Machine learning libraries
import gensim
from sklearn.cluster import KMeans

# Import utility files
from utils import save_object, load_object, make_post_clusters, make_clustering_objects

from orangecontrib.associate.fpgrowth import *

The minimum supported version is 2.4.6



### Set model name

Before begining the rest of this project, we select a name for our model. This name will be used to save and load the files for this model

In [5]:
# Set the model we are going to be analyzing
model_name = "PTSD_model"

### Prepare data

We now load and process the data we will need for the rest of this project

In [6]:
# Initialize a word clustering to use
num_word_clusters = 100

In [7]:
df = load_object('objects/', model_name + '-df')

# Load Our Saved matricies
PostsByWords = load_object('matricies/', model_name + "-PostsByWords")
WordsByFeatures = load_object('matricies/', model_name + "-WordsByFeatures")

# Generate the posts by Features matrix through matrix multiplication
PostsByFeatures = PostsByWords.dot(WordsByFeatures)
PostsByFeatures = np.matrix(PostsByFeatures)
model = gensim.models.Word2Vec.load('models/' + model_name + '.model')

vocab_list = sorted(list(model.wv.vocab))

kmeans =  load_object('clusters/', model_name + '-words-cluster_model-' + str(num_word_clusters))

clusters = make_clustering_objects(model, kmeans, vocab_list, WordsByFeatures)

clusterWords = list(map(lambda x: list(map(lambda y: y[0] , x["word_list"])), clusters))

from sklearn.feature_extraction.text import CountVectorizer
countvec = CountVectorizer(vocabulary = vocab_list, analyzer = (lambda lst:list(map((lambda s: s), lst))), min_df = 0)

# Make Clusters By Words Matrix
ClustersByWords = countvec.fit_transform(clusterWords)

# take the transpose of Clusters
WordsByCluster = ClustersByWords.transpose()

# Multiply Posts by Words by Words By cluster to get Posts By cluster
PostsByClusters = PostsByWords.dot(WordsByCluster)

In [8]:
PostsByClusters

<7057x100 sparse matrix of type '<class 'numpy.int64'>'
	with 275348 stored elements in Compressed Sparse Row format>

In [14]:
sorted_clusters = sorted(list(zip(clusters,range(len(clusters)))),key = (lambda x : x[0]['total_freq']))

large_indicies = list(map(lambda x: x[1],sorted_clusters[-20:]))

sorted_large_indicies = sorted(large_indicies, reverse =True)

X = np.array(PostsByClusters.todense())

for index in sorted_large_indicies:
    X = np.delete(X,index,1)

80

### Generate Rules

Test rule generation on a subset of the data, before moving on to the entirety of the data

In [66]:
assoc_confidence = 50
itemset_support  = 5

In [67]:
X_test = X[:700]

In [68]:
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [69]:
len(X_test)

700

In [None]:
itemsets = dict(frequent_itemsets(X_test > 0, itemset_support/
100))
assoc_rules = association_rules(itemsets, assoc_confidence/100)
rules = [(P, Q, supp, conf, conf/(itemsets[P]/X_test.shape[0]))
             for P, Q, supp, conf in assoc_rules
             if len(Q) == 1 and len(P)==1]

In [52]:
rules

[(frozenset({28}),
  frozenset({46}),
  99,
  0.6644295302013423,
  3.1214810143687224),
 (frozenset({37}),
  frozenset({15}),
  118,
  0.5870646766169154,
  2.0445038489146308),
 (frozenset({21}),
  frozenset({17}),
  142,
  0.6604651162790698,
  2.1503515413737153),
 (frozenset({17}),
  frozenset({21}),
  142,
  0.5819672131147541,
  1.6695780704111796),
 (frozenset({68}),
  frozenset({32}),
  70,
  0.5426356589147286,
  2.9445345832582177),
 (frozenset({71}),
  frozenset({30}),
  153,
  0.5907335907335908,
  1.596577272252948),
 (frozenset({30}),
  frozenset({71}),
  153,
  0.6740088105726872,
  2.0784412660831766),
 (frozenset({22}),
  frozenset({30}),
  141,
  0.5529411764705883,
  1.5178777393310268),
 (frozenset({30}),
  frozenset({22}),
  141,
  0.6211453744493393,
  1.915426264821751),
 (frozenset({21}),
  frozenset({39}),
  177,
  0.8232558139534883,
  2.6803677663601944),
 (frozenset({77}),
  frozenset({64}),
  176,
  0.6929133858267716,
  1.9096038192076383),
 (frozenset({6

# Correlation stuff

In [None]:
cluster_df = pd.DataFrame(data = X)

In [None]:
heading = list(range(num_word_clusters))

In [None]:
heading = list(range(num_word_clusters))
for index in sorted_large_indicies:
    del heading[index]

In [None]:
heading= list(map(lambda x: x+1, heading))

In [None]:
import csv
correlations = cluster_df.corr().values
with open(model_name+"-correlations_matrix.csv","w") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(heading)
    [writer.writerow(r) for r in correlations]

### Load results

We now load our results from running the association miner so we can analyze them

In [None]:
rules    = load_object('association_rules/',model_name+'-assoc_rules-'+str(itemset_support)+
                       '-'+str(assoc_confidence)+'-'+str(num_word_clusters))
itemsets = load_object('itemsets/',model_name+'-itemset-'+str(itemset_support)+'-'+str(num_word_clusters))

### Analyze results

after loading our results we analyze them

In [53]:
len(rules)

2299

In [54]:
len(itemsets)

1793169

In [55]:
len(rules)/len(itemsets)

0.0012820877452153143

In [56]:
rule_clusters =[]
for i in range(num_word_clusters):
    for lhs, rhs, support, confidence,lift in rules:
        if (i in lhs) or (i in rhs): 
            rule_clusters.append(i)
            break

In [57]:
len(rule_clusters)

69

In [58]:
rules.sort(key = lambda x : x[4],reverse = True)

In [59]:
filtered_rules = list(filter(lambda x: len(x[0])==1 and len(x[1])==1,rules ))

In [60]:
# load the models
model = gensim.models.Word2Vec.load('models/' + model_name + '.model')
kmeans = load_object('clusters/', model_name + "-words-cluster_model-" + str(num_word_clusters))
WordsByFeatures = load_object('matricies/', model_name + '-' + 'WordsByFeatures')

In [61]:
vocab_list = sorted(list(model.wv.vocab))

In [62]:
clusters = make_clustering_objects(model, kmeans, vocab_list, WordsByFeatures)

In [63]:
# Sort all the words in the words list
for cluster in clusters:
    cluster["word_list"].sort(key = lambda x:x[1], reverse = True)

In [64]:
len(filtered_rules)

2299

In [65]:
import csv
top_num = min(10000,len(filtered_rules))
header = ["lhs","rhs","support","confidence","lift"]
with open('association-analysis/'+"test-"+ model_name + "-filtered-lift-supp"+str(itemset_support) +
          "-conf-"+str(assoc_confidence)+'-'+ str(top_num) + '.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    for i in range(top_num):
        rule = filtered_rules[i]
        lhs_top = clusters[next(iter(rule[0]))]["word_list"][:5]
        rhs_top = clusters[next(iter(rule[1]))]["word_list"][:5]
        writer.writerow([lhs_top,rhs_top ,rule[2],rule[3],rule[4]])