# Correlation Analysis

In this notebook we will attempt to analyze the correlations between different clusters in the posts.

We first load all necessary libraries.

In [1]:
#Import graphing utilities
%matplotlib inline
import matplotlib.pyplot as plt

# Import useful mathematical libraries
import numpy as np
import pandas as pd

# Import useful Machine learning libraries
import gensim
from sklearn.cluster import KMeans

# Import utility files
from utils import save_object, load_object, make_post_clusters, make_clustering_objects

from orangecontrib.associate.fpgrowth import *

The minimum supported version is 2.4.6



In [2]:
# Set the model we are going to be analyzing.
# This name is the common stem of every artifact the notebook loads (saved objects,
# matrices, the Word2Vec model, the word-cluster model) and of the CSV it writes.
model_name = "ADHD1"

### Make Correlation Matrix

In [3]:
# Number of word clusters; it is appended to the filename of the saved
# word-cluster model that gets loaded ('...-words-cluster_model-<num_word_clusters>').
num_word_clusters = 100
# Threshold for counting a correlation: a pair of clusters is reported only when
# its correlation is strictly greater than this value.
correlation_threshold = 0.65

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Load the saved '-df' object for this model
# (presumably the posts DataFrame -- not referenced again in the cells shown here).
df = load_object('objects/', model_name + '-df')

# Load our saved matrices
PostsByWords = load_object('matricies/', model_name + "-PostsByWords")
WordsByFeatures = load_object('matricies/', model_name + "-WordsByFeatures")

# Posts-by-features = (posts-by-words) x (words-by-features), wrapped as an np.matrix
PostsByFeatures = np.matrix(PostsByWords.dot(WordsByFeatures))

# Load the Word2Vec model and take its vocabulary in sorted order
model = gensim.models.Word2Vec.load('models/' + model_name + '.model')
vocab_list = sorted(model.wv.vocab)

# Load the saved word-cluster model and build the per-cluster objects from it
kmeans = load_object('clusters/', model_name + '-words-cluster_model-' + str(num_word_clusters))
clusters = make_clustering_objects(model, kmeans, vocab_list, WordsByFeatures)

# For every cluster keep just the first element of each word_list entry (the word itself)
clusterWords = [[entry[0] for entry in members["word_list"]] for members in clusters]

# Each "document" handed to the vectorizer is already a list of tokens,
# so the analyzer simply passes the tokens through unchanged.
countvec = CountVectorizer(vocabulary=vocab_list, analyzer=lambda tokens: list(tokens), min_df=0)

# Clusters-by-words count matrix, then its transpose (words-by-clusters)
ClustersByWords = countvec.fit_transform(clusterWords)
WordsByCluster = ClustersByWords.transpose()

# (posts-by-words) x (words-by-clusters) gives posts-by-clusters
PostsByClusters = PostsByWords.dot(WordsByCluster)

In [5]:
# Convert the posts-by-clusters matrix to a dense, plain NumPy array
X = np.array(PostsByClusters.todense())

In [6]:
# Wrap the dense array in a DataFrame (default integer row/column labels);
# columns follow the column order of PostsByClusters, i.e. one column per cluster.
cluster_df = pd.DataFrame(data = X)

In [7]:
# Pairwise correlation between the DataFrame's columns, extracted as a raw 2-D array
correlations = cluster_df.corr().values

In [8]:
# Sort all the words in the words list
for cluster in clusters:
    cluster["word_list"].sort(key = lambda x:x[1], reverse = True)

In [9]:
# Collect every pair (row < col, i.e. strictly above the diagonal) whose correlation
# is strictly greater than the threshold. Each record is
# [row index, col index, correlation, first five word_list entries of row, first five of col].
correlations_list = [
    [
        row,
        col,
        correlations[row][col],
        clusters[row]["word_list"][:5],
        clusters[col]["word_list"][:5],
    ]
    for row in range(len(correlations))
    for col in range(row + 1, len(correlations[0]))
    if correlations[row][col] > correlation_threshold
]

In [10]:
# Number of cluster pairs whose correlation exceeded the threshold
len(correlations_list)

129

In [11]:
# Inspect the selected pairs. Each entry is
# [cluster i, cluster j, correlation, first five word_list entries of i, first five of j].
correlations_list

[[3,
  6,
  0.79548957519921693,
  [('i', 535156),
   ('and', 310199),
   ('a', 242372),
   ('it', 148143),
   ('that', 128064)],
  [('do', 61744),
   ('make', 12200),
   ('find', 11436),
   ('keep', 7870),
   ('able', 5035)]],
 [3,
  16,
  0.68914223546037157,
  [('i', 535156),
   ('and', 310199),
   ('a', 242372),
   ('it', 148143),
   ('that', 128064)],
  [('been', 24612),
   ('since', 11348),
   ('ive_been', 10559),
   ('years', 8671),
   ('year', 5689)]],
 [3,
  18,
  0.79678357756738349,
  [('i', 535156),
   ('and', 310199),
   ('a', 242372),
   ('it', 148143),
   ('that', 128064)],
  [('some', 24841),
   ('other', 13101),
   ('lot', 12478),
   ('these', 9147),
   ('most', 8148)]],
 [3,
  31,
  0.8794464230583906,
  [('i', 535156),
   ('and', 310199),
   ('a', 242372),
   ('it', 148143),
   ('that', 128064)],
  [('my', 203658),
   ('was', 80310),
   ('had', 32794),
   ('ive', 22719),
   ('after', 19616)]],
 [3,
  36,
  0.79445868548214038,
  [('i', 535156),
   ('and', 310199),
  

In [12]:
import os

# Make sure the output directory for the correlation CSV exists.
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking os.path.exists() and then calling os.makedirs().
directories = ['correlation-analysis']
for dirname in directories:
    os.makedirs(dirname, exist_ok=True)

In [13]:
import csv

# Column headings for the export. Each row of correlations_list is
# [cluster i, cluster j, correlation, first five word_list entries of i, first five of j].
heading = ["cluster 1 number", "cluster 2 number", "correlation values","cluster 1","cluster 2"]

# newline="" is what the csv module expects of the file object it writes to; without it,
# platforms that translate "\n" on write end up with a blank line after every row.
with open("correlation-analysis/"+model_name+"-correlations.csv","w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(heading)
    # Write all data rows in one call instead of building a throwaway list of return values.
    writer.writerows(correlations_list)