# Post Cluster Analysis

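This notebook focuses on analysing the post clusters for a model. We will do this by first generating post clusters from several representations of the posts, and then summarising the scores and comment counts of each cluster.
We first import the libraries we will need throughout the project.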

In [1]:
#Import graphing utilities
%matplotlib inline
import matplotlib.pyplot as plt

# Import useful mathematical libraries
import numpy as np
import pandas as pd

# Import useful Machine learning libraries
import gensim
from sklearn.cluster import KMeans

# Import utility files
from utils import save_object,load_object, make_post_clusters, make_clustering_objects

#### Setup directories

If this is the first time doing this analysis, 
we first will set up all the directories we need
to save and load the models we will be using

In [2]:
import os

# Output folders this notebook writes into (the CSV exports below go to post-analysis/).
directories = ['post-analysis']
for dirname in directories:
    # exist_ok=True makes the cell idempotent and avoids the race between
    # checking for the folder and creating it.
    os.makedirs(dirname, exist_ok=True)

### Set model name

Before beginning the rest of this project, we select a name for our model. This name will be used to save and load the files for this model.

In [3]:
# Name of the model being analysed; it prefixes the file names that the
# following cells load from and save to.
model_name = "model6"

### Prepare data

We now load and process the data we will need for the rest of this project

In [13]:
# Fetch the stored posts table for this model
df = load_object('objects/', model_name + '-df')

# Pull the per-post score and comment-count columns out as plain lists
scores, num_comments_list = (
    list(df[column]) for column in ('score', 'num_comments')
)

In [4]:
# Fetch the two stored matrices for this model (the folder name is spelled 'matricies/' on disk)
PostsByWords, WordsByFeatures = (
    load_object('matricies/', model_name + suffix)
    for suffix in ("-PostsByWords", "-WordsByFeatures")
)

# Posts-by-features is the product of posts-by-words with words-by-features,
# wrapped as an np.matrix
PostsByFeatures = np.matrix(PostsByWords.dot(WordsByFeatures))

# Show how many rows the product has
len(PostsByFeatures)

131652

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Restore the trained Word2Vec model and take its vocabulary in sorted order
model = gensim.models.Word2Vec.load('models/'+model_name+'.model')
vocab_list = sorted(model.wv.vocab)

# Restore the previously saved word clustering with this many clusters
num_word_clusters = 100
kmeans = load_object(
    'clusters/',
    model_name+'-words-cluster_model-'+str(num_word_clusters),
)

clusters = make_clustering_objects(model, kmeans, vocab_list, WordsByFeatures)

# For every cluster keep only the first field of each "word_list" entry
clusterWords = [
    [entry[0] for entry in cluster["word_list"]]
    for cluster in clusters
]

# Each "document" handed to the vectorizer is already a sequence of tokens,
# so the analyzer only has to return it as a list.
countvec = CountVectorizer(
    vocabulary=vocab_list,
    analyzer=lambda tokens: list(tokens),
    min_df=0,
)

# Clusters-by-words count matrix over the fixed vocabulary
ClustersByWords = countvec.fit_transform(clusterWords)

# Consistency check: one column per row of WordsByFeatures
ClustersByWords.shape[1] == len(WordsByFeatures)

True

In [7]:
# Flip clusters-by-words into words-by-clusters
WordsByCluster = ClustersByWords.T

# posts-by-words times words-by-clusters gives posts-by-clusters
PostsByClusters = PostsByWords.dot(WordsByCluster)

In [20]:
# Densify the posts-by-clusters product and promote it to float so that the
# normalisation below performs true division. The hasattr guard keeps this cell
# re-runnable: on a second run the matrix is already dense and has no .todense().
if hasattr(PostsByClusters, "todense"):
    PostsByClusters = PostsByClusters.todense()
PostsByClusters = PostsByClusters * 1.0


def _min_max_normalise_rows(mat):
    """Rescale each row of `mat` with (x - row_min) / (row_max - row_min).

    Rows whose entries are all equal have a zero range; the resulting 0/0 is NaN,
    which np.nan_to_num turns into 0.

    NOTE(review): relies on `mat.min(axis=1)` keeping a column shape that
    broadcasts against `mat`, as np.matrix does — confirm before passing a plain
    ndarray.
    """
    row_min = mat.min(axis=1)
    row_max = mat.max(axis=1)
    # The 0/0 on constant rows is expected and cleaned up right away, so the
    # RuntimeWarning is suppressed for this division only.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.nan_to_num((mat - row_min) / (row_max - row_min))


PostsByFeaturesNormed = _min_max_normalise_rows(PostsByFeatures)
PostsByClustersNormed = _min_max_normalise_rows(PostsByClusters)

# Sanity check, placed after the matrices exist so the notebook survives a fresh
# top-to-bottom run: no NaN may remain in either normalised matrix.
for label, normed in (("PostsByFeaturesNormed", PostsByFeaturesNormed),
                      ("PostsByClustersNormed", PostsByClustersNormed)):
    assert not np.isnan(normed).any(), label + " still contains NaN values"

In [None]:
# Display the first row of the un-normalised posts-by-clusters matrix
PostsByClusters[0]

In [None]:
# Display the row-normalised posts-by-clusters matrix
PostsByClustersNormed

In [None]:
# Display the row-normalised posts-by-features matrix
PostsByFeaturesNormed

### Generate Post Clusters

We now will generate post clusters, and then save them in a format conducive to analysis.

In [None]:
# Number of k-means clusters fitted for every post representation
num_posts_clusters = 10

# The four post representations to cluster ...
matricies = [
    PostsByFeatures,
    PostsByClusters,
    PostsByFeaturesNormed,
    PostsByClustersNormed,
]
# ... and the label each one receives in saved file names
names = [
    "byFeatures",
    "byClusters",
    "byFeatures-Normed",
    "byClusters-Normed",
]

# (matrix, label) pairs consumed by the loops in the following cells
mat_names = [pair for pair in zip(matricies, names)]

# Collects one summary DataFrame per representation
post_dfs = list()

In [None]:
for mat, name in mat_names:
    # Fit a seeded k-means model on this representation (fit() returns the estimator itself)
    kmeans = KMeans(random_state=42, n_clusters=num_posts_clusters)
    kmeans.fit(mat)

    # Store the fitted model under clusters/, then release it before the next fit
    object_name = "".join([model_name, "-posts-", name, "-", str(num_posts_clusters)])
    save_object(kmeans, 'clusters/', object_name)
    del kmeans

In [None]:
import csv

# Summary statistics exported for every cluster, in CSV column order
header = ['total_posts','score_mean','score_median','score_range','comments_mean','comments_median','comments_range']

for mat, name in mat_names:
    # Reload the k-means model fitted for this representation and summarise its clusters
    kmeans = load_object('clusters/',model_name+"-posts-"+name+"-"+str(num_posts_clusters))
    post_clusters = make_post_clusters(kmeans,mat,scores,num_comments_list)

    # Only the first len(post_clusters) matrix rows are read below, so convert
    # just those to lists instead of the whole matrix.
    lead_rows = np.asarray(mat[:len(post_clusters)]).tolist()

    temp_header = header + ["element " + str(i) for i in range(1, mat.shape[1] + 1)]

    # NOTE(review): the "element N" columns are filled from matrix row `idx`
    # (the idx-th post), not from a cluster-level vector such as
    # kmeans.cluster_centers_[idx] — confirm this is what was intended.
    temp_table = [
        [cluster[key] for key in header] + lead_rows[idx]
        for idx, cluster in enumerate(post_clusters)
    ]
    post_dfs.append(pd.DataFrame.from_records(temp_table, columns=temp_header))

    # The csv module requires newline='' on the file object; without it extra
    # blank lines appear between rows on platforms that translate line endings.
    with open('post-analysis/'+model_name+'-'+str(num_posts_clusters)+'-'+name+'.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(temp_header)
        writer.writerows(temp_table)

### Un normed Posts by Features Clusters

We now will make un-normalized posts by features clusters, and then save them.

In [None]:
# Fit a seeded k-means model on the un-normalised posts-by-features matrix
kmeans = KMeans(random_state=42, n_clusters=num_posts_clusters)
kmeans.fit(PostsByFeatures)

# Store the fitted model under clusters/, then drop the in-memory copy
save_object(
    kmeans,
    'clusters/',
    "".join([model_name, "-posts-byFeatures-cluster_model-", str(num_posts_clusters)]),
)
del kmeans

In [None]:
# Reload the stored posts-by-features k-means model
kmeans = load_object(
    'clusters/',
    "".join([model_name, "-posts-byFeatures-cluster_model-", str(num_posts_clusters)]),
)

In [None]:
# Build the cluster summary records for the posts-by-features model.
# NOTE(review): presumably one record per cluster, keyed by statistic name
# (e.g. 'score_mean') as the next cells read it — confirm in utils.make_post_clusters.
post_clusters = make_post_clusters(kmeans,PostsByFeatures,scores,num_comments_list)

In [None]:
# Show every summary statistic of the first cluster at once. A notebook cell only
# renders its final expression, so listing the lookups on separate lines would
# display nothing but the last value.
first_cluster_keys = [
    'score_mean', 'score_median', 'score_range', 'total_posts',
    'comments_mean', 'comments_median', 'comments_range',
]
{key: post_clusters[0][key] for key in first_cluster_keys}

In [None]:
import csv

# Summary statistics exported for every cluster, in CSV column order
header = ['total_posts','score_mean','score_median','score_range','comments_mean','comments_median','comments_range']

# One row per cluster record, holding the statistics in header order
temp_table = [[cluster[key] for key in header] for cluster in post_clusters]

posts_by_features_df = pd.DataFrame.from_records(temp_table, columns=header)

# NOTE(review): this path is identical to the one the earlier export loop writes
# for the "byFeatures" representation, so this cell overwrites that file with a
# statistics-only table — confirm that is intended.
# The csv module requires newline='' on the file object; without it extra blank
# lines appear between rows on platforms that translate line endings.
with open('post-analysis/'+model_name+'-'+str(num_posts_clusters)+'-byFeatures'+'.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    writer.writerows(temp_table)

### Un normed Posts by Clusters Clusters

We now will make un-normalized Posts by Clusters clusters, and then save them.

In [None]:
# Fit a seeded k-means model on the un-normalised posts-by-clusters matrix
kmeans = KMeans(random_state=42, n_clusters=num_posts_clusters)
kmeans.fit(PostsByClusters)

# Store the fitted model under clusters/, then drop the in-memory copy
save_object(
    kmeans,
    'clusters/',
    "".join([model_name, "-posts-byClusters-cluster_model-", str(num_posts_clusters)]),
)
del kmeans

In [None]:
# Reload the stored posts-by-clusters k-means model
kmeans = load_object(
    'clusters/',
    "".join([model_name, "-posts-byClusters-cluster_model-", str(num_posts_clusters)]),
)

In [None]:
import csv

# Recompute the cluster summaries for the posts-by-clusters k-means model held in
# `kmeans`. Without this step `post_clusters` still contains the records built for
# the posts-by-features model, and this cell would export those under the
# byClusters file name.
post_clusters = make_post_clusters(kmeans, PostsByClusters, scores, num_comments_list)

# Summary statistics exported for every cluster, in CSV column order
header = ['total_posts','score_mean','score_median','score_range','comments_mean','comments_median','comments_range']

# One row per cluster record, holding the statistics in header order
temp_table = [[cluster[key] for key in header] for cluster in post_clusters]

posts_by_cluster_df = pd.DataFrame.from_records(temp_table, columns=header)

# NOTE(review): this path is identical to the one the earlier export loop writes
# for the "byClusters" representation, so this cell overwrites that file with a
# statistics-only table — confirm that is intended.
# The csv module requires newline='' on the file object; without it extra blank
# lines appear between rows on platforms that translate line endings.
with open('post-analysis/'+model_name+'-'+str(num_posts_clusters)+'-byClusters'+'.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    writer.writerows(temp_table)

### Normed Posts by Features Clusters

We now will make normalized Posts by Features clusters, and then save them.

### Normed Posts by Clusters Clusters

We now will make normalized Posts by Clusters clusters, and then save them.