# Suicide Watch analysis
This notebook will walk you through building the models we
built after collecting our data from the Suicide Watch Subreddit

We first import the libraries and utility files we are going to be using,
and parse and clean our data.

In [2]:
%matplotlib inline
from clusterUtils import make_post_clusters

# Import machine learning libraries
import gensim
import textmining
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy.linalg as LA
import scipy.sparse as sparse
from sklearn.manifold import MDS
from sklearn.cluster import KMeans
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# Import utility files
import dataUtils
import clusterUtils

In [3]:
# Get the data from the csv
df = dataUtils.read_df('data')

In [None]:
# Import only one year of data (the 2016 csv files in the data directory).
import glob

# glob returns paths in arbitrary, filesystem-dependent order. Sort them so
# the row order of `df` -- and of everything aligned to it by position, such as
# the saved `posts` list -- is the same on every run.
fnames = sorted(glob.glob('data' + "/2016*.csv"))
# Read each file with its first row as the header, then stack them.
df_list = [pd.read_csv(fname, header=0) for fname in fnames]
frame = pd.concat(df_list)
df = frame

In [4]:
# Clean the text in the dataframe
# Missing values become empty strings so title and selftext can be concatenated.
df = df.replace(np.nan, '', regex=True)
# Raw text of a post is its title and body joined by a single space.
df["rawtext"] = df["title"] + " " + df["selftext"]
# Strip links first, then pass the result through dataUtils.cleanSentence.
df["cleantext"] = df["rawtext"].apply(
    lambda text: dataUtils.cleanSentence(dataUtils.remove_links(text))
)

In [None]:
# Get a stream of text: one list of whitespace-separated tokens per post
posts = [text.split() for text in df["cleantext"]]

In [None]:
# Train a phraseDetector
two_word_phrases = gensim.models.Phrases(posts)

In [None]:
two_word_phraser = gensim.models.phrases.Phraser(two_word_phrases)

In [None]:
# phrase_length =3
#posts = list(two_word_phraser[posts])
three_word_phrases = gensim.models.Phrases(two_word_phraser[posts])
three_word_phraser = gensim.models.phrases.Phraser(three_word_phrases)
posts              = list(three_word_phraser[two_word_phraser[posts]])

In [None]:
# Update the clean text: run each post's tokens through the two-word and then
# the three-word phraser, and join the resulting tokens back with spaces.
df["cleantext"] = df["cleantext"].apply(
    lambda text: " ".join(three_word_phraser[two_word_phraser[text.split()]])
)

#### Data summary statistics

Before building models, we first look at the data that we are using.

In [None]:
# Get the number of posts
num_posts = len(posts)
num_posts

In [None]:
# Get the number of users (minus [deleted]).
# userDict maps each author name to the number of posts by that author. The
# "[deleted]" placeholder is skipped entirely so it is not counted as a user.
userList = df["author"].tolist()
userDict = {}
for user in userList:
    if user == "[deleted]":
        continue
    userDict[user] = userDict.get(user, 0) + 1
# Number of distinct, non-deleted authors
len(userDict)

In [None]:
len(posts)

#### Build word2vec model
At this step we will build the word2vec model that we will use in the rest of the analysis.
Because this is a computationally expensive process, we save the results of running our model
as the value of model_name +".model" in the models directory. We can then load this model later, and do not need
to rebuild it every time we want to analyze it.

In [5]:
model_name = "model6"

In [None]:
dataUtils.save_object(posts,'objects/',model_name+"-posts")

In [6]:
posts = dataUtils.load_object('objects/',model_name+"-posts")

In [None]:
# Build the model
model = gensim.models.Word2Vec(posts,min_count =10,
                               sg=1, size =300,window=5,hs=1,negative=20)
model.save('models/'+model_name+'.model')
del model

In [7]:
# load the model
model = gensim.models.Word2Vec.load('models/'+model_name+'.model')
# Test the model: you should see cat somewhere in this list, near the top
model.most_similar(positive=["kitten"])

[('cat', 0.4867851734161377),
 ('kitty', 0.4587540030479431),
 ('dog', 0.43468546867370605),
 ('baby', 0.42002180218696594),
 ('pet', 0.41659820079803467),
 ('chihuahua', 0.4150254726409912),
 ('puppy', 0.4139459729194641),
 ('stuffed_animal', 0.40899857878685),
 ('german_shepherd', 0.3876553177833557),
 ('bunny', 0.38353854417800903)]

#### Test Model

At this step we run some basic tests to ensure that the model has picked up on some of the semantic meanings of words.

In [None]:
model.most_similar(positive=["kitten"])

In [None]:
model.most_similar(positive=["heartbreak"])

In [None]:
model.most_similar(positive=["pills"])

In [None]:
model.most_similar(positive=["knife"])

In [None]:
model.most_similar(positive=["heartbreak"])

In [None]:
model.most_similar(positive=["family","obligation"],negative =["love"])

In [None]:
model.most_similar(positive=["drugs","hurt"],negative =["help"])

In [None]:
model.most_similar(positive=["drugs","help"],negative =["hurt"])

#### Word usage summary

At this step, after our model has looked at all the words, 
and filtered some out, we will look at the words used by our model.

In [8]:
# Initialize the list of words used by the model, in sorted order
vocab_list = sorted(model.wv.vocab)

In [None]:
unique_words = len(vocab_list)
unique_words

In [None]:
# Total number of occurrences of all vocabulary words, as counted by the model
total_freq = sum(model.wv.vocab[word].count for word in vocab_list)
total_freq

In [9]:
def _passthrough_token(token):
    """Apply the underscore substitution used for vocabulary matching.

    The pattern and the replacement are the same string, so the token comes
    back unchanged.
    """
    return re.sub("_","_",token)


# Vocabulary for the vectorizer: every model word, passed through the same
# token function as the analyzer so both sides match.
temp_list = [_passthrough_token(word) for word in vocab_list]
# Posts are already tokenised, so the analyzer just maps over the token list.
countvec = CountVectorizer(
    vocabulary=temp_list,
    analyzer=lambda tokens: [_passthrough_token(token) for token in tokens],
    min_df=0,
)

In [10]:
tfidf    = TfidfTransformer()

In [11]:
PostsByWords = countvec.fit_transform(posts)

In [12]:
# Inspect a bug with creating PostsByWords: print every vocabulary word whose
# column total in PostsByWords differs from the count stored by the model.
# Each printed line shows the word, the difference, the matrix total and the
# model count.
temp = PostsByWords.sum(axis=0).tolist()[0]
ctr = 0
for i, observed in enumerate(temp):
    expected = model.wv.vocab[vocab_list[i]].count
    if observed < expected:
        print("<:  "+vocab_list[i], observed-expected, observed, expected)
    elif observed > expected:
        print(">:  "+vocab_list[i], observed-expected, observed, expected)

In [13]:
# Calculate the magnitude of the error
sum(temp)-sum(list(map(lambda i: model.wv.vocab[vocab_list[i]].count, range(len(vocab_list)))))

0

In [14]:
# compare PostsByWords values to correct values
PostsByWords.sum(axis=0).tolist()[0]==list(map(lambda i: model.wv.vocab[vocab_list[i]].count, range(len(vocab_list))))

True

In [15]:
tfidf.fit(PostsByWords)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [16]:
PostsByWords_tfidf = tfidf.fit_transform(PostsByWords)

In [17]:
tfidf_list = list(PostsByWords_tfidf.sum(axis=0).tolist()[0])

#### Run Clustering
At this step we run and analyze the KMeans clustering algorithm 
implemented by sklearn on the word vectors we got from word2vec.

The first step for this process is to extract the word vectors,
and the words they correspond to, from the model. We then test
different values of K to observe the effect of the number of centers on the fit of the model.
After this we select a value of K to use to get the clusterings. 
We then save this result in the directory "clusters" with the name model_name + num_centers+".pkl", to save future computational time.

We then use the kmeans model to generate a list of dictionaries, where each dictionary corresponds to a cluster, and contains following fields:
    'unique_words': The number of different unique words in the cluster
    'total_freq'  : The total number of times one of the words in the cluster appeared in the corpus
    'word_list'   : A list of words in the cluster, paired with how often they appeared in the cluster

Finally we print a representation of this list to a csv, so that the clusters can be manually inspected.
This representation includes the number of unique words in the cluster, the total frequency of words in the cluster, and the size_words_list most frequent words in the cluster.

In [18]:
# Extract the word vectors, one plain Python list per vocabulary word,
# in the same order as vocab_list
vecs = [model.wv[word].tolist() for word in vocab_list]

In [19]:
# change array format into numpy array
WordByFeatureMat = np.array(vecs)

In [None]:
# Get the fit for different values of K: 12, then 25 to 400 in steps of 25.
test_points = [12] + list(range(25, 401, 25))
# For each K, fit KMeans on the word vectors and record its inertia.
fit = [
    KMeans(n_clusters=k, random_state=42).fit(WordByFeatureMat).inertia_
    for k in test_points
]

In [None]:
# Save the fit values for this model
dataUtils.save_object(fit,'objects/',model_name+"-fit")
dataUtils.save_object(test_points,'objects/',model_name+"-testpoints")
del fit
del test_points

In [None]:
# Load the fit and test point values
fit         = dataUtils.load_object('objects/',model_name+"-fit")
test_points = dataUtils.load_object('objects/',model_name+"-testpoints")

In [None]:
fit1         = dataUtils.load_object('objects/',"model1-fit")
test_points1 = dataUtils.load_object('objects/',"model1-testpoints")
fit2         = dataUtils.load_object('objects/',"model2-fit")
test_points2 = dataUtils.load_object('objects/',"model2-testpoints")
fit3         = dataUtils.load_object('objects/',"model3-fit")
test_points3 = dataUtils.load_object('objects/',"model3-testpoints")

In [None]:
# graph the fit for different values of K
plt.plot(test_points1,fit1,'ro')
plt.plot(test_points2,fit2,'bo')
plt.plot(test_points3,fit3,'yo')
plt.show()

In [20]:
# set the number of clusters
num_clusters = 100

In [None]:
#initialize kmeans model
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(WordByFeatureMat)
# Save the clusters directory
dataUtils.save_object(kmeans,'clusters/',model_name+"-"+str(num_clusters))
del kmeans

In [99]:
# load kmeans
kmeans = dataUtils.load_object('clusters/',model_name+"-"+str(num_clusters))

In [100]:
clusters = clusterUtils.make_clustering_objects_tfidf(model,kmeans,vocab_list,tfidf_list,WordByFeatureMat)

In [101]:
dataUtils.save_object(clusters,'clusters/',model_name+"-clusters_dict-"+str(num_clusters))

In [None]:
# Determine the total words in the clusters, and the total number of unique
# words in the clusters, by summing the per-cluster fields.
clusters_total_words  = sum(cluster['total_freq'] for cluster in clusters)
clusters_unique_words = sum(cluster['unique_words'] for cluster in clusters)

In [None]:
# Check that the total number of words in clusters matches the total
clusters_total_words   

In [None]:
# Check that the number of unique words in clusters matches the total number of unique words
clusters_unique_words

##### Print clusters

Print clusters so we can analyze them

In [None]:
# Sort all the words in the words list
for cluster in clusters:
    cluster["word_list"].sort(key=lambda x:x[1],reverse = True)

In [None]:
# Build one csv row per cluster: a 1-based label, the cluster's total
# frequency, its number of unique words, and then up to size_words_list entries
# from its word list.
size_words_list = 100
table = []
for i, cluster in enumerate(clusters):
    row = ["cluster " + str(i+1), cluster["total_freq"], cluster["unique_words"]]
    # Slicing stops at the end of a shorter list by itself, so no try/except
    # is needed and genuine errors (e.g. a missing key) are no longer hidden.
    row.extend(cluster["word_list"][:size_words_list])
    table.append(row)

In [None]:
import csv
# Write the cluster table to a csv named after the model and cluster count.
# newline='' is what the csv module expects for files it writes; without it,
# platforms that translate line endings produce an extra blank line per row.
with open('tfidf-clusters-'+model_name+"-"+str(num_clusters)+'.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(table)

#### Display Clusters Using MDS

Produce a visualization of our clusters in a low dimensional space

In [None]:
# Fit the model to the clusters
mds = MDS().fit(kmeans.cluster_centers_)

In [None]:
# First element of the first word_list entry of every cluster
top_words = [cluster["word_list"][0][0] for cluster in clusters]

In [None]:
# Get the embeddings and split them into their first and second coordinates
embedding = mds.embedding_.tolist()
x = [point[0] for point in embedding]
y = [point[1] for point in embedding]

In [None]:
len(top_words)

In [None]:
# Plot the embedded cluster centers, labelling each point with its top word
plt.figure(figsize=(20,10))
plt.plot(x,y,'bo')
for i, label in enumerate(top_words):
    plt.annotate(label,(x[i],y[i]))
plt.show()

In [None]:
def helper(indicies,points):
    """Return the elements of `points` picked out by `indicies`.

    Each value i in `indicies` selects points[i-1], i.e. the indices are
    treated as 1-based positions. The result keeps the order of `indicies`.
    """
    return [points[i-1] for i in indicies]

# Hand-assigned category for each cluster. The numbers are 1-based cluster
# numbers: helper() looks each one up at position number-1 in x and y.
bullying = [59,16,47]
crime    = [31,73]
depressive_feelings = [1,3,15,21,29,45,81,4,30]
depressive_symptoms = [9,13,28]
drug_abuse = [22,41,75]
illness  = [35,87]
failure = [68,89,90,14,19,26,52]
prior_suicide = [27,56,79]
psychological = [78,10,44,66,85]
self_harm  = [5,17]
self_image = [69,8,96]
death_around = [76,93]
suicidal_ideation = [36,38,57,58,97,6]

all_categories = [bullying,crime,depressive_feelings,depressive_symptoms,
                  drug_abuse,illness,failure,prior_suicide,psychological,
                  self_harm, self_image,death_around,suicidal_ideation]
# Every cluster number that was given a category above
identified = [number for category in all_categories for number in category]
# Everything from 1 to 100 that was not categorised goes into "other",
# which is the last category.
other = [number for number in range(1,101) if number not in identified]
all_categories.append(other)

# Every category is drawn in black unless it is overridden below.
colors = ["black"] * len(all_categories)

"""
colors = ["#ff66ff","#6666ff","#000099",
          "#33cccc","#00cc66","#336600",
          "#ccff33","#cc6600","#ff0000",
          "#cc0066","#ffccff","#ccffff","#00ff00","#00ffff"]
"""
#colors[0]="grey"  # Bullying
colors[2]="red"   # Depressive Feelings
#colors[4]="green" # Drug Abuse
#colors[6]="blue"  # Poor performance
colors[3]="magenta" # Depressive symptoms
colors[8]="cyan" # Psychological


# Plot the embedded cluster centers, one scatter call per category so each
# category gets its own colour.
plt.figure(figsize=(10,5))
for category, color in zip(all_categories, colors):
    plt.scatter(helper(category,x),helper(category,y),color=color,s=100)
plt.show()

#### Prepare for regression :TODO

At this step, we will initialize the matrices we need to run a linear regression algorithm.
We will need to create a document term matrix, and a words by cluster matrix.
We will first use sklearn's CountVectorizer function to create the document term matrix. 
We will create the words by cluster matrix by giving each word a one-hot vector, with a
one in the cluster number, and a 0 everywhere else.

In [23]:
# Vectorizer over the model vocabulary. Inputs are already lists of tokens, so
# the analyzer only maps over them; the substitution replaces "_" with "_" and
# therefore leaves every token as it is.
countvec = CountVectorizer(
    vocabulary=vocab_list,
    analyzer=lambda tokens: [re.sub("_","_",token) for token in tokens],
    min_df=0,
)

In [24]:
# Make Posts By Words Matrix
PostsByWords = countvec.fit_transform(posts)

In [25]:
# For every cluster, the list of its words (first element of each word_list entry)
clusterWords = [[entry[0] for entry in cluster["word_list"]] for cluster in clusters]

In [26]:
# Make Clusters By Words Matrix
ClustersByWords = countvec.fit_transform(clusterWords)

In [27]:
ClustersByWords

<100x28663 sparse matrix of type '<class 'numpy.int64'>'
	with 28663 stored elements in Compressed Sparse Row format>

In [28]:
# Check that the number of elements in ClustersByWords is equal to the total
# number of words: add up the unique word count of every cluster.
ctr = sum(cluster["unique_words"] for cluster in clusters)
ctr

28663

In [29]:
# take the transpose of Clusters
WordsByCluster = ClustersByWords.transpose(copy=True)

In [30]:
# Multiply Posts by Words by Words By cluster to get Posts By cluster
PostsByCluster = PostsByWords.dot(WordsByCluster)

In [None]:
from datetime import datetime, timezone
# Convert the creation timestamps to datetimes explicitly in UTC. Without a tz
# argument, datetime.fromtimestamp converts to the local timezone of whatever
# machine runs the notebook, so the year/month/day/hour features derived from
# this column would differ from machine to machine.
# NOTE(review): assumes created_utc holds seconds since the epoch -- confirm.
df["date"]=df["created_utc"].apply(lambda ts: datetime.fromtimestamp(ts, tz=timezone.utc))

In [None]:
df["year"]= df["date"].apply(lambda x: x.year-2000)

In [None]:
df["month"]= df["date"].apply(lambda x: x.month)

In [None]:
df["day"]= df["date"].apply(lambda x: x.day)

In [None]:
df["hour"]= df["date"].apply(lambda x: x.hour)

In [None]:
years = df["year"].values

In [None]:
months = df["month"].values

In [None]:
temp = np.zeros((len(months), 8))

In [None]:
temp[np.arange(len(months)), list(map(lambda x: x//4, months))] = 1

In [None]:
months = temp

In [None]:
# Year feature as a single-column 2-D array (one row per post) so it can be
# concatenated column-wise with the one-hot month and hour blocks. A plain
# ndarray is used rather than np.matrix, which NumPy no longer recommends, and
# the reshape avoids building a Python list of one-element lists.
years = df["year"].values.reshape(-1, 1)

In [None]:
# One-hot encode the month into 4 columns: (month-1)//3 maps months 1-3 to
# column 0, 4-6 to column 1, 7-9 to column 2 and 10-12 to column 3.
quarter = (df["month"].values - 1)//3
temp = np.zeros((len(quarter), 4))
temp[np.arange(len(quarter)), quarter] = 1
months = temp

In [None]:
# One-hot encode the hour into 6 columns of four hours each: hour//4 maps
# hours 0-3 to column 0, 4-7 to column 1, and so on up to 20-23 in column 5.
four_hour_block = df["hour"].values//4
temp = np.zeros((len(four_hour_block), 6))
temp[np.arange(len(four_hour_block)), four_hour_block] = 1
hours = temp

In [None]:
years

In [None]:
time = np.concatenate((years,months,hours),axis=1)

In [None]:
np.array(time)


In [None]:
# Column totals of the time feature matrix: temp[i] is the sum of column i
# over all rows.
temp_time = np.array(time).tolist()
temp = [sum(row[i] for row in temp_time) for i in range(len(temp_time[0]))]

In [None]:
temp

In [None]:
PostsByClusterList =PostsByCluster.toarray().tolist()

In [None]:
len(PostsByClusterList)

In [None]:
# Remove high comments posts: collect the positions of the posts that have at
# most 10 comments.
lst = list(df.num_comments)
indicies = [position for position, n_comments in enumerate(lst) if n_comments <= 10]

In [None]:
# Keep only the rows whose position is listed in `indicies`.
# `indicies` is a list, so testing membership against it for every row would
# scan the list each time (quadratic in the number of posts); a set makes each
# lookup constant time and selects exactly the same rows.
keep_positions = set(indicies)
PostsByClusterList = [row for ind, row in enumerate(PostsByClusterList) if ind in keep_positions]

In [None]:
PostsByCluster_clean = sparse.csr_matrix(PostsByClusterList)

In [None]:
PostsByCluster_clean

In [None]:
PostsByCluster.shape

In [None]:
sum(PostsByCluster.sum(axis=0).tolist()[0])==sum(PostsByWords.sum(axis=0).tolist()[0])

#### Run regression

At this stage we run a regression on the normalized PostsByCluster matrix

In [91]:
# initialize regression fields
regression_fields = ["ups","downs","score","num_comments"]

In [None]:
# Initialize regression data: one list per regression field, holding the
# natural log of that column. Values that are not greater than 0 are replaced
# by 0.1 before taking the log.
regression_data = [
    list(np.log(df[field].apply(lambda x: x if x>0 else 0.1 )))
    for field in regression_fields
]

In [None]:
# initialize model
import statsmodels.api as sm
regression_models =[None]*len(regression_fields)

In [None]:
# Build the design matrix: the per-post cluster counts followed by the time
# features, with a bias (intercept) column of ones prepended.
#X = PostsByCluster_clean.toarray()
#X = np.array(time)
X = np.array(np.concatenate((PostsByCluster.toarray(),time),axis=1))
# has_constant='add' prepends the column of ones unconditionally. With the
# default ('skip'), add_constant returns X unchanged when any column is already
# constant (for example the year column when only one year of data is loaded).
# That would leave no intercept at column 0 and shift every coefficient that is
# later read at position j+1.
X = sm.add_constant(X, has_constant='add')

In [None]:
# Create a regression: fit one ordinary least squares model per regression
# field against the shared design matrix X.
# The OLS object gets its own name so it does not overwrite `model`, which
# holds the word2vec model that other cells still use.
for i in range(len(regression_fields)):
    ols_model = sm.OLS(regression_data[i], X)
    regression_models[i] = ols_model.fit()

In [None]:
# Initialize regression coefficients: the fitted parameters of each model as a
# plain list, in the same order as regression_fields.
regression_coefs = [
    regression_models[i].params.tolist()
    for i in range(len(regression_fields))
]

In [None]:
# Store, on every cluster dictionary, its coefficient for each regression
# field. Coefficient j+1 belongs to cluster j; position 0 is skipped
# (presumably the constant term added to X -- verify against the design matrix).
for i, field in enumerate(regression_fields):
    for j, cluster in enumerate(clusters):
        cluster[field] = regression_coefs[i][j+1]

In [None]:
# For each regression field, the (coefficient, cluster index) pairs of all
# clusters whose coefficient for that field is not exactly zero.
regression_coef_locs = [
    [
        (clusters[j][field], j)
        for j in range(len(clusters))
        if clusters[j][field] != 0.0
    ]
    for field in regression_fields
]

In [None]:
# Sort all the words in the words list
for cluster in clusters:
    cluster["word_list"].sort(key=lambda x:x[1],reverse = True)

In [None]:
# Build one table per regression field. Rows are ordered from the largest
# coefficient to the smallest; each row holds a "<field> <1-based cluster
# number>" label, the coefficient, and up to size_words_list entries of that
# cluster's word list.
size_words_list = 100
regression_tables = [None]*len(regression_fields)
for i in range(len(regression_coef_locs)):
    lst = sorted(regression_coef_locs[i],reverse=True)
    regression_tables[i] = []
    for beta, k in lst:
        row = [regression_fields[i]+" " + str(k+1), beta]
        # Slicing stops at the end of a shorter list by itself, so no
        # try/except is needed and genuine errors are no longer hidden.
        row.extend(clusters[k]["word_list"][:size_words_list])
        regression_tables[i].append(row)

In [None]:
import csv
# Write one csv per regression field, named after the field and the model.
# newline='' is what the csv module expects for files it writes; without it,
# platforms that translate line endings produce an extra blank line per row.
for i in range(len(regression_fields)):
    with open('regression-'+regression_fields[i]+'-'+model_name+'.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(regression_tables[i])

In [None]:
# Log a summary of each regression to a text file in the regression directory.
# The file name combines the model name, a label describing the regression
# settings, and the regression field.
#regression_settings= "log-2016+under10-clusters-regularized"
regression_settings= "log-clusters+time"
for i in range(len(regression_fields)):
    summary_path = "regression/"+model_name+"-"+regression_settings+"-"+regression_fields[i]+".txt"
    # The with-block closes the file even if building the summary raises.
    with open(summary_path,"w") as f:
        f.write(str(regression_models[i].summary()))

In [85]:
values_array= np.array(list(zip(df['ups'].tolist(),df['downs'].tolist(),df['score'].tolist(),df['num_comments'].tolist())))

131652

In [90]:
temp_table= np.concatenate((PostsByCluster.toarray(),values_array),axis=1).tolist()

In [93]:
# Column names for the exported table: one "cluster N" name (1-based) per
# column of PostsByCluster, followed by the regression field names. The column
# count is read from .shape so the sparse matrix does not have to be converted
# to a dense array just to measure its width.
header = ["cluster "+ str(n) for n in range(1, 1+PostsByCluster.shape[1])]+regression_fields

In [98]:
import csv
# Write the per-post table (cluster counts plus regression fields) with its
# header row. newline='' is what the csv module expects for files it writes;
# without it, platforms that translate line endings produce an extra blank
# line per row.
with open('regression_posts.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    writer.writerows(temp_table)

### Cluster Posts

In this section, we will cluster posts, similar to how we clustered word vectors above.

In [None]:
# Get the fit for different values of K when clustering posts.
# This section clusters posts, and the results are saved under the
# "posts_by_cluster" name, so KMeans is fitted on PostsByCluster -- the same
# matrix the final post clustering uses -- rather than on the word vectors.
test_points = [12]+ list(range(25,401,25))
fit = []
for point in test_points:
    tempMeans = KMeans(n_clusters=point, random_state=42).fit(PostsByCluster)
    # inertia_ is the value recorded as the "fit" for this K
    fit.append(tempMeans.inertia_)

In [None]:
# Save the fit values for this model
dataUtils.save_object(fit,'objects/',model_name+"-posts_by_cluster"+"-fit")
dataUtils.save_object(test_points,'objects/',model_name+"-posts_by_cluster"+"-testpoints")
del fit
del test_points

In [None]:
# Load the fit and test point values
fit         = dataUtils.load_object('objects/',model_name+"-posts_by_cluster"+"-fit")
test_points = dataUtils.load_object('objects/',model_name+"-posts_by_cluster"+"-testpoints")

In [None]:
plt.plot(test_points,fit,'yo')
plt.axis([0, 400, 0, 260000])
plt.show()

In [33]:
num_posts_clusters =100

In [32]:
#initialize kmeans model
kmeans = KMeans(n_clusters=num_posts_clusters, random_state=42).fit(PostsByCluster)
# Save the clusters directory
dataUtils.save_object(kmeans,'clusters/',model_name+"-"+"posts_by_cluster"+"-"+str(num_posts_clusters))
del kmeans

KeyboardInterrupt: 

In [102]:
# load kmeans
kmeans = dataUtils.load_object('clusters/',model_name+"-"+"posts_by_cluster"+"-"+str(num_posts_clusters))

In [103]:
import importlib
importlib.reload(clusterUtils)

scores            = df['score'].tolist()
num_comments_list = df['num_comments'].tolist()
clusters = clusterUtils.make_post_clusters(kmeans,PostsByCluster,scores,num_comments_list)

In [52]:
for cluster in clusters:
    cluster['center'].sort(key=lambda x:x[0],reverse = True )

In [76]:
for cluster in clusters:
    print(cluster['score_median'])

50

### temp

In [None]:
# Record the distribution of scores: for every entry of regression_data, a
# dictionary mapping each value that occurs to the number of times it occurs.
# One dictionary is created per entry of regression_data instead of a fixed
# four, so the cell keeps working if the list of regression fields changes.
dictArr = [{} for _ in regression_data]
for dictionary, data in zip(dictArr, regression_data):
    for val in data:
        dictionary[val] = dictionary.get(val, 0) + 1

In [None]:
# For each regression field: total[i] is the number of recorded values, and
# new[i] is how many of them have a value of at most 10. The values compared
# against 10 are the keys of dictArr[i].
total = [
    sum(dictArr[i][k] for k in dictArr[i].keys())
    for i in range(len(regression_fields))
]
new = [
    sum(dictArr[i][k] for k in dictArr[i].keys() if k<=10)
    for i in range(len(regression_fields))
]

In [None]:
# Check correlations between clusters and scores: first copy every column of
# PostsByCluster into the dataframe as a flat "cluster <i>" column.
for i in range(len(clusters)):
    column_name = "cluster "+str(i)
    df[column_name] = PostsByCluster[:,i].toarray().ravel()

In [None]:
for i in range(len(clusters)):
    print(np.log(df['score']+1).corr(df["cluster "+str(i)]))