In [None]:
##Court of Appeals for the Seventh Circuit Analysis
##Raw data is available from: https://www.courtlistener.com
##Analysis was done by looking at over 40k documents from the Seventh Circuit

In [None]:
import pandas as pd
import gensim
import json
import nltk
import re
import csv
import string
import os
import codecs
from sklearn import feature_extraction

In [None]:
#Code inspired/borrowed from:
    #https://www.kaggle.com/c/word2vec-nlp-tutorial
    #http://brandonrose.org/clustering
    #http://opensource.datacratic.com/mtlpy50/

In [None]:
# Load NLTK's English stop-word list once at module level; cleanraj() below
# reads this global when it is called with remove_stopwords=True.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to already be
# downloaded (nltk.download('stopwords')) — confirm on a fresh environment.
stopwords = nltk.corpus.stopwords.words('english')

def cleanraj( review, remove_stopwords=False ):
    """Turn a piece of raw text into a list of lower-case words.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace.

    review           -- the text to clean
    remove_stopwords -- when true, words found in the module-level
                        `stopwords` collection are dropped (off by default)

    Returns a list of words (possibly empty).
    """
    # Anything outside a-z / A-Z becomes a separator
    letters_only = re.sub("[^a-zA-Z]", " ", review)
    tokens = letters_only.lower().split()

    # Nothing more to do unless the caller asked for stop-word filtering
    if not remove_stopwords:
        return tokens

    return [token for token in tokens if token not in stopwords]

In [None]:
### Script to consolidate all the plain text parts
%cd /Users/rajivshah/Documents/Documents/Projects/SeventhCircuit/data/ca7
case =[]
data=[]
text=[]
content=[]
sentences=[]
test=[]
import glob
files = glob.glob( '*.json' )

with open( 'result.txt', 'a' ) as result:
    for file_ in files:
        with open( file_, 'r' ) as infile:
            text = infile.read().decode('utf-8','replace')
            data = json.loads(text)
            case = data['plain_text']
            tokens = nltk.word_tokenize(case)
            for i in tokens:
                temp = cleanraj(i,remove_stopwords=True)
                if len(temp) != 0:
                    test.append(temp)
            #result.write("%s|" % test) ##If you want to save the results of your work
            content.append(test)
            test = []
result.close()

In [None]:
# Number of opinion files that were parsed (one `content` entry is appended per file)
len(content)

In [None]:
##If you read it from a file, optional
#content=[]
#with open('result.txt', 'r') as f:
#    content = f.readline().split('|')
#f.closed

In [None]:
##Flatten the per-document lists in `content` into one flat list of sentences
sentences = [sentence for document in content for sentence in document]

In [None]:
# Total number of word lists that will be handed to Word2Vec
len(sentences)
# 7063681  (presumably the value seen on the original run — re-check after re-running)

##WORD2VEC

In [None]:
# Train a Word2Vec model on the flattened `sentences`; the %time magic reports
# how long the training statement takes.
# NOTE(review): min_count=400 presumably drops every word seen fewer than 400
# times in the corpus — confirm against the installed gensim documentation.
# NOTE(review): no seed is passed, so vectors may differ between runs — confirm.
%time model = gensim.models.Word2Vec(sentences, min_count=400)
print(model)
# Not the last expression of the cell and not printed, so this value is never shown.
len(model.syn0)
# Last expression of the cell: the notebook displays the words most similar to "court".
model.most_similar("court")

In [None]:
# Precompute the normalized vectors used for similarity queries.
# NOTE(review): gensim documents init_sims(replace=True) as overwriting the raw
# vectors with their L2-normalized form to save memory, after which the model
# can no longer be trained — confirm for the installed version. Later cells read
# model.syn0, so they presumably operate on the normalized vectors.
model.init_sims(replace=True)
# Base name for the saved model; only used by the commented-out save calls below.
model_name="first-3526"
#model.save(model_name)
#model.save_word2vec_format(model_name + '-b.txt', binary=False)

In [None]:
# Map every vocabulary word to its learned vector. Later cells rely on
# word2vec_dict.keys() and word2vec_dict.values() being in matching order.
word2vec_dict = {}
for word in model.vocab.keys():
    try:
        word2vec_dict[word] = model[word]
    except KeyError:
        # Best effort: skip a word the model cannot resolve, but let any other
        # kind of error surface instead of being silently swallowed.
        continue

##TSNE

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Stack the word vectors into a float matrix, one row per word, in the
# iteration order of word2vec_dict. list() makes this work whether values()
# returns a list (Python 2) or a view (Python 3).
X = np.array(list(word2vec_dict.values()), dtype = np.float64)
# Project to 2-D for plotting. random_state pins the otherwise random
# initialisation so the layout is reproducible between runs.
# NOTE(review): scikit-learn documents that n_iter should be at least 250 and
# newer releases reject smaller values — confirm against the installed version.
Xmodel = TSNE(n_components=2, verbose=2, n_iter=200, random_state=0)
t = Xmodel.fit_transform(X)

In [None]:
##KMeans Analysis

In [None]:
##KMeans
from sklearn.cluster import KMeans
import time

start = time.time() # Start time

# Cluster the learned word vectors; "k" (num_clusters) is the number of groups
word_vectors = model.syn0
num_clusters = 20

# Initialize a k-means object and use it to assign every vector to a centroid.
# A fixed random_state keeps the cluster ids (and therefore the colours derived
# from them later) stable from one run to the next.
kmeans_clustering = KMeans( n_clusters = num_clusters, random_state = 0 )
idx = kmeans_clustering.fit_predict( word_vectors )

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
# One pre-formatted string, so the call prints the same text under Python 2 and 3
print("Time taken for K Means clustering:  %s seconds." % elapsed)

# Pair each vocabulary word with the cluster id predicted for its vector
# (assumes model.index2word is in the same order as the rows of model.syn0 — TODO confirm)
word_centroid_map=dict(zip(model.index2word,idx))

# Group the words by cluster in a single pass instead of rescanning the whole
# mapping once per cluster.
words_by_cluster = {}
for word, cluster_id in word_centroid_map.items():
    words_by_cluster.setdefault(int(cluster_id), []).append(word)

for cluster in range(num_clusters):
    print ("\nCluster %d" % cluster)
    print (words_by_cluster.get(cluster, []))

In [None]:
###Create plot in matplotlib - BORING

#from pylab import rcParams
#rcParams['figure.figsize']=15,15

#Creates plot
#N = len(word2vec_dict)
#labels=[word2vec_dict.keys()[i] for i in N]
#for i in range(0,N):
#    str = (word2vec_dict.keys()[i])
#    y = word_centroid_map[str]
#    labelnum.append(y)

#plt.scatter(t[:, 0], t[:, 1])
#index=0
#for label, x, y in zip(labels, t[:, 0], t[:, 1]):
#    plt.annotate(
#        label, 
#        xy = (x, y), xytext = (-20, 20),
#        textcoords = 'offset points', ha = 'right', va = 'bottom',
#        bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
#        arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))

#plt.show()

In [None]:
import bokeh
# NOTE(review): wildcard import; the later cells use figure, ColumnDataSource,
# output_file, save and output_notebook unqualified, presumably all supplied by
# this line — consider importing those names explicitly.
from bokeh.plotting import *
from bokeh.models import HoverTool 
# Ask bokeh to render plots inline in the notebook
output_notebook()

In [None]:
##Sets up labels and colors
# Words in the iteration order of word2vec_dict, i.e. the same order as the
# word2vec_dict.values() rows that were fed to t-SNE.
labels = list(word2vec_dict.keys())
N = len(labels)
# Cluster id of every label, looked up by word. Avoids binding a global named
# `str`, which would shadow the builtin for the rest of the session.
labelnum = [word_centroid_map[word] for word in labels]

In [None]:
##Label Colors
# One named CSS colour per cluster id (0-19). Every entry is distinct so that
# no two clusters are drawn alike, and every entry is a full colour name
# (single-letter shorthands such as 'k' are a matplotlib convention only).
LABEL_COLOR_MAP = {0 : 'steelblue',
                   1 : 'aqua',
                   2 : 'black',
                   3 : 'brown',
                   4 : 'coral',
                   5 : 'darkgray',
                   6 : 'gold',
                   7 : 'indianred',
                   8 : 'lemonchiffon',
                   9 : 'maroon',
                   10 : 'olivedrab',
                   11 : 'pink',
                   12 : 'plum',
                   13 : 'orchid',
                   14 : 'navy',
                   15 : 'silver',
                   16 : 'skyblue',
                   17 : 'tan',
                   18 : 'teal',
                   19 : 'yellowgreen'
                   }

# Colour name for every word, in the same order as `labelnum`; raises KeyError
# for any cluster id that has no entry in LABEL_COLOR_MAP.
label_color = [LABEL_COLOR_MAP[l] for l in labelnum]

In [None]:
##Creates tsne visualization
# Write the plot to tsne.html (a relative path, so it lands in the current working directory)
output_file("tsne.html")
# Square figure with both axes hidden; the tool list includes "hover" so a
# HoverTool exists for the tooltip assignment below.
p = figure(plot_width=700, plot_height=700, title="Court of Appeals for the Seventh Circuit",
       tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
       x_axis_type=None, y_axis_type=None, min_border=1)
# The words are exposed as column "names" so the tooltip can reference them as "@names"
source = ColumnDataSource({"names":labels})
# Draw one circle per word at its t-SNE coordinates, coloured by cluster, then
# set the hover tooltip on the result of the same chained expression.
# NOTE(review): the chained .select(...).tooltips assignment depends on what
# scatter() returns in the installed bokeh version — confirm it still reaches the HoverTool.
# NOTE(review): the tooltip label "/r/" looks like a leftover from a borrowed example — confirm the intended label.
p.scatter(t[:,0], t[:,1], marker="circle",source=source,
          color=label_color,
           # line_color="#6666ee", fill_color="#ee6666", 
          fill_alpha=0.5, size=12).select(dict(type=HoverTool)).tooltips = {"/r/":"@names"}
#p.text(t[:,0], t[:,1], labels, text_font_size="9pt", text_align="center", text_baseline="middle")
# Save the figure to the file configured by output_file above
save(p)
#show(p)