In [7]:
import numpy
import pickle as pk
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy.spatial.distance import cosine
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [8]:
os.chdir('/home/jcai/geometry_of_law/data_and_dictionary')

In [9]:
citation_data = pd.read_stata('/home/jcai/geometry_of_law/data_and_dictionary/caseid_citecounts.dta')

In [10]:
citation_data = citation_data.dropna()

In [11]:
citation_data["logcounts"] = [np.log(1+citation_data["within"][i]+citation_data["outside"][i]) for i in citation_data.index]

# build a data frame of caseid-citecount-docvector

In [12]:
model = Doc2Vec.load('/home/jcai/geometry_of_law/doc2vec_v50k_d200_shuffled_opinion/ALL_opinion.d2v')
set_of_doc_names = set(model.docvecs.doctags)

In [13]:
master_dataframe = pd.read_csv("/home/jcai/geometry_of_law/Encyclopedia Entry/master_dataframe.csv")

In [14]:
master_dataframe["caseid"] = [x.split("_")[0] for x in master_dataframe.copy()['docname']]

In [15]:
master_dataframe.shape

(292765, 16)

In [16]:
master_dataframe = master_dataframe[master_dataframe['docname'].isin(set_of_doc_names)]

In [17]:
citation_data = citation_data[citation_data["caseid"].isin(set(master_dataframe['caseid'].values))]

In [18]:
gb_caseid = master_dataframe.groupby("caseid")

In [19]:
def return_mean_d2v(caseid):
    doc_names = gb_caseid.get_group(caseid)["docname"].values
    return np.mean([model[i] for i in doc_names],axis=0)

In [20]:
CBY_average_dict = pk.load(open("CBY_average_dict.p","rb"))

In [21]:
CY_average_dict = pk.load(open("CY_average_dict.p","rb"))

In [22]:
def return_demean_mean_d2v(caseid):
    doc_names = gb_caseid.get_group(caseid)["docname"].values
    cy = gb_caseid.get_group(caseid)["circuit-year"].values
    v_list = [model[doc_names[i]] - CY_average_dict[cy[i]] for i in range(len(doc_names))]
    return np.mean(v_list,axis=0)

In [23]:
citation_data['d2v'] = [return_mean_d2v(citation_data['caseid'][i]) for i in citation_data.index]

In [24]:
citation_data['d2v_dm'] = [return_demean_mean_d2v(citation_data['caseid'][i]) for i in citation_data.index]

In [25]:
citation_data = citation_data.dropna()

# build features

In [26]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer

In [27]:
d2v_X = np.asarray(list(citation_data["d2v"]))

In [28]:
y = citation_data["logcounts"]

## Train-test split

In [29]:
d2v_X_train, d2v_X_test, y_train, y_test = train_test_split(d2v_X, y, test_size=0.2)

## some linear models

## Linear Regression

In [30]:
from sklearn.metrics import explained_variance_score

In [31]:
from sklearn.linear_model import LinearRegression

In [32]:
LR = LinearRegression(fit_intercept=False, normalize=False)

In [33]:
LR_reg = LR.fit(d2v_X_train, y_train)
print('The training R2 is', LR_reg.score(d2v_X_train, y_train))
print('The test R2 is', LR_reg.score(d2v_X_test, y_test))
print('The explained variance is', explained_variance_score(y_test, LR_reg.predict(d2v_X_test)))

The training R2 is 0.23164420965147914
The test R2 is 0.2298021251381763
The explained variance is 0.22986817989712038


## Elasticnet with CV

In [34]:
from sklearn.linear_model import ElasticNetCV

In [35]:
elastic = ElasticNetCV(random_state=0,l1_ratio=[.01, .5, .7, .95, .99],fit_intercept=True)

In [36]:
elastic_reg = elastic.fit(d2v_X_train, y_train)
print('The training R2 is', elastic_reg.score(d2v_X_train, y_train))
print('The test R2 is', elastic_reg.score(d2v_X_test, y_test))
print('The explained variance is', explained_variance_score(y_test, elastic_reg.predict(d2v_X_test)))

The training R2 is 0.240614041470539
The test R2 is 0.23924789670195312
The explained variance is 0.23924870298989764


In [37]:
print('The L1 panelty is', elastic_reg.l1_ratio_)

The L1 panelty is 0.99


## Lasso

In [38]:
from sklearn.linear_model import Lasso

In [39]:
lasso = Lasso(alpha=1.0,fit_intercept=True)

In [40]:
lasso_reg = lasso.fit(d2v_X_train, y_train)
print('The training R2 is', lasso.score(d2v_X_train, y_train))
print('The test R2 is', lasso.score(d2v_X_test, y_test))
print('The explained variance is', explained_variance_score(y_test, lasso.predict(d2v_X_test)))

The training R2 is -1.509903313490213e-14
The test R2 is -9.076688736175953e-06
The explained variance is 0.0


In [41]:
from math import exp

In [42]:
for alpha in [exp(i) for i in range(-10,1)]:
    print(" ")
    print("The alpha is ", alpha)
    lasso = Lasso(alpha=alpha,fit_intercept=True,max_iter=5000)
    lasso_reg = lasso.fit(d2v_X_train, y_train)
    print('The training R2 is', lasso.score(d2v_X_train, y_train))
    print('The test R2 is', lasso.score(d2v_X_test, y_test))
    print('The explained variance is', explained_variance_score(y_test, lasso.predict(d2v_X_test)))

 
The alpha is  4.5399929762484854e-05
The training R2 is 0.24063442330438345
The test R2 is 0.23925448982612618
The explained variance is 0.23925531397789201
 
The alpha is  0.00012340980408667956
The training R2 is 0.24062513476669037
The test R2 is 0.23925343558808165
The explained variance is 0.2392542491427474
 
The alpha is  0.00033546262790251185
The training R2 is 0.24055721090460214
The test R2 is 0.23920851222500938
The explained variance is 0.239209300214693
 
The alpha is  0.0009118819655545162
The training R2 is 0.24016851556985533
The test R2 is 0.2388980054532599
The explained variance is 0.23889870625530463
 
The alpha is  0.0024787521766663585
The training R2 is 0.23828832277527212
The test R2 is 0.23715009818678245
The explained variance is 0.23715070235497382
 
The alpha is  0.006737946999085467
The training R2 is 0.2297239586542532
The test R2 is 0.22888608581792302
The explained variance is 0.22888674744533877
 
The alpha is  0.01831563888873418
The training R2 is 

## non-linear models

In [None]:
#from sklearn.neighbors import KNeighborsRegressor

In [None]:
#for n in [8,9,10,11,12,13,14]:
#    print("n = ",n)
#    knn = KNeighborsRegressor(n_neighbors=n)
#    knn_reg = knn.fit(d2v_X_train,y_train)
#    print('The training R2 is', knn_reg.score(d2v_X_train, y_train))
#    print('The test R2 is', knn_reg.score(d2v_X_test, y_test))
#    print('The explained variance is', explained_variance_score(y_test, knn_reg.predict(d2v_X_test)))

## trees and forests

In [43]:
from sklearn.model_selection import ParameterGrid,GridSearchCV

## Gradient Boosting

In [46]:
from sklearn.model_selection import cross_val_predict

In [44]:
from sklearn.ensemble import GradientBoostingRegressor

  from numpy.core.umath_tests import inner1d


In [45]:
gdb = GradientBoostingRegressor()
gdb_reg = gdb.fit(d2v_X_train, y_train)
print('The training R2 is', gdb_reg.score(d2v_X_train, y_train))
print('The test R2 is', gdb_reg.score(d2v_X_test, y_test))
print('The explained variance is', explained_variance_score(y_test, gdb_reg.predict(d2v_X_test)))

The training R2 is 0.210513424857488
The test R2 is 0.19708792858609814
The explained variance is 0.19708794409096764


In [None]:
param_vals = {'max_depth': [2, 5, 10], "n_estimators": [50,100,200], "learning_rate" : [0.01, 0.05, 0.1]}
gdb = GradientBoostingRegressor()
gdb_grid_cv = GridSearchCV(gdb, param_vals)

gdb_grid_cv_reg = gdb_grid_cv.fit(d2v_X_train, y_train)
print('The training R2 is', gdb_grid_cv_reg.score(d2v_X_train, y_train))
print('The test R2 is', gdb_grid_cv_reg.score(d2v_X_test, y_test))
print('The explained variance is', explained_variance_score(y_test, gdb_grid_cv_reg.predict(d2v_X_test)))

In [None]:
gdb_grid_cv_reg.best_estimator_

In [None]:
citation_data = citation_data[["caseid","logcounts","predict_logcounts"]]
citation_data.to_csv("citation_data.csv")

## a neural network

In [57]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, AveragePooling1D

In [58]:
model = Sequential()
model.add(Dense(128, input_dim=200,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dropout(rate = 0.05))
model.add(Dense(128,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dropout(rate = 0.05))
model.add(Dense(128,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dropout(rate = 0.05))
model.add(Dense(128,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dropout(rate = 0.05))
model.add(Dense(128,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dropout(rate = 0.05))
model.add(Dense(128,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dropout(rate = 0.05))
model.add(Dense(128,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dropout(rate = 0.05))
model.add(Dense(64,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dropout(rate = 0.05))
model.add(Dense(32,activation='relu'))
model.add(Dense(1,activation='linear'))

In [59]:
# Compile the network :
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_28 (Dense)             (None, 128)               25728     
_________________________________________________________________
dense_29 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_30 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_31 (Dense)             (None, 128)               16512     
_________________________________________________________________
dropout_9 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_32 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_33 (Dense)             (None, 128)               16512     
__________

In [60]:
model.fit(d2v_X_train, y_train, epochs=1000, batch_size=32, validation_split = 0.2,verbose = False)

Instructions for updating:
Use tf.cast instead.


KeyboardInterrupt: 

In [None]:
predictions = model.predict(d2v_X_test)

In [None]:
print('The explained variance is', explained_variance_score(y_test, predictions))

In [61]:
lasso = Lasso(alpha=exp(-10),fit_intercept=True,max_iter=5000)
lasso.fit(d2v_X_train, y_train)
print('The training R2 is', lasso.score(d2v_X_train, y_train))
print('The test R2 is', lasso.score(d2v_X_test, y_test))
print('The explained variance is', explained_variance_score(y_test, lasso.predict(d2v_X_test)))

The training R2 is 0.24063442330438345
The test R2 is 0.23925448982612618
The explained variance is 0.23925531397789201


In [65]:
print('The explained variance is', explained_variance_score(y, lasso.predict(d2v_X)))

The explained variance is 0.24035827761292627


In [66]:
citation_data["predict_logcounts"] = lasso.predict(d2v_X)

In [68]:
citation_data.columns

Index(['caseid', 'within', 'outside', 'logcounts', 'd2v', 'd2v_dm',
       'predict_logcounts'],
      dtype='object')

In [69]:
citation_data = citation_data[['caseid','within', 'outside',"logcounts","predict_logcounts"]]
citation_data.to_csv("citation_data.csv")

In [3]:
import pandas as pd

In [5]:
citation_data = pd.read_csv("/home/jcai/geometry_of_law/data_and_dictionary/citation_data.csv")

In [70]:
citation_data

Unnamed: 0,caseid,within,outside,logcounts,predict_logcounts
0,X101L14003,1.0,0.0,0.693147,0.528095
1,X10284I003,0.0,0.0,0.000000,2.450891
2,X102UPO003,11.0,3.0,2.708050,2.361321
14,X10379GN,0.0,0.0,0.000000,1.359573
16,X1037E8N,1.0,0.0,0.693147,1.457104
23,X1037G8N,0.0,0.0,0.000000,1.538895
25,X1037H8N,0.0,0.0,0.000000,1.739403
29,X1037LGN,0.0,0.0,0.000000,1.377140
34,X1037P0N,0.0,0.0,0.000000,1.150165
35,X1037P8N,0.0,0.0,0.000000,1.235451
