In [1]:
import numpy
import pickle as pk
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy.spatial.distance import cosine
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
# Make the project data directory the working directory: the relative paths
# used later in this notebook (the two pickled average dictionaries and the
# output CSV) are resolved against it.
os.chdir('/home/jcai/geometry_of_law/data_and_dictionary')

In [3]:
# Load the citation-count table from Stata format. Later cells read its
# "caseid", "within" and "outside" columns.
citation_data = pd.read_stata('/home/jcai/geometry_of_law/data_and_dictionary/caseid_citecounts.dta')

In [4]:
# Discard rows with a missing value in any column before computing the target.
citation_data = citation_data.dropna()

In [5]:
# Regression target: log(1 + within + outside) for each row, computed as one
# vectorized column expression instead of a Python loop with a label lookup
# per row.
citation_data["logcounts"] = np.log(1 + citation_data["within"] + citation_data["outside"])

# build a data frame of caseid-citecount-docvector

In [6]:
# Load the trained Doc2Vec model and collect the set of document tags it
# holds vectors for; the set is used below to filter the master table.
# NOTE(review): the name `model` is rebound to a Keras network in the
# neural-network section, so the helper functions that read `model` only work
# while this binding is still in place.
# NOTE(review): `docvecs.doctags` looks like the pre-4.0 gensim API — confirm
# the pinned gensim version before upgrading.
model = Doc2Vec.load('/home/jcai/geometry_of_law/doc2vec_v50k_d200_shuffled_opinion/ALL_opinion.d2v')
set_of_doc_names = set(model.docvecs.doctags)

In [7]:
# Load the master table of documents. Later cells use its "docname" and
# "circuit-year" columns.
master_dataframe = pd.read_csv("/home/jcai/geometry_of_law/Encyclopedia Entry/master_dataframe.csv")

In [8]:
# The case id is the part of the document name before the first underscore.
# Use the vectorized string accessor on the column rather than copying the
# whole frame just to iterate over one column.
master_dataframe["caseid"] = master_dataframe["docname"].str.split("_").str[0]

In [9]:
# (rows, columns) of the master table before it is restricted to documents
# that have a Doc2Vec vector.
master_dataframe.shape

(292765, 16)

In [10]:
# Keep only the documents whose name is one of the Doc2Vec model's tags.
master_dataframe = master_dataframe[master_dataframe['docname'].isin(set_of_doc_names)]

In [11]:
# Keep only the citation rows whose case id still appears in the filtered
# master table, so every remaining case has at least one document vector.
citation_data = citation_data[citation_data["caseid"].isin(set(master_dataframe['caseid'].values))]

In [12]:
# Group documents by case id; the two helper functions below fetch a case's
# documents with `gb_caseid.get_group(caseid)`.
gb_caseid = master_dataframe.groupby("caseid")

In [14]:
def return_mean_d2v(caseid):
    """Return the element-wise mean of the vectors of all documents in a case.

    Reads the notebook-level `gb_caseid` (documents grouped by case id) and
    `model` (indexable by document name).

    Parameters
    ----------
    caseid : key of a group in `gb_caseid`.

    Returns
    -------
    The mean over the case's documents of `model[docname]`, taken along axis 0.
    """
    case_rows = gb_caseid.get_group(caseid)
    vectors = []
    for doc_name in case_rows["docname"].values:
        vectors.append(model[doc_name])
    return np.mean(vectors, axis=0)

In [19]:
# Load the pickled CBY averages (path is relative to the working directory).
# The context manager closes the file handle, which the bare
# `pk.load(open(...))` form never did.
# NOTE(review): pickle can execute arbitrary code — only load trusted files.
with open("CBY_average_dict.p", "rb") as cby_file:
    CBY_average_dict = pk.load(cby_file)

In [49]:
# Load the pickled averages that `return_demean_mean_d2v` looks up by
# "circuit-year" value (path is relative to the working directory). The
# context manager closes the file handle, which the bare
# `pk.load(open(...))` form never did.
# NOTE(review): pickle can execute arbitrary code — only load trusted files.
with open("CY_average_dict.p", "rb") as cy_file:
    CY_average_dict = pk.load(cy_file)

In [68]:
def return_demean_mean_d2v(caseid):
    """Return the mean of a case's document vectors after de-meaning each one.

    For every document in the case, the entry of `CY_average_dict` keyed by
    that document's "circuit-year" value is subtracted from `model[docname]`;
    the differences are then averaged along axis 0.

    Reads the notebook-level `gb_caseid`, `model` and `CY_average_dict`.

    Parameters
    ----------
    caseid : key of a group in `gb_caseid`.

    Returns
    -------
    The element-wise mean of the de-meaned document vectors.
    """
    # Fetch the group once; the previous version looked it up twice.
    case_rows = gb_caseid.get_group(caseid)
    doc_names = case_rows["docname"].values
    circuit_years = case_rows["circuit-year"].values
    demeaned = [
        model[doc_name] - CY_average_dict[circuit_year]
        for doc_name, circuit_year in zip(doc_names, circuit_years)
    ]
    return np.mean(demeaned, axis=0)

In [23]:
# One mean document vector per case. Iterate over the case ids directly
# rather than doing a chained label lookup (`['caseid'][i]`) per row, which is
# slower and returns a Series instead of a scalar when index labels repeat.
citation_data['d2v'] = [return_mean_d2v(caseid) for caseid in citation_data['caseid']]

In [69]:
# One de-meaned mean document vector per case. Iterate over the case ids
# directly rather than doing a chained label lookup (`['caseid'][i]`) per row,
# which is slower and returns a Series instead of a scalar when index labels
# repeat.
citation_data['d2v_dm'] = [return_demean_mean_d2v(caseid) for caseid in citation_data['caseid']]

In [70]:
# Drop any rows that contain missing values now that the vector columns have
# been added.
citation_data = citation_data.dropna()

# build features

In [71]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer

In [79]:
# Feature matrix: the per-case mean vectors from the "d2v" column, stacked
# into one array with a row per case.
d2v_X = np.array(citation_data["d2v"].tolist())

In [80]:
# Regression target: the log citation counts, in the same row order as d2v_X.
y = citation_data["logcounts"]

## Train-test split

In [81]:
# Hold out 20% of the cases for testing. The split is seeded so that a
# Restart & Run All reproduces the same partition (and therefore the same
# scores); without `random_state` every run shuffles differently.
d2v_X_train, d2v_X_test, y_train, y_test = train_test_split(
    d2v_X, y, test_size=0.2, random_state=0
)

## some linear models

## Linear Regression

In [82]:
from sklearn.metrics import explained_variance_score

In [83]:
from sklearn.linear_model import LinearRegression

In [84]:
# Ordinary least squares without an intercept term.
# `normalize=False` was dropped: the parameter has been removed from recent
# scikit-learn releases (passing it raises TypeError), and it was documented
# as ignored when fit_intercept=False anyway.
LR = LinearRegression(fit_intercept=False)

In [85]:
# Fit the linear model on the training split, then report R^2 on both splits
# and the explained variance of the test-set predictions.
LR_reg = LR.fit(d2v_X_train, y_train)
print('The training R2 is', LR_reg.score(d2v_X_train, y_train))
print('The test R2 is', LR_reg.score(d2v_X_test, y_test))
print('The explained variance is', explained_variance_score(y_test, LR_reg.predict(d2v_X_test)))

The training R2 is 0.23088813479493653
The test R2 is 0.23256790152927245
The explained variance is 0.23275066713627213


## Elasticnet with CV

In [86]:
from sklearn.linear_model import ElasticNetCV

In [87]:
# Elastic net with cross-validated hyper-parameters: the L1/L2 mixing ratio is
# chosen from the five candidates listed here; an intercept is fitted.
elastic = ElasticNetCV(random_state=0,l1_ratio=[.01, .5, .7, .95, .99],fit_intercept=True)

In [88]:
# Fit the cross-validated elastic net on the training split, then report R^2
# on both splits and the explained variance of the test-set predictions.
elastic_reg = elastic.fit(d2v_X_train, y_train)
print('The training R2 is', elastic_reg.score(d2v_X_train, y_train))
print('The test R2 is', elastic_reg.score(d2v_X_test, y_test))
print('The explained variance is', explained_variance_score(y_test, elastic_reg.predict(d2v_X_test)))

The training R2 is 0.23988931915602107
The test R2 is 0.24189872324905223
The explained variance is 0.24192301405309102


In [89]:
# `l1_ratio_` is the L1/L2 mixing ratio selected by cross-validation, not a
# penalty strength, so the label says "ratio" (it previously read "panelty").
print('The L1 ratio is', elastic_reg.l1_ratio_)

The L1 panelty is 0.99


## Lasso

In [90]:
from sklearn.linear_model import Lasso

In [91]:
# Lasso with a fixed penalty strength alpha=1.0 and a fitted intercept.
lasso = Lasso(alpha=1.0,fit_intercept=True)

In [92]:
# Fit the alpha=1.0 Lasso and report the same three metrics as for the other
# models. The scores are read from `lasso`, the estimator that was just fitted.
# The recorded R^2 values are ~0 at this alpha, which motivates the sweep over
# smaller alphas below.
lasso_reg = lasso.fit(d2v_X_train, y_train)
print('The training R2 is', lasso.score(d2v_X_train, y_train))
print('The test R2 is', lasso.score(d2v_X_test, y_test))
print('The explained variance is', explained_variance_score(y_test, lasso.predict(d2v_X_test)))

The training R2 is 1.1102230246251565e-16
The test R2 is -4.8963091789477886e-05
The explained variance is 0.0


In [93]:
from math import exp

In [94]:
# Sweep the Lasso penalty over alpha = e^-10, e^-9, ..., e^0, refitting on the
# training split each time and printing the same three metrics per alpha.
for alpha in map(exp, range(-10, 1)):
    print(" ")
    print("The alpha is ", alpha)
    # `lasso` / `lasso_reg` are rebound on every pass, so after the loop they
    # refer to the model fitted with the last (largest) alpha.
    lasso = Lasso(alpha=alpha, fit_intercept=True, max_iter=10000)
    lasso_reg = lasso.fit(d2v_X_train, y_train)
    print('The training R2 is', lasso.score(d2v_X_train, y_train))
    print('The test R2 is', lasso.score(d2v_X_test, y_test))
    print('The explained variance is', explained_variance_score(y_test, lasso.predict(d2v_X_test)))

 
The alpha is  4.5399929762484854e-05
The training R2 is 0.23990952601238993
The test R2 is 0.2419145343591822
The explained variance is 0.24193853381805597
 
The alpha is  0.00012340980408667956
The training R2 is 0.2399003954185679
The test R2 is 0.24190797036523237
The explained variance is 0.24193214227924864
 
The alpha is  0.00033546262790251185
The training R2 is 0.23983693313127397
The test R2 is 0.2418575475909156
The explained variance is 0.2418821796796442
 
The alpha is  0.0009118819655545162
The training R2 is 0.23943589765333873
The test R2 is 0.24150919545103322
The explained variance is 0.24153503346061211
 
The alpha is  0.0024787521766663585
The training R2 is 0.23747237493389928
The test R2 is 0.23950334370615978
The explained variance is 0.23953098142576068
 
The alpha is  0.006737946999085467
The training R2 is 0.2290857398338986
The test R2 is 0.23059167127714705
The explained variance is 0.23062165701690862
 
The alpha is  0.01831563888873418
The training R2 is 

## non-linear models

In [95]:
#from sklearn.neighbors import KNeighborsRegressor

In [96]:
#for n in [8,9,10,11,12,13,14]:
#    print("n = ",n)
#    knn = KNeighborsRegressor(n_neighbors=n)
#    knn_reg = knn.fit(d2v_X_train,y_train)
#    print('The training R2 is', knn_reg.score(d2v_X_train, y_train))
#    print('The test R2 is', knn_reg.score(d2v_X_test, y_test))
#    print('The explained variance is', explained_variance_score(y_test, knn_reg.predict(d2v_X_test)))

## trees and forests

In [97]:
from sklearn.model_selection import ParameterGrid,GridSearchCV

## Gradient Boosting

In [98]:
from sklearn.ensemble import GradientBoostingRegressor

  from numpy.core.umath_tests import inner1d


In [99]:
# Gradient boosting with scikit-learn's default hyper-parameters: fit on the
# training split, then report R^2 on both splits and the explained variance of
# the test-set predictions. `gdb_reg` is reused near the end of the notebook
# to predict for every case.
gdb = GradientBoostingRegressor()
gdb_reg = gdb.fit(d2v_X_train, y_train)
print('The training R2 is', gdb_reg.score(d2v_X_train, y_train))
print('The test R2 is', gdb_reg.score(d2v_X_test, y_test))
print('The explained variance is', explained_variance_score(y_test, gdb_reg.predict(d2v_X_test)))

The training R2 is 0.20992187024656095
The test R2 is 0.19845500041478037
The explained variance is 0.19849513356811954


In [None]:
# Grid search over tree depth, number of trees and learning rate
# (3 x 3 x 3 = 27 combinations) using GridSearchCV's default cross-validation.
# This cell has no execution count or recorded output in the saved notebook.
param_vals = {'max_depth': [2, 5, 10], "n_estimators": [50,100,200], "learning_rate" : [0.01, 0.05, 0.1]}
# Rebinds `gdb` to a fresh, unfitted estimator; `gdb_reg` from the previous
# cell is not reassigned here.
gdb = GradientBoostingRegressor()
gdb_grid_cv = GridSearchCV(gdb, param_vals)

gdb_grid_cv_reg = gdb_grid_cv.fit(d2v_X_train, y_train)
print('The training R2 is', gdb_grid_cv_reg.score(d2v_X_train, y_train))
print('The test R2 is', gdb_grid_cv_reg.score(d2v_X_test, y_test))
print('The explained variance is', explained_variance_score(y_test, gdb_grid_cv_reg.predict(d2v_X_test)))

In [None]:
# Show the estimator (with its hyper-parameters) that the grid search selected.
gdb_grid_cv_reg.best_estimator_

## a neural network

In [101]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, AveragePooling1D

Using TensorFlow backend.


In [102]:
# Deep fully connected regressor on the 200-dimensional document vectors.
# The layer sequence is identical to the previous hand-written list of 35
# `add()` calls, expressed with loops:
#   Dense(128, input) -> 7 x [3 x Dense(128), Dropout(0.05)]
#   -> 3 x Dense(64) -> Dropout(0.05) -> Dense(32) -> Dense(1, linear)
# NOTE(review): this rebinds `model`, which earlier held the Doc2Vec model that
# return_mean_d2v / return_demean_mean_d2v read; those helpers must not be
# called after this cell has run.
model = Sequential()
model.add(Dense(128, input_dim=200, activation='relu'))
for _ in range(7):
    for _ in range(3):
        model.add(Dense(128, activation='relu'))
    model.add(Dropout(rate=0.05))
for _ in range(3):
    model.add(Dense(64, activation='relu'))
model.add(Dropout(rate=0.05))
model.add(Dense(32, activation='relu'))
# Single linear output unit: the predicted log citation count.
model.add(Dense(1, activation='linear'))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [103]:
# Compile the network: minimise mean squared error with the Adam optimizer,
# also tracking MSE as a metric, then print the layer-by-layer summary.
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               25728     
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_4 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_6 (Dense)              (None, 128)               16512     
__________

In [None]:
# Train for 1000 epochs in mini-batches of 32, holding out 20% of the training
# split for validation; progress output is suppressed (verbose = False).
# This cell has no execution count or recorded output in the saved notebook.
model.fit(d2v_X_train, y_train, epochs=1000, batch_size=32, validation_split = 0.2,verbose = False)

In [None]:
# Predict log citation counts for the held-out test split with the network.
predictions = model.predict(d2v_X_test)

In [None]:
# Explained variance of the network's test-set predictions, for comparison
# with the figures printed for the other models.
print('The explained variance is', explained_variance_score(y_test, predictions))

In [106]:
# Predict for every case with `gdb_reg`, the default-parameter gradient
# boosting model fitted above. `d2v_X` is the full feature matrix, so these
# predictions cover the training rows as well as the test rows.
citation_data["predict_logcounts"] = gdb_reg.predict(d2v_X)

In [107]:
# Keep only the observed and predicted log counts and write them to a CSV in
# the working directory. Rebinding `citation_data` here drops every other
# column (including "caseid" and the vector columns) from the in-memory frame;
# the row index is what ends up as the first CSV column.
citation_data = citation_data[["logcounts","predict_logcounts"]]
citation_data.to_csv("citation_data.csv")

In [108]:
# Display the exported frame of observed vs. predicted log citation counts.
citation_data

Unnamed: 0,logcounts,predict_logcounts
0,0.693147,1.563644
1,0.000000,2.578287
2,2.708050,1.955208
14,0.000000,1.652018
16,0.693147,1.709437
23,0.000000,1.696154
25,0.000000,1.745818
29,0.000000,1.442154
34,0.000000,1.798953
35,0.000000,1.767056
