In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import re

from multiprocessing import cpu_count, Pool #for multiprocessing data
cores = cpu_count()

from gensim.models.doc2vec import Doc2Vec, TaggedDocument #for doc2vec modelling
from IPython.display import clear_output

from tqdm import tqdm

import pickle

from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

### Import

In [2]:
df = pd.read_pickle('content_2.0.pkl')

In [3]:
df['year'] = df.decision_date.apply(lambda x : int(re.findall('^\d{4}', x)[0]))

### Cleaning

In [4]:
def clean(df):
    df['content_clean'] = df['content'].apply(lambda x: re.sub(re.compile('\.'), ' ', x))
    df['content_clean'] = df['content_clean'].apply(lambda x: re.sub(re.compile('<.*?>'), ' ', x))
    df['content_clean'] = df['content_clean'].apply(lambda x: re.sub(re.compile('\r?\n|\r'), ' ', x))
    df['content_clean'] = df['content_clean'].apply(lambda x: re.sub(re.compile('^.*?\:'), ' ', x))
    df['content_clean'] = df['content_clean'].apply(lambda x: re.sub(re.compile('(^|\s).(\s|$):'), ' ', x))
    df['content_clean'] = df['content_clean'].apply(lambda x: re.sub(re.compile('\s*[A-Z]+\s'), ' ', x))
    df['content_clean'] = df['content_clean'].apply(lambda x: re.sub(re.compile('GR\s|No\s|January|February|March|April|May|June|July|August|September|October|November|December'), ' ', x))
    df['content_clean'] = df['content_clean'].apply(lambda x: re.sub(re.compile('[^a-zA-Z -]'), ' ', x))
    df['content_clean'] = df['content_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords.words("english"))]))
    df['content_clean'] = df['content_clean'].apply(lambda x: x.lower())
    df['content_clean'] = df['content_clean'].apply(lambda x: x.split())
    return df

def parallel_df_process(df, func, n_cores=cores):
    pool = Pool(n_cores)
    df_split = np.array_split(df, n_cores)
    df_joined = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df_joined

In [5]:
df_cleaned = parallel_df_process(df, clean)

In [6]:
print(df_cleaned.content_clean.iloc[0][:15])

['it', 'established', 'fact', 'brooks', 'obtained', 'absolute', 'discharge', 'soldier', 'it', 'likewise', 'fact', 'explicitly', 'stated', 'counsel', 'government']


### Stemming

In [7]:
porter = PorterStemmer()

def stem_df(df):
    df['stemmed'] = df['content_clean'].apply(lambda x : [porter.stem(word) for word in x])
    return df

In [8]:
df_cleaned = parallel_df_process(df_cleaned, stem_df)

In [9]:
print(df_cleaned.stemmed.iloc[0][:15])

['it', 'establish', 'fact', 'brook', 'obtain', 'absolut', 'discharg', 'soldier', 'it', 'likewis', 'fact', 'explicitli', 'state', 'counsel', 'govern']


### Year Selection

In [10]:
year = 1900
main_df = df_cleaned[(df.year >= year) & (df.label != "Part I") & (df.label != "Part II") & (df.label != "Part III") & (df.label != "Part IV")]

In [11]:
main_df.head()

Unnamed: 0,id,content,case_title,gr_no,decision_date,label,year,content_clean,stemmed
8,143051.0,"<center>\n<h2></h2>\n<h2>G.R. No. 507, Novembe...",IN THE MATTER OF THE PETITION OF A. O. BROOKS ...,G.R. No. 507,1901-11-05,Remedial Law,1901,"[it, established, fact, brooks, obtained, abso...","[it, establish, fact, brook, obtain, absolut, ..."
19,143062.0,"<center>\n<h2></h2>\n<h2>G.R. No. 448, Septemb...","THE UNITED STATES, COMPLAINANT AND APPELLEE, V...",G.R. No. 448,1901-09-20,Criminal Law,1901,"[the, offense, charged, complaint, punishable,...","[the, offens, charg, complaint, punish, penal,..."
126,143169.0,"<center>\n<h2></h2>\n<h2>G.R. No. 448, Septemb...","THE UNITED STATES, COMPLAINANT AND APPELLEE, V...",G.R. No. 448,1902-09-20,Criminal Law,1902,"[the, offense, charged, complaint, punishable,...","[the, offens, charg, complaint, punish, penal,..."
209,143252.0,"<center>\n<h2></h2>\n<h2>G.R. No. 911, March 1...","MAXIMO CORTES, PLAINTIFF AND APPELLANT, VS. J...",G.R. No. 911,1903-03-12,Civil Law,1903,"[this, suit, brought, obtain, injunction, acco...","[thi, suit, brought, obtain, injunct, accord, ..."
226,143269.0,"<center>\n<h2></h2>\n<h2>G.R. No. 1011, May 13...","JOSE MACHUCA, PLAINTIFF AND APPELLEE, VS. CHUI...",G.R. No. 1011,1903-05-13,Civil Law,1903,"[most, allegations, complaint, admitted, defen...","[most, alleg, complaint, admit, defend, hear, ..."


In [12]:
print('Length:', main_df.__len__())

Length: 3582


### Pickle main_df

In [13]:
main_df.to_pickle("main_df.pkl")

### Doc2Vec

In [14]:
print('Tagging docs...')
docs = [x for x in main_df['stemmed'].values]
# docs = [x for x in main_df['content_clean'].values]
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)]
print('Docs tagged.')
# tqdm.pandas(desc="progress-bar")

print('Training start.')
# epoch_logger = EpochLogger()
epochs = 50
vec_size = 50
alpha = 0.025

print('Initializing Model...')
model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=2,
                workers = cores,
                dm = 1, epochs = epochs)
#                 callbacks=[epoch_logger])
                

print('Model Initalized. Training...')
model.build_vocab(documents = documents, progress_per=1)
print('Training complete.')

print('Saving model...')
model.save("d2vmodel/d2vlegaldocs.model")
print("Model Saved")

Tagging docs...
Docs tagged.
Training start.
Initializing Model...
Model Initalized. Training...
Training complete.
Saving model...
Model Saved


In [15]:
vectors = [model.docvecs[i] for i in range(len(main_df))]
main_df['vector_d2v'] = vectors

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [17]:
main_df.head()

Unnamed: 0,id,content,case_title,gr_no,decision_date,label,year,content_clean,stemmed,vector_d2v
8,143051.0,"<center>\n<h2></h2>\n<h2>G.R. No. 507, Novembe...",IN THE MATTER OF THE PETITION OF A. O. BROOKS ...,G.R. No. 507,1901-11-05,Remedial Law,1901,"[it, established, fact, brooks, obtained, abso...","[it, establish, fact, brook, obtain, absolut, ...","[0.0041221147, 0.00030034626, -0.009949138, 0...."
19,143062.0,"<center>\n<h2></h2>\n<h2>G.R. No. 448, Septemb...","THE UNITED STATES, COMPLAINANT AND APPELLEE, V...",G.R. No. 448,1901-09-20,Criminal Law,1901,"[the, offense, charged, complaint, punishable,...","[the, offens, charg, complaint, punish, penal,...","[-0.0028990796, 0.00927604, 0.003277776, 0.007..."
126,143169.0,"<center>\n<h2></h2>\n<h2>G.R. No. 448, Septemb...","THE UNITED STATES, COMPLAINANT AND APPELLEE, V...",G.R. No. 448,1902-09-20,Criminal Law,1902,"[the, offense, charged, complaint, punishable,...","[the, offens, charg, complaint, punish, penal,...","[0.0034304094, 0.0037076662, 0.0009457428, -0...."
209,143252.0,"<center>\n<h2></h2>\n<h2>G.R. No. 911, March 1...","MAXIMO CORTES, PLAINTIFF AND APPELLANT, VS. J...",G.R. No. 911,1903-03-12,Civil Law,1903,"[this, suit, brought, obtain, injunction, acco...","[thi, suit, brought, obtain, injunct, accord, ...","[-0.008524174, 0.00065538724, -0.0066927285, -..."
226,143269.0,"<center>\n<h2></h2>\n<h2>G.R. No. 1011, May 13...","JOSE MACHUCA, PLAINTIFF AND APPELLEE, VS. CHUI...",G.R. No. 1011,1903-05-13,Civil Law,1903,"[most, allegations, complaint, admitted, defen...","[most, alleg, complaint, admit, defend, hear, ...","[-0.008267059, -0.008490125, 0.0056070876, -0...."


In [18]:
main_df.to_pickle('content_d2v_v2.0.pkl')

# Training

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

In [40]:
train_df = pd.read_pickle('content_d2v_v2.0.pkl')

In [46]:
def get_X(col):
    li = []
    for i in col:
        li.append(i.reshape(1, -1))
    return np.concatenate(li, axis = 0)

In [47]:
X_sub = train_df['vector_d2v']
X = get_X(X_sub)
y = train_df['label']

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=49)

### 1.) SVM

In [49]:
clf = svm.SVC()
clf.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [26]:
y_pred_svm = clf.predict(X_test)

In [27]:
print(classification_report(y_pred_svm, y_test))

                       precision    recall  f1-score   support

            Civil Law       0.97      0.22      0.36       700
       Commercial Law       0.00      0.00      0.00         0
         Criminal Law       0.00      0.00      0.00         1
                Labor       0.00      0.00      0.00         6
         Legal Method       0.00      0.00      0.00         0
Medical Jurisprudence       0.00      0.00      0.00         0
        Political Law       0.01      0.17      0.02         6
         Remedial Law       0.01      0.25      0.02         4
             Taxation       0.00      0.00      0.00         0
       Transportation       0.00      0.00      0.00         0

             accuracy                           0.22       717
            macro avg       0.10      0.06      0.04       717
         weighted avg       0.95      0.22      0.35       717



  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_svm))

Accuracy: 0.2203626220362622


### 2.) Random Forest

In [29]:
rf = RandomForestClassifier(max_depth=50, random_state=0)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=50, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [30]:
y_pred_rf = rf.predict(X_test)

In [31]:
print(classification_report(y_pred_rf, y_test))

                       precision    recall  f1-score   support

            Civil Law       0.82      0.22      0.34       611
       Commercial Law       0.01      0.14      0.02         7
         Criminal Law       0.00      0.00      0.00         9
                Labor       0.05      0.14      0.07        29
         Legal Method       0.00      0.00      0.00         0
Medical Jurisprudence       0.00      0.00      0.00         0
        Political Law       0.03      0.09      0.05        34
         Remedial Law       0.01      0.04      0.01        27
             Taxation       0.00      0.00      0.00         0
       Transportation       0.00      0.00      0.00         0

             accuracy                           0.20       717
            macro avg       0.09      0.06      0.05       717
         weighted avg       0.70      0.20      0.30       717



  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_rf))

Accuracy: 0.19665271966527198


### 3.) Decision Tree

In [33]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [34]:
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [35]:
y_pred_dt = clf.predict(X_test)