In [23]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate

import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt


In [24]:
from IPython.display import Markdown, display

In [25]:
data_path = "/Users/sakthivel/lexisnexis/ciatations.csv"

In [5]:
#read the input file
data_all= pd.read_csv(data_path)


In [26]:
#this column called "new" is added to the exising data , where it contains only the text before the citations are refered.
data_all['new'] = data_all[['tocase','text']].apply(lambda x: re.search(r'(.*{})'.format(x[0]), x[1]).group(0) if x[0] in x[1] else 'No Text matching', axis=1)


In [27]:
# filter out the dataframe with only needed four citation class
subsetDataFrame = data_all[data_all['class'].isin(['cited', 'referred_to','applied','followed'])]

In [28]:
# convert the values in the column to column header for classification
dummy=pd.get_dummies(subsetDataFrame['class'])

In [29]:
new_dataframe=pd.concat([subsetDataFrame,dummy],axis=1)
new_dataframe.head()

Unnamed: 0,filename,citation id,class,tocase,text,new,applied,cited,followed,referred_to
0,06_1.xml,c0,cited,Universal Music Australia Pty Ltd v Sharman Li...,2 Wilcox J delivered judgment on the complex i...,2 Wilcox J delivered judgment on the complex i...,0,1,0,0
1,06_1.xml,c1,cited,Universal Music Australia Pty Ltd v Sharman Li...,2 Wilcox J delivered judgment on the complex i...,2 Wilcox J delivered judgment on the complex i...,0,1,0,0
2,06_1.xml,c2,cited,Universal Music Australia Pty Ltd v Sharman Li...,2 Wilcox J delivered judgment on the complex i...,2 Wilcox J delivered judgment on the complex i...,0,1,0,0
3,06_1.xml,c3,cited,Sharman License Holdings Ltd v Universal Music...,2 Wilcox J delivered judgment on the complex i...,2 Wilcox J delivered judgment on the complex i...,0,1,0,0
4,06_1.xml,c4,cited,Sharman License Holdings Ltd v Universal Music...,2 Wilcox J delivered judgment on the complex i...,2 Wilcox J delivered judgment on the complex i...,0,1,0,0


In [30]:
# class labels are filtered only for the following columns
class_labels = new_dataframe[["applied", "cited", "followed", "referred_to"]]
class_labels.head()
class_labels.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1046 entries, 0 to 1198
Data columns (total 4 columns):
applied        1046 non-null uint8
cited          1046 non-null uint8
followed       1046 non-null uint8
referred_to    1046 non-null uint8
dtypes: uint8(4)
memory usage: 12.3 KB


In [31]:
# simple preprocessing steps
def preprocess_text(sen):
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sen)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [32]:
# create X input variable and Y output variable to run the model
X = []
sentences = list(new_dataframe["text"])
for sen in sentences:
    X.append(preprocess_text(sen))

y = class_labels.values


In [34]:
print(y)

[[0 1 0 0]
 [0 1 0 0]
 [0 1 0 0]
 ...
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]]


In [35]:
# split the train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [36]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

maxlen = 200

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [37]:
##used the pretrianed glove embedding in the model
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()

glove_file = open('/Users/sakthivel/lexisnexis/glove.6B/glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [38]:
#LSTM model with different layer configuration
deep_inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(deep_inputs)
LSTM_Layer_1 = LSTM(128)(embedding_layer)
dense_layer_1 = Dense(4, activation='sigmoid')(LSTM_Layer_1)
model = Model(inputs=deep_inputs, outputs=dense_layer_1)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [54]:
#model summary
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 200, 100)          1020700   
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 516       
Total params: 1,138,464
Trainable params: 117,764
Non-trainable params: 1,020,700
_________________________________________________________________
None


In [39]:
#fit the mode
history = model.fit(X_train, y_train, batch_size=128, epochs=5, verbose=1, validation_split=0.2)

Instructions for updating:
Use tf.cast instead.
Train on 668 samples, validate on 168 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [41]:
# Model evaluation
score = model.evaluate(X_test, y_test, verbose=1)

print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.49499111345836094
Test Accuracy: 0.7595238095238095
