### Importing libraries

In [1]:
import os
from os import listdir
from os.path import isfile, join
from itertools import islice

# json for saving results to text files
import json

# spaCy for tokenizing documents
import spacy
from collections import Counter
import re
# Load the English pipeline by its full package name. The "en" shortcut link
# only exists in spaCy 2.x and was removed in spaCy 3, whereas the package
# name works in both.
nlp = spacy.load("en_core_web_sm")

In [2]:
def load_book(path, encoding=None):
    """Read a text file and return its entire contents as a single string.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the file to read.
    encoding : str, optional
        Text encoding used to decode the file. ``None`` (the default) lets
        ``open`` pick the platform's default encoding, which is what this
        function has always done; pass e.g. ``"utf-8"`` to make the result
        independent of the machine the notebook runs on.

    Returns
    -------
    str
        The complete file contents.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file's bytes are not valid in the chosen encoding.
    """
    # The path is passed to open() directly: a single-argument
    # os.path.join() returns its argument unchanged, so it added nothing.
    with open(path, encoding=encoding) as f:
        return f.read()

In [3]:
# Collect all of the chapter file names (regular files only, no directories).
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the chapter order, and everything derived from it, reproducible.
path = './book/'
chapter_files = sorted(f for f in listdir(path) if isfile(join(path, f)))

In [4]:
chapter_files

['chapter - 1.txt',
 'chapter - 11.txt',
 'chapter - 12.txt',
 'chapter - 14.txt',
 'chapter - 2.txt',
 'chapter - 3.txt',
 'chapter - 4.txt',
 'chapter - 6.txt',
 'chapter - 7.txt']

In [5]:
# Load the text of every chapter, in the same order as chapter_files.
# os.path.join is used instead of string concatenation so the file path is
# correct whether or not `path` ends with a separator.
chapters = [load_book(join(path, chapter)) for chapter in chapter_files]

In [6]:
# Report the whitespace-separated word count of each chapter and accumulate
# the corpus-wide total.
total_word = 0
for chapter_name, chapter_text in zip(chapter_files, chapters):
    word_count = len(chapter_text.split())
    print("The number of {} words in chapter of named {}.".format(word_count, chapter_name))
    total_word += word_count

The number of 13815 words in chapter of named chapter - 1.txt.
The number of 11523 words in chapter of named chapter - 11.txt.
The number of 13084 words in chapter of named chapter - 12.txt.
The number of 18405 words in chapter of named chapter - 14.txt.
The number of 6936 words in chapter of named chapter - 2.txt.
The number of 13542 words in chapter of named chapter - 3.txt.
The number of 11964 words in chapter of named chapter - 4.txt.
The number of 11951 words in chapter of named chapter - 6.txt.
The number of 11207 words in chapter of named chapter - 7.txt.


In [7]:
# Summarise the word count accumulated over all chapters.
corpus_total_message = "The total of {} words in the corpus.".format(total_word)
print(corpus_total_message)

The total of 112427 words in the corpus.


In [8]:
# Getting part 1 (chapters 1 to 4).
# Paths are built with os.path.join so they resolve on every operating
# system; a literal backslash separator only works on Windows.
with open(os.path.join('book', 'chapter - 1.txt'), 'r') as reader:
    chapter_1 = reader.read()

with open(os.path.join('book', 'chapter - 2.txt'), 'r') as reader:
    chapter_2 = reader.read()

with open(os.path.join('book', 'chapter - 3.txt'), 'r') as reader:
    chapter_3 = reader.read()

with open(os.path.join('book', 'chapter - 4.txt'), 'r') as reader:
    chapter_4 = reader.read()


In [9]:
# Getting part 2 (chapters 6 and 7).
# Paths are built with os.path.join so they resolve on every operating
# system; a literal backslash separator only works on Windows.
with open(os.path.join('book', 'chapter - 6.txt'), 'r') as reader:
    chapter_6 = reader.read()

with open(os.path.join('book', 'chapter - 7.txt'), 'r') as reader:
    chapter_7 = reader.read()
    

In [10]:
# Getting part 3 (chapters 11 and 12).
# Paths are built with os.path.join so they resolve on every operating
# system; a literal backslash separator only works on Windows.
with open(os.path.join('book', 'chapter - 11.txt'), 'r') as reader:
    chapter_11 = reader.read()

with open(os.path.join('book', 'chapter - 12.txt'), 'r') as reader:
    chapter_12 = reader.read()

In [11]:
# Getting part 4 (chapter 14).
# The path is built with os.path.join so it resolves on every operating
# system; a literal backslash separator only works on Windows.
with open(os.path.join('book', 'chapter - 14.txt'), 'r') as reader:
    chapter_14 = reader.read()


### Splitting the corpus with a training ratio of 0.60, a validation ratio of 0.20 and a testing ratio of 0.20
train_ratio = 0.60;
validation_ratio = 0.20;
test_ratio = 0.20

1. chapter 1 - 13815
Training set - 8289 (60%)
Validation set - 2763 (20%)
Test set - 2763 (20%)

2. chapter 2 - 6936
Training set - 4162 (60%)
Validation set - 1387 (20%)
Test set - 1387 (20%)

3. chapter 3 - 13542
Training set - 8125 (60%)
Validation set - 2708 (20%)
Test set - 2708 (20%)

4. chapter 4 - 11964
Training set - 7178 (60%)
Validation set - 2393 (20%)
Test set - 2393 (20%)


5. chapter 6 - 11951
Training set - 7171 (60%)
Validation set - 2390 (20%)
Test set - 2390 (20%)

6. chapter 7 - 11207
Training set - 6725 (60%)
Validation set - 2241 (20%)
Test set - 2241 (20%)


7. chapter 11 - 11523
Training set - 6914 (60%)
Validation set - 2305 (20%)
Test set - 2305 (20%)

8. chapter 12 - 13084
Training set - 7850 (60%)
Validation set - 2617 (20%)
Test set - 2617 (20%)


9. chapter 14 - 18405
Training set - 11043 (60%)
Validation set - 3681 (20%)
Test set - 3681 (20%)


10. Merging all training sets into `training set.txt`, all validation sets into `validation set.txt`, and all testing sets into `testing set.txt`


In [12]:
# Corpus split proportions: 60% training, 20% validation, 20% testing.
train_ratio, validation_ratio, test_ratio = 0.60, 0.20, 0.20

In [13]:
# Read the merged training, validation and testing texts.
# Paths are built with os.path.join so they resolve on every operating
# system; a literal backslash separator only works on Windows.
with open(os.path.join('training_data', 'training set.txt'), 'r') as reader:
    train_set = reader.read()

with open(os.path.join('training_data', 'validation set.txt'), 'r') as reader:
    validation_set = reader.read()

with open(os.path.join('training_data', 'testing set.txt'), 'r') as reader:
    testing_set = reader.read()

In [14]:
# Approximate word counts: the text is split on single spaces only, so
# newlines do not separate words and consecutive spaces yield empty items.
train_word_count = len(train_set.split(" "))
validation_word_count = len(validation_set.split(" "))
testing_word_count = len(testing_set.split(" "))

print("Training set has approximately " + str(train_word_count) + " words.")
print("Validation set has approximately " + str(validation_word_count) + " words.")
print("Testing set has approximately " + str(testing_word_count) + " words.")

Training set has approximately 67130 words.
Validation set has approximately 22064 words.
Testing set has approximately 23298 words.


### Sentence Segmentation for training set, validation set and testing set

In [15]:
# Run each split through the spaCy pipeline and collect its sentences as
# stripped strings. `Span.text` is used rather than `Span.string`, which was
# removed in spaCy 3; once stripped, both give the same sentence text.

# Training set
doc_1 = nlp(train_set)
sentences_1 = [sent.text.strip() for sent in doc_1.sents]

# Validation set
doc_2 = nlp(validation_set)
sentences_2 = [sent.text.strip() for sent in doc_2.sents]

# Testing set
doc_3 = nlp(testing_set)
sentences_3 = [sent.text.strip() for sent in doc_3.sents]

In [16]:
sentences_1

['the potential for psychologists to assist the legal system has been recognized since the early twentieth century , but only within the past fifty years has psychology begun to realize this potential in meaningful ways .',
 'this progress has included newly developed professional organizations , such as the american psychology law society and the international association for correctional and forensic psychology ; graduate , internship , and fellowship programs in the specialty area ; organizations devoted to certifying qualified practitioners , such as the american board of forensic psychology and the american board of police and public safety psychology ; such scientific journals as law and human behavior , behavioral sciences and the law , and criminal justice and behavior ; and books devoted to the interface of psychology and law .',
 'this specialty area has continued to grow rapidly since the previous edition of the handbook of forensic psychology was published in two thousand a

In [17]:
sentences_2

['chapter five describes training models and resources in forensic psychology for faculty developing programs of instruction and for students and general practitioners seeking specialized education or supervised experience in forensic psychology .',
 'part two comprises five chapters concerning applications of psychology in civil proceedings .',
 'chapter six addresses family law procedures and issues related to conducting evaluations of children and their parents involved in disputed custody .',
 'chapter twelve traces the development and current applications of the concepts of criminal responsibility and legal insanity .',
 'chapter thirteen delineates the related nuances of criminal intent and diminished capacity .',
 'part four presents information on seven special applications of forensic psychology .',
 'chapter fourteen leads off with a discussion of violence risk research and assessment , and chapter fifteen follows with an overview of emerging roles for psychologists in law en

In [18]:
sentences_3

['this fourth edition of the handbook of forensic psychology , like its predecessors , aims to provide an authoritative and comprehensive resource for understanding the theoretical foundations of forensic psychology , becoming familiar with the expanding research base in this specialty , and learning to apply forensic concepts artfully in everyday practice .',
 'to this end , the contributors to this volume , as in the prior three editions , are accomplished scholars and practitioners in their respective areas .',
 'some are prominent academicians who conduct research and offer consultation .',
 'others are actively engaged service providers who also make significant contributions to the literature .',
 'several have degrees in law as well as psychology .',
 'these authors were asked to delineate the enduring issues in an area of their specialty and frame these issues in the light of contemporary research and prevailing conceptual formations .',
 'although similar in focus and structur

In [19]:
# Pair every space-separated token with its successor, one sentence at a
# time, so that no bigram crosses a sentence boundary.
bigrams_1 = [
    pair
    for tokens in (sentence.split(" ") for sentence in sentences_1)
    for pair in zip(tokens, tokens[1:])
]
# Discard every pair in which either token is one of the marks ? : , ; .
bigram_trainset = [
    pair for pair in bigrams_1
    if not any(mark in pair for mark in ("?", ":", ",", ";", "."))
]

In [20]:
# Pair every space-separated token with its successor, one sentence at a
# time, so that no bigram crosses a sentence boundary.
bigrams_2 = [
    pair
    for tokens in (sentence.split(" ") for sentence in sentences_2)
    for pair in zip(tokens, tokens[1:])
]
# Discard every pair in which either token is one of the marks ? : , ; .
bigram_validation_set = [
    pair for pair in bigrams_2
    if not any(mark in pair for mark in ("?", ":", ",", ";", "."))
]

In [21]:
# Pair every space-separated token with its successor, one sentence at a
# time, so that no bigram crosses a sentence boundary.
bigrams_3 = [
    pair
    for tokens in (sentence.split(" ") for sentence in sentences_3)
    for pair in zip(tokens, tokens[1:])
]
# Discard every pair in which either token is one of the marks ? : , ; .
bigram_test_set = [
    pair for pair in bigrams_3
    if not any(mark in pair for mark in ("?", ":", ",", ";", "."))
]

In [22]:
bigram_trainset

[('the', 'potential'),
 ('potential', 'for'),
 ('for', 'psychologists'),
 ('psychologists', 'to'),
 ('to', 'assist'),
 ('assist', 'the'),
 ('the', 'legal'),
 ('legal', 'system'),
 ('system', 'has'),
 ('has', 'been'),
 ('been', 'recognized'),
 ('recognized', 'since'),
 ('since', 'the'),
 ('the', 'early'),
 ('early', 'twentieth'),
 ('twentieth', 'century'),
 ('but', 'only'),
 ('only', 'within'),
 ('within', 'the'),
 ('the', 'past'),
 ('past', 'fifty'),
 ('fifty', 'years'),
 ('years', 'has'),
 ('has', 'psychology'),
 ('psychology', 'begun'),
 ('begun', 'to'),
 ('to', 'realize'),
 ('realize', 'this'),
 ('this', 'potential'),
 ('potential', 'in'),
 ('in', 'meaningful'),
 ('meaningful', 'ways'),
 ('this', 'progress'),
 ('progress', 'has'),
 ('has', 'included'),
 ('included', 'newly'),
 ('newly', 'developed'),
 ('developed', 'professional'),
 ('professional', 'organizations'),
 ('such', 'as'),
 ('as', 'the'),
 ('the', 'american'),
 ('american', 'psychology'),
 ('psychology', 'law'),
 ('law', 

In [23]:
bigram_validation_set

[('chapter', 'five'),
 ('five', 'describes'),
 ('describes', 'training'),
 ('training', 'models'),
 ('models', 'and'),
 ('and', 'resources'),
 ('resources', 'in'),
 ('in', 'forensic'),
 ('forensic', 'psychology'),
 ('psychology', 'for'),
 ('for', 'faculty'),
 ('faculty', 'developing'),
 ('developing', 'programs'),
 ('programs', 'of'),
 ('of', 'instruction'),
 ('instruction', 'and'),
 ('and', 'for'),
 ('for', 'students'),
 ('students', 'and'),
 ('and', 'general'),
 ('general', 'practitioners'),
 ('practitioners', 'seeking'),
 ('seeking', 'specialized'),
 ('specialized', 'education'),
 ('education', 'or'),
 ('or', 'supervised'),
 ('supervised', 'experience'),
 ('experience', 'in'),
 ('in', 'forensic'),
 ('forensic', 'psychology'),
 ('part', 'two'),
 ('two', 'comprises'),
 ('comprises', 'five'),
 ('five', 'chapters'),
 ('chapters', 'concerning'),
 ('concerning', 'applications'),
 ('applications', 'of'),
 ('of', 'psychology'),
 ('psychology', 'in'),
 ('in', 'civil'),
 ('civil', 'proceeding

In [24]:
bigram_test_set

[('this', 'fourth'),
 ('fourth', 'edition'),
 ('edition', 'of'),
 ('of', 'the'),
 ('the', 'handbook'),
 ('handbook', 'of'),
 ('of', 'forensic'),
 ('forensic', 'psychology'),
 ('like', 'its'),
 ('its', 'predecessors'),
 ('aims', 'to'),
 ('to', 'provide'),
 ('provide', 'an'),
 ('an', 'authoritative'),
 ('authoritative', 'and'),
 ('and', 'comprehensive'),
 ('comprehensive', 'resource'),
 ('resource', 'for'),
 ('for', 'understanding'),
 ('understanding', 'the'),
 ('the', 'theoretical'),
 ('theoretical', 'foundations'),
 ('foundations', 'of'),
 ('of', 'forensic'),
 ('forensic', 'psychology'),
 ('becoming', 'familiar'),
 ('familiar', 'with'),
 ('with', 'the'),
 ('the', 'expanding'),
 ('expanding', 'research'),
 ('research', 'base'),
 ('base', 'in'),
 ('in', 'this'),
 ('this', 'specialty'),
 ('and', 'learning'),
 ('learning', 'to'),
 ('to', 'apply'),
 ('apply', 'forensic'),
 ('forensic', 'concepts'),
 ('concepts', 'artfully'),
 ('artfully', 'in'),
 ('in', 'everyday'),
 ('everyday', 'practice'

### Saving into text file (training set, validation set and testing set)

In [25]:
# Save the training-set bigrams as JSON (each tuple is written as a
# two-element array). The context manager guarantees the file is flushed
# and closed instead of leaving an open handle behind.
d1 = bigram_trainset
with open("bigrams_train_set.txt", 'w') as out_file:
    json.dump(d1, out_file)

In [26]:
# Save the validation-set bigrams as JSON (each tuple is written as a
# two-element array). The context manager guarantees the file is flushed
# and closed instead of leaving an open handle behind.
d2 = bigram_validation_set
with open("bigrams_validation_set.txt", 'w') as out_file:
    json.dump(d2, out_file)

In [27]:
# Save the testing-set bigrams as JSON (each tuple is written as a
# two-element array). The context manager guarantees the file is flushed
# and closed instead of leaving an open handle behind.
d3 = bigram_test_set
with open("bigrams_test_set.txt", 'w') as out_file:
    json.dump(d3, out_file)

### Getting bigram frequency on training set, validation set and testing set

In [28]:
# Training set
# Tokenise on runs of word characters and count adjacent token pairs.
# Punctuation is discarded by the regex, so counted pairs can span
# punctuation marks and sentence boundaries. The pattern is a raw string so
# that "\w" reaches the regex engine rather than being treated as an
# (invalid) string escape.
words_train_set = re.findall(r"\w+", train_set)
bigram_freq_training_set = Counter(zip(words_train_set, islice(words_train_set, 1, None)))
print(bigram_freq_training_set)






In [29]:
# Validation set
# Tokenise on runs of word characters and count adjacent token pairs.
# Punctuation is discarded by the regex, so counted pairs can span
# punctuation marks and sentence boundaries. The pattern is a raw string so
# that "\w" reaches the regex engine rather than being treated as an
# (invalid) string escape.
words_validation_set = re.findall(r"\w+", validation_set)
bigram_freq_validation_set = Counter(zip(words_validation_set, islice(words_validation_set, 1, None)))
print(bigram_freq_validation_set)






In [30]:
# Test set
# Tokenise on runs of word characters and count adjacent token pairs.
# Punctuation is discarded by the regex, so counted pairs can span
# punctuation marks and sentence boundaries. The pattern is a raw string so
# that "\w" reaches the regex engine rather than being treated as an
# (invalid) string escape.
words_test_set = re.findall(r"\w+", testing_set)
bigram_freq_test_set = Counter(zip(words_test_set, islice(words_test_set, 1, None)))
print(bigram_freq_test_set)

Counter({('of', 'the'): 207, ('to', 'the'): 114, ('in', 'the'): 105, ('that', 'the'): 61, ('and', 'the'): 59, ('on', 'the'): 56, ('it', 'is'): 56, ('in', 'a'): 46, ('the', 'plaintiff'): 45, ('the', 'defendant'): 40, ('to', 'be'): 38, ('by', 'the'): 35, ('with', 'the'): 33, ('of', 'a'): 31, ('for', 'the'): 30, ('the', 'legal'): 29, ('at', 'the'): 28, ('as', 'a'): 28, ('the', 'law'): 27, ('is', 'not'): 26, ('mental', 'health'): 26, ('child', 'custody'): 26, ('forensic', 'psychology'): 25, ('may', 'be'): 25, ('can', 'be'): 23, ('should', 'be'): 23, ('to', 'a'): 22, ('in', 'this'): 21, ('the', 'court'): 21, ('his', 'or'): 21, ('or', 'her'): 21, ('risk', 'factors'): 21, ('has', 'been'): 20, ('of', 'these'): 20, ('the', 'evaluation'): 20, ('the', 'first'): 19, ('in', 'which'): 19, ('that', 'is'): 19, ('do', 'not'): 18, ('about', 'the'): 18, ('the', 'expert'): 17, ('is', 'a'): 17, ('of', 'violence'): 17, ('of', 'forensic'): 16, ('to', 'provide'): 16, ('based', 'on'): 16, ('have', 'been'): 15,




In [31]:
# Flatten each (first, second) key into one "first;second" string so the
# counts can be written as JSON objects, whose keys must be strings.
bigram_freq_train = {
    ';'.join(pair): count for pair, count in bigram_freq_training_set.items()
}
bigram_freq_validation = {
    ';'.join(pair): count for pair, count in bigram_freq_validation_set.items()
}
bigram_freq_test = {
    ';'.join(pair): count for pair, count in bigram_freq_test_set.items()
}

In [32]:
bigram_freq_train

{'the;potential': 10,
 'potential;for': 8,
 'for;psychologists': 7,
 'psychologists;to': 13,
 'to;assist': 21,
 'assist;the': 7,
 'the;legal': 74,
 'legal;system': 36,
 'system;has': 2,
 'has;been': 65,
 'been;recognized': 2,
 'recognized;since': 1,
 'since;the': 9,
 'the;early': 14,
 'early;twentieth': 2,
 'twentieth;century': 10,
 'century;but': 1,
 'but;only': 4,
 'only;within': 3,
 'within;the': 19,
 'the;past': 18,
 'past;fifty': 2,
 'fifty;years': 2,
 'years;has': 2,
 'has;psychology': 1,
 'psychology;begun': 1,
 'begun;to': 3,
 'to;realize': 1,
 'realize;this': 1,
 'this;potential': 1,
 'potential;in': 1,
 'in;meaningful': 1,
 'meaningful;ways': 1,
 'ways;this': 1,
 'this;progress': 1,
 'progress;has': 1,
 'has;included': 1,
 'included;newly': 1,
 'newly;developed': 1,
 'developed;professional': 1,
 'professional;organizations': 5,
 'organizations;such': 5,
 'such;as': 57,
 'as;the': 60,
 'the;american': 22,
 'american;psychology': 3,
 'psychology;law': 3,
 'law;society': 3,
 's

In [33]:
# Save the three bigram-frequency dictionaries as JSON. Each file is opened
# in a context manager so it is flushed and closed instead of leaving an
# open handle behind.
d_1 = bigram_freq_train
with open("bigram_freq_trainset.txt", 'w') as out_file:
    json.dump(d_1, out_file)

d_2 = bigram_freq_validation
with open("bigram_freq_validationset.txt", 'w') as out_file:
    json.dump(d_2, out_file)

d_3 = bigram_freq_test
with open("bigram_freq_testset.txt", 'w') as out_file:
    json.dump(d_3, out_file)

### Combine training and validation set in order to increase performance.

In [34]:
# New list holding the training bigrams followed by the validation bigrams.
d_combine_class = [*d1, *d2]

In [35]:
# Report how many bigrams each split contains.
train_bigram_total = len(d1)
validation_bigram_total = len(d2)
testing_bigram_total = len(d3)

print("Training set has approximately " + str(train_bigram_total) + " bigrams.")
print("Validation set has approximately " + str(validation_bigram_total) + " bigrams.")
print("Testing set has approximately " + str(testing_bigram_total) + " bigrams.")

Training set has approximately 55114 bigrams.
Validation set has approximately 18064 bigrams.
Testing set has approximately 19316 bigrams.


In [36]:
# Report the size of the merged training + validation bigram list.
combined_bigram_total = len(d_combine_class)
print("Combined training and validation set that have approximately " + str(combined_bigram_total) + " bigrams.")

Combined training and validation set that have approximately 73178 bigrams.


In [37]:
# Save the combined bigram list as JSON in bigrams_combined_set.txt.
# The context manager guarantees the file is flushed and closed instead of
# leaving an open handle behind.
with open("bigrams_combined_set.txt", 'w') as out_file:
    json.dump(d_combine_class, out_file)

In [38]:
# Join the two texts with a newline so that the last token of the training
# text and the first token of the validation text are not fused into a
# single word when the training text does not end in whitespace or
# punctuation.
combined_set = train_set + "\n" + validation_set

In [39]:
# Creating bigrams frequency - combined set
# The pattern is a raw string so that "\w" reaches the regex engine rather
# than being treated as an (invalid) string escape.
words_combined_set = re.findall(r"\w+", combined_set)
# Count adjacent token pairs under a separate name so that
# bigram_freq_combined_set only ever refers to the final string-keyed dict.
combined_pair_counts = Counter(zip(words_combined_set, islice(words_combined_set, 1, None)))
# Flatten each (first, second) key to "first;second": JSON object keys must
# be strings.
bigram_freq_combined_set = {';'.join(pair): count for pair, count in combined_pair_counts.items()}
d_combined_set = bigram_freq_combined_set
# The context manager guarantees the file is flushed and closed.
with open("bigram_freq_combined_set.txt", 'w') as out_file:
    json.dump(d_combined_set, out_file)