### Importing libraries

In [1]:
# setting corpus directory
import os
from os import listdir
from os.path import isfile, join

# save into text file with json
import json

# spacy for tokenize document
import spacy
from collections import Counter

# Regular expression to remove puntuation.
import re

# Spacy to extract english texts
nlp = spacy.load("en")

In [2]:
# Load all the book part
def load_book(path):    
    input_file = os.path.join(path)
    with open(input_file) as f:
        chapter = f.read()
    return chapter

In [3]:
# Collect all of the chapters file names
path = './book/'
chapter_files = [f for f in listdir(path) if isfile(join(path, f))]

In [4]:
chapter_files

['chapter - 1.txt',
 'chapter - 11.txt',
 'chapter - 12.txt',
 'chapter - 14.txt',
 'chapter - 2.txt',
 'chapter - 3.txt',
 'chapter - 4.txt',
 'chapter - 6.txt',
 'chapter - 7.txt']

In [5]:
# Load the book using the file names
chapters = []
for chapter in chapter_files:
    chapters.append(load_book(path+chapter))

In [6]:
# Getting number of words in each chapter from the book 
total_word = 0
for i in range(len(chapters)):
    print("The number of {} words in chapter of named {}.".format(len(chapters[i].split()), chapter_files[i]))
    total_word += len(chapters[i].split())

The number of 13815 words in chapter of named chapter - 1.txt.
The number of 11523 words in chapter of named chapter - 11.txt.
The number of 13084 words in chapter of named chapter - 12.txt.
The number of 18405 words in chapter of named chapter - 14.txt.
The number of 6936 words in chapter of named chapter - 2.txt.
The number of 13542 words in chapter of named chapter - 3.txt.
The number of 11964 words in chapter of named chapter - 4.txt.
The number of 11951 words in chapter of named chapter - 6.txt.
The number of 11207 words in chapter of named chapter - 7.txt.


In [7]:
print("The total of {} words in the corpus.".format(total_word))

The total of 112427 words in the corpus.


In [8]:
# Getting part 1
with open('book\\chapter - 1.txt', 'r') as reader:
    chapter_1 = reader.read()

with open('book\\chapter - 2.txt', 'r') as reader:
    chapter_2 = reader.read()

with open('book\\chapter - 3.txt', 'r') as reader:
    chapter_3 = reader.read()

with open('book\\chapter - 4.txt', 'r') as reader:
    chapter_4 = reader.read()


In [9]:
# Getting part 2
with open('book\\chapter - 6.txt', 'r') as reader:
    chapter_6 = reader.read()

with open('book\\chapter - 7.txt', 'r') as reader:
    chapter_7 = reader.read()
    

In [10]:
# Getting part 3
with open('book\\chapter - 11.txt', 'r') as reader:
    chapter_11 = reader.read()

with open('book\\chapter - 12.txt', 'r') as reader:
    chapter_12 = reader.read()

In [11]:
# Getting part 4
with open('book\\chapter - 14.txt', 'r') as reader:
    chapter_14 = reader.read()


### Splitting corpus into train ratio of 0.60, validation ratio of 0.20 and testing ratio with 0.20
train_ratio = 0.60;
validation_ratio = 0.20;
test_ratio = 0.20

1. chapter 1 - 13815
Training set - 8289 (60%)
Validation set - 2763 (20%)
Test set - 2763 (20%)

2. chapter 2 - 6936 
training set - 4162 (60%)
validation set - 1387 (20%)
test set - 1387 (20%)

3. chapter 3 - 13542
Training set - 8125 (60%)
Validation set - 2708 (20%)
Test set - 2708 (20%)

4. chapter 4 - 11964
Training set - 7178 (60%)
Validation set - 2393 (20%)
Test set - 2393 (20%)


5. chapter 6 - 11951
Training set - 7171 (60%)
Validation set - 2390 (20%)
Test set - 2390 (20%)

6. chapter 7 - 11207
Training set - 6725 (60%)
Validation set - 2241 (20%)
Test set - 2241 (20%)


7. chapter 11 - 11523
Training set - 6914 (60%)
Validation set - 2763 (20%)
Test set - 2763 (20%)

8. chapter 12 - 13084
Training set - 7850 (60%)
Validation set - 2617 (20%)
Test set - 2617 (20%)


9. chapter 14 - 18405
Training set - 11043 (60%)
Validation set - 3681 (20%)
Test set - 3681 (20%)


5. Merging all training set into trainingset.txt, validation set into validation.txt, and testing set into test.txt


In [12]:
train_ratio = 0.60
validation_ratio = 0.20
test_ratio = 0.20

In [15]:
with open('training_data\\training set.txt', 'r') as reader:
    train_set = reader.read()
    
with open('training_data\\validation set.txt', 'r') as reader:
    validation_set = reader.read()
    
with open('training_data\\testing set.txt', 'r') as reader:
    testing_set = reader.read()

In [16]:
print("Training set has approximately " + str(len(train_set.split(" "))) + " words.")
print("Validation set has approximately " + str(len(validation_set.split(" "))) + " words.")
print("Testing set has approximately " + str(len(testing_set.split(" "))) + " words.")

Training set has approximately 67130 words.
Validation set has approximately 22064 words.
Testing set has approximately 23298 words.


### Creating dictionary (to get frequency) for training set, validation set and testing set

#### Remove puntuation (i.e. !.?) 

In [17]:
def remove_punctuation(pattern,phrase):
    for pat in pattern:
        return(re.findall(pat,phrase))
        return('\n')
            
# Capturing pattern like .!?
pattern=['[^!.?]+']

In [18]:
train_set = "".join(remove_punctuation(pattern,train_set))
validation_set = "".join(remove_punctuation(pattern,validation_set))
testing_set = "".join(remove_punctuation(pattern,testing_set))

#### Word Tokenize in SpaCy
This step is similar to previous procedure to remove puntuation, but this is in tokenize format, texts may not be removed completely without word tokenization. 
Texts will empty strings will also be removed in this step.

In [19]:
# Training set
doc_train_set = nlp(train_set)
tokenzie_texts_trainset = [token.text for token in doc_train_set]

# Validation set
doc_validation_set = nlp(validation_set)
tokenzie_texts_validation_set = [token.text for token in doc_validation_set]

# Testing set
doc_test_set = nlp(testing_set)
tokenzie_texts_test_set = [token.text for token in doc_test_set]


In [20]:
print(tokenzie_texts_trainset)






In [21]:
print(tokenzie_texts_validation_set)






In [22]:
print(tokenzie_texts_test_set)

['this', 'fourth', 'edition', 'of', 'the', 'handbook', 'of', 'forensic', 'psychology', ',', 'like', 'its', 'predecessors', ',', 'aims', 'to', 'provide', 'an', 'authoritative', 'and', 'comprehensive', 'resource', 'for', 'understanding', 'the', 'theoretical', 'foundations', 'of', 'forensic', 'psychology', ',', 'becoming', 'familiar', 'with', 'the', 'expanding', 'research', 'base', 'in', 'this', 'specialty', ',', 'and', 'learning', 'to', 'apply', 'forensic', 'concepts', 'artfully', 'in', 'everyday', 'practice', ' ', 'to', 'this', 'end', ',', 'the', 'contributors', 'to', 'this', 'volume', ',', 'as', 'in', 'the', 'prior', 'three', 'editions', ',', 'are', 'accomplished', 'scholars', 'and', 'practitioners', 'in', 'their', 'respective', 'areas', ' ', 'some', 'are', 'prominent', 'academicians', 'who', 'conduct', 'research', 'and', 'offer', 'consultation', ' ', 'others', 'are', 'actively', 'engaged', 'service', 'providers', 'who', 'also', 'make', 'significant', 'contributions', 'to', 'the', 'lit




#### Remove puntuation to get frequency distribution

In [23]:
nonPunct = re.compile('.*[A-Za-z0-9].*')

# trainset
filtered_texts_trainset = [w for w in tokenzie_texts_trainset if nonPunct.match(w)]
counts_train_set = Counter(filtered_texts_trainset)

# validation set
filtered_texts_validation_set = [w for w in tokenzie_texts_validation_set if nonPunct.match(w)]
counts_validation_set = Counter(filtered_texts_validation_set)

# test set
filtered_texts_testset = [w for w in tokenzie_texts_test_set if nonPunct.match(w)]
counts_test_set = Counter(filtered_texts_testset)

In [24]:
counts_train_set

Counter({'the': 4081,
         'potential': 36,
         'for': 598,
         'psychologists': 200,
         'to': 1835,
         'assist': 33,
         'legal': 284,
         'system': 89,
         'has': 170,
         'been': 169,
         'recognized': 7,
         'since': 22,
         'early': 34,
         'twentieth': 10,
         'century': 16,
         'but': 121,
         'only': 71,
         'within': 35,
         'past': 24,
         'fifty': 25,
         'years': 35,
         'psychology': 176,
         'begun': 3,
         'realize': 1,
         'this': 354,
         'in': 1528,
         'meaningful': 7,
         'ways': 25,
         'progress': 8,
         'included': 21,
         'newly': 2,
         'developed': 33,
         'professional': 65,
         'organizations': 10,
         'such': 151,
         'as': 526,
         'american': 32,
         'law': 234,
         'society': 12,
         'and': 1952,
         'international': 4,
         'association': 14,
         

In [25]:
counts_validation_set

Counter({'chapter': 18,
         'five': 9,
         'describes': 3,
         'training': 8,
         'models': 3,
         'and': 641,
         'resources': 2,
         'in': 508,
         'forensic': 69,
         'psychology': 39,
         'for': 191,
         'faculty': 1,
         'developing': 3,
         'programs': 3,
         'of': 783,
         'instruction': 1,
         'students': 11,
         'general': 19,
         'practitioners': 11,
         'seeking': 2,
         'specialized': 4,
         'education': 6,
         'or': 221,
         'supervised': 2,
         'experience': 11,
         'part': 13,
         'two': 29,
         'comprises': 2,
         'chapters': 5,
         'concerning': 12,
         'applications': 3,
         'civil': 18,
         'proceedings': 15,
         'six': 12,
         'addresses': 4,
         'family': 7,
         'law': 71,
         'procedures': 7,
         'issues': 28,
         'related': 21,
         'to': 615,
         'conducting': 4

In [26]:
counts_test_set

Counter({'this': 121,
         'fourth': 2,
         'edition': 5,
         'of': 873,
         'the': 1484,
         'handbook': 2,
         'forensic': 82,
         'psychology': 63,
         'like': 5,
         'its': 34,
         'predecessors': 1,
         'aims': 2,
         'to': 632,
         'provide': 29,
         'an': 122,
         'authoritative': 1,
         'and': 596,
         'comprehensive': 8,
         'resource': 1,
         'for': 202,
         'understanding': 18,
         'theoretical': 2,
         'foundations': 2,
         'becoming': 3,
         'familiar': 7,
         'with': 139,
         'expanding': 1,
         'research': 60,
         'base': 1,
         'in': 513,
         'specialty': 10,
         'learning': 4,
         'apply': 3,
         'concepts': 3,
         'artfully': 1,
         'everyday': 1,
         'practice': 30,
         'end': 5,
         'contributors': 2,
         'volume': 5,
         'as': 176,
         'prior': 17,
         'three'

#### Save into text files named dictionary_freq

In [27]:
d1 = counts_train_set
json.dump(d1, open("dictionary_freq_trainset.txt",'w'))

d2 = counts_validation_set
json.dump(d2, open("dictionary_freq_validationset.txt",'w'))

d3 = counts_test_set
json.dump(d3, open("dictionary_freq_testset.txt",'w'))

In [28]:
# To read json
fd_1 = json.load(open("dictionary_freq_trainset.txt"))
fd_2 = json.load(open("dictionary_freq_validationset.txt"))
fd_3 = json.load(open("dictionary_freq_testset.txt"))

In [29]:
fd_1.items()



In [30]:
fd_2.items()



In [31]:
fd_3.items()

dict_items([('this', 121), ('fourth', 2), ('edition', 5), ('of', 873), ('the', 1484), ('handbook', 2), ('forensic', 82), ('psychology', 63), ('like', 5), ('its', 34), ('predecessors', 1), ('aims', 2), ('to', 632), ('provide', 29), ('an', 122), ('authoritative', 1), ('and', 596), ('comprehensive', 8), ('resource', 1), ('for', 202), ('understanding', 18), ('theoretical', 2), ('foundations', 2), ('becoming', 3), ('familiar', 7), ('with', 139), ('expanding', 1), ('research', 60), ('base', 1), ('in', 513), ('specialty', 10), ('learning', 4), ('apply', 3), ('concepts', 3), ('artfully', 1), ('everyday', 1), ('practice', 30), ('end', 5), ('contributors', 2), ('volume', 5), ('as', 176), ('prior', 17), ('three', 22), ('editions', 2), ('are', 134), ('accomplished', 2), ('scholars', 4), ('practitioners', 5), ('their', 83), ('respective', 1), ('areas', 16), ('some', 39), ('prominent', 2), ('academicians', 1), ('who', 58), ('conduct', 15), ('offer', 7), ('consultation', 6), ('others', 6), ('actively

### Adding validation set into training set (combine frequency)

In [32]:
# Combined training set and validation set
fd_combined_set = d1 + d2

In [33]:
fd_combined_set

Counter({'the': 5422,
         'potential': 48,
         'for': 789,
         'psychologists': 263,
         'to': 2450,
         'assist': 39,
         'legal': 362,
         'system': 107,
         'has': 227,
         'been': 219,
         'recognized': 7,
         'since': 27,
         'early': 38,
         'twentieth': 12,
         'century': 19,
         'but': 151,
         'only': 102,
         'within': 47,
         'past': 33,
         'fifty': 32,
         'years': 47,
         'psychology': 215,
         'begun': 3,
         'realize': 1,
         'this': 449,
         'in': 2036,
         'meaningful': 10,
         'ways': 31,
         'progress': 12,
         'included': 27,
         'newly': 2,
         'developed': 40,
         'professional': 81,
         'organizations': 14,
         'such': 201,
         'as': 696,
         'american': 47,
         'law': 305,
         'society': 13,
         'and': 2593,
         'international': 4,
         'association': 16,
     

In [34]:
json.dump(fd_combined_set, open("dictionary_freq_combinedset.txt",'w'))

In [35]:
fd_combine = json.load(open("dictionary_freq_combinedset.txt"))

In [36]:
fd_combine.items()

