In [135]:
#import
import os.path
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn as s
import dataframe_image as dfi
import networkx as nx
import mantel

In [136]:
def load_data(path,file_name):
    """
    Input  : path and file_name
    Purpose: loading text file
    Output : list of paragraphs/documents and
             title(initial 100 words considred as title of document)
    """
    documents_list = []
    titles=[]
    document = ''
    i=0
    with open( os.path.join(path, file_name) ,"r") as fin:
        data = fin.read()
        words = data.split()
        fin.seek(0)
        print('Number of words in text file :', len(words))
        for line in fin.readlines():
            text = line.strip()
            i=i+1
            if text !='':
                document = document+text
                if text == 'NEW_ARTICLE':
                    documents_list.append(document)
                    document = ''
        documents_list.append(document)
    print("Total Number of Documents:",len(documents_list))
    titles.append( text[0:min(len(text),100)] )
    return documents_list,titles,len(words)

In [137]:
document_list,titles,word_count_a=load_data("","law-texts-all.txt")

Number of words in text file : 45105
Total Number of Documents: 3


In [138]:
def preprocess_data(doc_set):
    """
    Input  : docuemnt list
    Purpose: preprocess text (tokenize, removing stopwords, and stemming)
    Output : preprocessed text
    """
    # initialize regex tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words list
    en_stop = set(stopwords.words('english'))
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    # list for tokenized documents in loop
    texts = []
    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        tokens = [i for i in tokens if len(i)>3]
        print(tokens)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # add tokens to list
        texts.append(stemmed_tokens)
    return texts

In [139]:
clean_text_a=preprocess_data(document_list)

['regina', 'secretary', 'state', 'homedepartment', 'parte', 'hindley', 'court', 'appeal', '2000', '152hearing', 'dates', 'october', 'november', '19985', 'november', '1998catchwords', 'prisons', 'prisoners', 'rights', 'release', 'licence', 'mandatory', 'life', 'sentence', 'prisoner', 'tariff', 'element', 'determinate', 'length', 'provisionally', 'fixed', 'communicated', 'prisoner', 'whole', 'life', 'tariff', 'subsequently', 'fixed', 'communicated', 'policy', 'review', 'whole', 'life', 'tariff', 'limited', 'considerations', 'retribution', 'deterrence', 'later', 'amended', 'take', 'account', 'exceptional', 'circumstances', 'whether', 'increase', 'from', 'determinate', 'tariff', 'whole', 'life', 'tariff', 'lawful', 'whether', 'review', 'policy', 'lawful', 'criminal', 'justice', '1991', 'crime', 'sentences', '1997', '29headnote', '1966', 'applicant', 'defendant', 'were', 'convicted', 'murder', 'children', 'received', 'mandatory', 'life', 'sentences', 'applicant', 'also', 'convicted', 'being

In [140]:
def prepare_corpus(doc_clean):
    """
    Input  : clean document
    Purpose: create term dictionary of our courpus and Converting list of documents (corpus) into Document Term Matrix
    Output : term dictionary and Document Term Matrix
    """
    # Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
    dictionary = corpora.Dictionary(doc_clean)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # generate LDA model
    return dictionary,doc_term_matrix

In [141]:
def create_gensim_lsa_model(doc_clean,number_of_topics,words):
    """
    Input  : clean document, number of topics and number of words associated with each topic
    Purpose: create LSA model using gensim
    Output : return LSA model
    """
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    # generate LSA model
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
    #print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return lsamodel

In [142]:
# LSA Model
number_of_topics=1
words=word_count_a
model=create_gensim_lsa_model(clean_text_a,number_of_topics,words)
words_law_article = dict(model.show_topic(0, topn=words))##dict and encoding matrix values

In [143]:
top_words_article = dict(model.show_topic(0, topn=5)) 

In [144]:
document_list,titles,word_count_b=load_data("","non-law-all.txt")

Number of words in text file : 34133
Total Number of Documents: 4


In [145]:
clean_text_b=preprocess_data(document_list)

['hardest', 'case', 'myra', 'hindley', 'life', 'sentences', 'rule', 'lawmyra', 'hindley', 'still', 'prison', 'years', 'after', 'conviction', 'twomurders', 'perpetrated', 'with', 'lover', 'brady', 'these', 'crimes', 'known', 'asthe', 'moors', 'murders', 'deeply', 'embedded', 'collective', 'consciousnessof', 'society', 'exaggeration', 'that', 'hindley', 'popularlyconsidered', 'embodiment', 'evil', 'over', 'last', 'years', 'hasmounted', 'legal', 'challenges', 'home', 'secretary', 'right', 'determine', 'thelength', 'detention', 'this', 'paper', 'argue', 'that', 'this', 'single', 'hardcase', 'irrevocable', 'effect', 'penal', 'policy', 'united', 'kingdom', 'thesis', 'that', 'hindley', 'mounted', 'these', 'challenges', 'positionof', 'home', 'secretary', 'hardened', 'main', 'because', 'politicalimpact', 'executive', 'action', 'that', 'could', 'lead', 'release', 'becausehindley', 'mandatory', 'life', 'prisoner', 'case', 'profoundimpact', 'relating', 'such', 'prisoners', 'prevented', 'fromremain

In [146]:
# LSA Model
number_of_topics=1
words=word_count
model=create_gensim_lsa_model(clean_text_b,number_of_topics,words)
words_non_law_article =dict(model.show_topic(0, topn=words))#dict and encoding matrix values

In [147]:
top_non_law_article = dict(model_c.show_topic(0,topn=5)) 

In [148]:
top_words_article

{'tariff': 0.33293849716563323,
 'secretari': 0.33147720211273735,
 'prison': 0.24915911995674675,
 'life': 0.23611708519236568,
 'sentenc': 0.22952296370340297}

In [149]:
top_non_law_article

{'judg': 0.32073578367328615,
 'evid': 0.3092444069593231,
 'west': 0.28040281700236924,
 'trial': 0.2772485745197229,
 'case': 0.22810405823057162}

In [150]:
c={} #law words in non law
for key in top_words_article:
    try:
        c[key] = words_non_law_article[key]
    except KeyError:
        c[key] = 0

In [151]:
c

{'tariff': 0.036483731632387925,
 'secretari': 0.02128251424217142,
 'prison': 0.03639047810748174,
 'life': 0.03741741504765616,
 'sentenc': 0.04646241740792445}

In [152]:
top_non_law_article.update(c)

In [153]:
top_non_law_article

{'judg': 0.32073578367328615,
 'evid': 0.3092444069593231,
 'west': 0.28040281700236924,
 'trial': 0.2772485745197229,
 'case': 0.22810405823057162,
 'tariff': 0.036483731632387925,
 'secretari': 0.02128251424217142,
 'prison': 0.03639047810748174,
 'life': 0.03741741504765616,
 'sentenc': 0.04646241740792445}

In [154]:
b={} #non law words in law
for key in top_non_law_article:
    try:
        b[key] = words_law_article[key]
    except KeyError:
        b[key] = 0

In [155]:
b

{'judg': 0.05935970953578713,
 'evid': 0.009507116299140419,
 'west': 0.0005793405702712192,
 'trial': 0.024182678798960827,
 'case': 0.15545250632649332,
 'tariff': 0.33293849716563323,
 'secretari': 0.33147720211273735,
 'prison': 0.24915911995674675,
 'life': 0.23611708519236568,
 'sentenc': 0.22952296370340297}

In [156]:
top_words_article.update(b)

In [157]:
top_words_article

{'tariff': 0.33293849716563323,
 'secretari': 0.33147720211273735,
 'prison': 0.24915911995674675,
 'life': 0.23611708519236568,
 'sentenc': 0.22952296370340297,
 'judg': 0.05935970953578713,
 'evid': 0.009507116299140419,
 'west': 0.0005793405702712192,
 'trial': 0.024182678798960827,
 'case': 0.15545250632649332}

In [158]:
# LSA Model
number_of_topics = 3 #
words = word_count_a
model = create_gensim_lsa_model(clean_text_a,number_of_topics,words)

a=pd.DataFrame()

for i in range(number_of_topics):
    words_from_law =dict(model.show_topic(i, topn=words))
    b=pd.DataFrame(words_from_law,index=[i])
    a=a.append(b)

a=a.transpose()



df = pd.DataFrame(columns=list(top_words_article.keys()), 
                  index = list(top_words_article.keys()))

l = list(top_words_article.keys())
print('cosine-similarity')
for i in l:
    for j in l:
            matrix=np.array([a.loc[i],a.loc[j]])
            n = s.metrics.pairwise.cosine_similarity(matrix, matrix, dense_output=True)
            df.loc[[i],[j]] = n[0,1]
print(df)

dfi.export(df, 'hello.png')

cosine-similarity
             tariff secretari    prison       life   sentenc      judg  \
tariff            1  0.983798  0.752886   0.981629  0.574831  0.467278   
secretari  0.983798         1  0.684687   0.938672  0.427497  0.337331   
prison     0.752886  0.684687         1   0.853834  0.833889  0.890528   
life       0.981629  0.938672  0.853834          1  0.712817  0.627305   
sentenc    0.574831  0.427497  0.833889   0.712817         1  0.949908   
judg       0.467278  0.337331  0.890528   0.627305  0.949908         1   
evid       0.156692  0.327257 -0.063293  0.0323719 -0.595669 -0.469139   
west       0.227186  0.387768 -0.306324  0.0462224 -0.666282  -0.70468   
trial      0.221077   0.11331  0.801684   0.396159  0.791636  0.940053   
case       0.633599  0.694786  0.736667   0.639247  0.241242    0.3966   

                evid       west     trial      case  
tariff      0.156692   0.227186  0.221077  0.633599  
secretari   0.327257   0.387768   0.11331  0.694786  
priso

In [159]:
df1 = df.to_numpy()

dist1 = list(df1[np.triu_indices(10, k = 1)])

In [161]:
# LSA Model
number_of_topics = 4 #
words = word_count_b
model = create_gensim_lsa_model(clean_text_b,number_of_topics,words)

a=pd.DataFrame()

for i in range(number_of_topics):
    words_from_law =dict(model.show_topic(i, topn=words))
    b=pd.DataFrame(words_from_law,index=[i])
    a=a.append(b)

a=a.transpose()



df = pd.DataFrame(columns=list(top_words_article.keys()), 
                  index = list(top_words_article.keys()))

l = list(top_words_article.keys())
print('cosine-similarity')
for i in l:
    for j in l:
            matrix=np.array([a.loc[i],a.loc[j]])
            n = s.metrics.pairwise.cosine_similarity(matrix, matrix, dense_output=True)
            df.loc[[i],[j]] = n[0,1]
print(df)

dfi.export(df, 'hello.png')

cosine-similarity
              tariff  secretari     prison       life    sentenc       judg  \
tariff             1   0.999325   0.998177   0.996725   0.988737 -0.0312698   
secretari   0.999325          1   0.997174   0.995374   0.993538 -0.0323026   
prison      0.998177   0.997174          1   0.999691    0.98644 -0.0433802   
life        0.996725   0.995374   0.999691          1   0.983745 -0.0610466   
sentenc     0.988737   0.993538    0.98644   0.983745          1 -0.0319897   
judg      -0.0312698 -0.0323026 -0.0433802 -0.0610466 -0.0319897          1   
evid       -0.197841  -0.198978  -0.207032  -0.223472  -0.197032   0.985121   
west        -0.26626  -0.282149  -0.226927  -0.221254  -0.316208   0.513522   
trial     -0.0873411 -0.0917632 -0.0905418  -0.105008  -0.099385   0.985122   
case        0.749558    0.74889   0.754368   0.745563   0.746004   0.594254   

               evid      west      trial      case  
tariff    -0.197841  -0.26626 -0.0873411  0.749558  
secret

In [162]:
df1 = df.to_numpy()

dist2 = list(df1[np.triu_indices(10, k = 1)])

In [163]:
dist1 #law distances

[0.9837981884654161,
 0.7528862881374415,
 0.9816290103657856,
 0.5748312002479229,
 0.4672777595295246,
 0.1566918717016346,
 0.22718595517278378,
 0.22107684886327125,
 0.6335989914282464,
 0.6846866805989494,
 0.938672108494046,
 0.42749740850426415,
 0.33733116372492006,
 0.3272568157886515,
 0.38776767398329276,
 0.11331010630132474,
 0.6947862466092247,
 0.8538338129806589,
 0.8338891598859102,
 0.8905276349582111,
 -0.06329295250987327,
 -0.30632449275263085,
 0.8016839868357507,
 0.7366667190568814,
 0.7128174735658315,
 0.6273047996534886,
 0.032371918870161165,
 0.04622244853796709,
 0.3961591774092781,
 0.6392471553199677,
 0.9499080309199243,
 -0.5956688415513042,
 -0.6662817693523507,
 0.7916360189026661,
 0.2412419041366761,
 -0.4691386544110278,
 -0.704680338943085,
 0.9400525843844477,
 0.3965996026311073,
 0.8517193067060852,
 -0.3637661715496885,
 0.6218602107527732,
 -0.7406813477102862,
 0.29029594882597143,
 0.4278079711266452]

In [164]:
dist2 #non law distances

[0.9993250532072969,
 0.998176557387009,
 0.9967252178687431,
 0.988736807109235,
 -0.031269792147230295,
 -0.19784086963084724,
 -0.2662601529852098,
 -0.08734110564391634,
 0.7495579006105243,
 0.9971737616155294,
 0.9953738701741648,
 0.9935381633556938,
 -0.03230259053465887,
 -0.19897804268523536,
 -0.28214937917420596,
 -0.09176315778314326,
 0.7488897283347095,
 0.9996905684222459,
 0.986439997984196,
 -0.04338016207928843,
 -0.2070315126616561,
 -0.226927030818883,
 -0.09054179556667648,
 0.7543684745889913,
 0.983745073197515,
 -0.06104657167944739,
 -0.2234722017210383,
 -0.221254299431398,
 -0.10500830604282566,
 0.7455627384259442,
 -0.031989653516731886,
 -0.19703183144919864,
 -0.31620813247070156,
 -0.09938497099240462,
 0.7460041670777645,
 0.9851206798026981,
 0.513522435463045,
 0.9851215039505481,
 0.594253906160702,
 0.5782958068906131,
 0.9857661432584711,
 0.4662651847582268,
 0.6528642363709535,
 0.27367571283655123,
 0.5720476330437047]

In [166]:
mantel.test(dist1, dist2, perms=10000, method='pearson', tail='upper')

(0.3351340268541357, 0.0385, 2.1982037307664792)