In [1]:
import spacy
from spacy import displacy
import os
import pandas as pd
import numpy as np
from textpipe import doc, pipeline
import py2neo

In [2]:
# Load the spaCy pipeline named 'en_core_web_lg'. Every later cell uses this
# `nlp` object to tokenise/lemmatise text and to compare Docs with .similarity().
nlp = spacy.load('en_core_web_lg')

In [3]:
# Drop "empty" from the default stop-word set so process_text keeps it.
# discard() instead of remove(): remove() raises KeyError when the word is
# already gone, which happens as soon as this cell is executed a second time.
nlp.Defaults.stop_words.discard("empty")

In [4]:
# Altogether: to remove stopwords, punctuation and pronouns
# Putting them all in one function will enhance performance (reduces the need to loop through the words multiple times)

def process_text(text):
    """Normalise ``text`` into a space-separated string of lemmas.

    The text is lower-cased and parsed with the module-level ``nlp`` pipeline.
    A token is dropped when its text is in ``nlp.Defaults.stop_words``, when it
    is punctuation, or when its lemma is the pronoun placeholder ``'-PRON-'``.
    Everything is filtered in a single pass over the parsed tokens.

    Args:
        text: raw input string.

    Returns:
        The lemmas of the surviving tokens joined with single spaces.
    """
    stop_words = nlp.Defaults.stop_words
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text.lower())
        if tok.text not in stop_words
        and not tok.is_punct
        and tok.lemma_ != '-PRON-'
    ]
    return " ".join(kept_lemmas)

# This returns only the lemmas of tokens that are not stop words, punctuation or pronouns.

In [5]:
# Now, before you compare the sentences for similarity (based on the common words used)
# Pre-process the text first

def calculate_similarity(text1, text2):
    """Return the similarity between two raw texts after pre-processing.

    Each text is first cleaned with ``process_text`` and then re-parsed with
    the module-level ``nlp`` pipeline; the score is ``Doc.similarity`` of the
    first parsed text against the second.

    Args:
        text1: raw text used as the base of the comparison.
        text2: raw text compared against ``text1``.

    Returns:
        The value returned by ``Doc.similarity`` for the two processed texts.
    """
    base_doc = nlp(process_text(text1))
    other_doc = nlp(process_text(text2))
    score = base_doc.similarity(other_doc)
    return score

In [6]:
# After exploring, it's time to explore how to make a similarity matrix as Meo mentioned!
# First, change the current directory to the folder holding the article files.
# The ARTICLES_DIR environment variable can override the location so the
# notebook is not tied to one machine; the default is the original path.
# (The discarded os.getcwd() call was removed: its result was never shown or used.)
os.chdir(os.environ.get(
    "ARTICLES_DIR",
    "C:\\Users\\Shirley Ow\\Documents\\Internships\\DSTA\\articles",
))

In [7]:
# Read every file in the current directory (the 100 articles) into a dict
# keyed by file name, decoding each one as UTF-8.
files = {}
for article_name in os.listdir():
    if article_name in files:
        continue
    with open(article_name, "r", encoding="utf-8") as handle:
        files[article_name] = handle.read()

# List the names that were loaded, in insertion order.
for article_name in files:
    print(article_name)

1.txt
10.txt
100.txt
11.txt
12.txt
13.txt
14.txt
15.txt
16.txt
17.txt
18.txt
19.txt
2.txt
20.txt
21.txt
22.txt
23.txt
24.txt
25.txt
26.txt
27.txt
28.txt
29.txt
3.txt
30.txt
31.txt
32.txt
33.txt
34.txt
35.txt
36.txt
37.txt
38.txt
39.txt
4.txt
40.txt
41.txt
42.txt
43.txt
44.txt
45.txt
46.txt
47.txt
48.txt
49.txt
5.txt
50.txt
51.txt
52.txt
53.txt
54.txt
55.txt
56.txt
57.txt
58.txt
59.txt
6.txt
60.txt
61.txt
62.txt
63.txt
64.txt
65.txt
66.txt
67.txt
68.txt
69.txt
7.txt
70.txt
71.txt
72.txt
73.txt
74.txt
75.txt
76.txt
77.txt
78.txt
79.txt
8.txt
80.txt
81.txt
82.txt
83.txt
84.txt
85.txt
86.txt
87.txt
88.txt
89.txt
9.txt
90.txt
91.txt
92.txt
93.txt
94.txt
95.txt
96.txt
97.txt
98.txt
99.txt


In [33]:
files["1.txt"]
# Shows the decoded text of article '1.txt' exactly as stored in the `files`
# dictionary (no cleaning applied yet).

'\nSINGAPORE: Voters will get a recommended time-band to cast their ballots on polling day, so as to spread out the crowd across polling hours, the Elections Department (ELD) said on Monday (Jun 8). The time-band was among a range of contingency plans announced on Monday by the ELD to ensure safe voting if the next General Election (GE) is held during the COVID-19 outbreak. \n\n\nAdvertisement\n\n\n\n\nAdvertisement\n\n\nOther measures include having more polling stations, dedicating time-bands\xa0for seniors to vote and requiring voters to sanitise their hands and to wear gloves. “These measures will protect the health and safety of voters, candidates and election officials, and ensure that the next GE can be conducted safely should it be held during the COVID-19 situation,” ELD said in a press release. \n\n\n\n\nVoters will be allotted a recommended two-hour voting time-band, which will be indicated on their hard copy poll card and e-Poll card on the SingPass mobile app, said authori

In [8]:
# Second data set: Stack Overflow questions, used to build another similarity
# matrix (i.e. to spot similar questions). Reading a CSV is probably closer to
# the real data than loading individual .txt files.
# Only the first 100 rows and the Id / Title / Body columns are read.
df_questions = pd.read_csv(
    "../stackoverflow_archive/Questions.csv",
    usecols=['Id', 'Title', 'Body'],
    encoding="ISO-8859-1",
    nrows=100,
)
df_questions

Unnamed: 0,Id,Title,Body
0,80,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
...,...,...,...
95,9570,What libraries do I need to link my mixed-mode...,<p>I'm integrating .NET support into our C++ a...
96,9650,Lisp/Scheme interpreter without Emacs?,<p><br />\nI've been wanting to teach myself L...
97,9750,How can I reverse the ON bits in a byte?,<p>I was reading Joel's book where he was sugg...
98,10190,How to return a page of results from SQL?,<p>Many applications have grids that display d...


In [9]:
# Stack Overflow answers: first 100 rows, Id and Body columns only.
df_answers = pd.read_csv(
    "../stackoverflow_archive/Answers.csv",
    usecols=['Id', 'Body'],
    encoding="ISO-8859-1",
    nrows=100,
)
df_answers

Unnamed: 0,Id,Body
0,92,"<p><a href=""http://svnbook.red-bean.com/"">Vers..."
1,124,<p>I wound up using this. It is a kind of a ha...
2,199,<p>I've read somewhere the human eye can't dis...
3,269,"<p>Yes, I thought about that, but I soon figur..."
4,307,"<p><a href=""http://www.codeproject.com/Article..."
...,...,...
95,2855,<p>I believe you'd need to perform a separate ...
96,2908,"<p>Just to check, if you use <strong>just</str..."
97,2911,<p>Does the apache user require a password to ...
98,2915,<p>If indeed you are able to insert using the ...


In [10]:
# In case they are needed later: the tags attached to the questions.
# NOTE(review): df_tags is not referenced by any other cell visible here, and
# unlike the two reads above this one sets no nrows/encoding — confirm intended.
df_tags = pd.read_csv("../stackoverflow_archive/Tags.csv")

In [11]:
# Processed (lemmatised, stop-word-free) version of every question title,
# in the same order as df_questions['Title'].
arr = [process_text(title) for title in df_questions['Title']]

In [12]:
# Raw question titles, before any text processing
for title in df_questions['Title'].tolist():
    print(title)

SQLStatement.execute() - multiple queries in one statement
Good branching and merging tutorials for TortoiseSVN?
ASP.NET Site Maps
Function for creating color wheels
Adding scripting functionality to .NET applications
Should I use nested classes in this case?
Homegrown consumption of web services
Deploying SQL Server Databases from Test to Live
Automatically update version number
Visual Studio Setup Project - Per User Registry Settings
How do I connect to a database and loop over a recordset in C#?
How to get the value of built, encoded ViewState?
How do I delete a file which is locked by another process in C#?
Process size on UNIX
Use SVN Revision to label build in CCNET
How to make subdomain user accounts in a webapp
Is nAnt still supported and suitable for .net 3.5/VS2008?
Is Windows Server 2008 "Server Core" appropriate for a SQL Server instance?
What is the best way to copy a database?
Can I logically reorder columns in a table?
.NET Unit Testing packages?
Federated (Synced) Subve

In [18]:
# After text-processing: the titles as returned by process_text
# (lower-cased lemmas with stop words, punctuation and pronouns removed).
arr

['sqlstatement.execute multiple query statement',
 'good branching merge tutorial tortoisesvn',
 'asp.net site map',
 'function create color wheel',
 'add scripting functionality .net application',
 'use nested class case',
 'homegrown consumption web service',
 'deploy sql server database test live',
 'automatically update version number',
 'visual studio setup project user registry setting',
 'connect database loop recordset c',
 'value build encode viewstate',
 'delete file lock process c',
 'process size unix',
 'use svn revision label build ccnet',
 'subdomain user account webapp',
 'nant support suitable .net 3.5/vs2008',
 'window server 2008 server core appropriate sql server instance',
 'good way copy database',
 'logically reorder column table',
 '.net unit testing package',
 'federated synced subversion server',
 'language use postgresql trigger store procedure',
 'convert hashbyte varchar',
 'datatable vs dataset',
 'traverse collection classic asp',
 'disable browser autoco

In [12]:
# Create the (initially all-zero) similarity matrix for the question titles.
# Note: values stored in a NumPy float array have to be convertible to float.
# The matrix is sized from the processed titles instead of a hard-coded 100,
# so it stays square and correct if the number of rows read ever changes.
n_questions = len(arr)
q_mat = np.zeros((n_questions, n_questions))
q_mat[0, 0]

0.0

In [13]:
%%time
# Parse all processed titles in one batch, then fill q_mat with the pairwise
# Doc.similarity scores. The loops are bounded by the number of parsed docs
# rather than a hard-coded 100, so a shorter input cannot raise IndexError.
pipe = list(nlp.pipe(arr))
n_docs = len(pipe)
for i in range(n_docs):
    for j in range(n_docs):
        q_mat[i, j] = pipe[i].similarity(pipe[j])

Wall time: 335 ms


In [15]:
q_mat  # pairwise similarity matrix for the processed StackOverflow question titles (row i vs column j)

array([[1.        , 0.38372008, 0.44707168, ..., 0.46967229, 0.72692547,
        0.61441491],
       [0.38372008, 1.        , 0.4117212 , ..., 0.32509217, 0.40987384,
        0.33339281],
       [0.44707168, 0.4117212 , 1.        , ..., 0.28041277, 0.5865648 ,
        0.38710311],
       ...,
       [0.46967229, 0.32509217, 0.28041277, ..., 1.        , 0.51083135,
        0.60545497],
       [0.72692547, 0.40987384, 0.5865648 , ..., 0.51083135, 1.        ,
        0.66238817],
       [0.61441491, 0.33339281, 0.38710311, ..., 0.60545497, 0.66238817,
        1.        ]])

In [16]:
# Same idea for the StackOverflow answers: each answer body goes through
# textpipe's Doc(...).clean and then through process_text.
arr_a = [process_text(doc.Doc(body).clean) for body in df_answers['Body']]

In [27]:
# Processed answer bodies (output of process_text on the cleaned text), in df_answers order.
arr_a

['version control subversion good resource source control general tortoisesvn specific',
 'wound kind hack actually work pretty thing careful semicolon d var strsql string = stream.readutfbytes(stream.bytesavailable var number = 0 var strsqlsplit array = strsql.split = 0 < strsqlsplit.length i++ nonquery(strsqlsplit[i].tostring',
 'read human eye distinguish 4 value apart mind follow algorithm compensate sure exactly want way randomly generate non repeating color value beware inconsistent pseudo code ahead //colors enter 0 255 r g b]colors = //holds final color usedrand = new random();//assumes n 16,777,216randomgen(int n len(colors < n //generate random number 0,255 color newre = rand.next(256 newgreen = rand.next(256 newblue = rand.next(256 temp = newre newgreen newblue //only add new color array temp color colors.append(temp way optimize well visibility compare distance new color color array item color itemsq = item[0]^2 + item[1]^2 + item[2]^2])^(.5 tempsq = temp[0]^2 + temp[1]^2 +

In [17]:
# Pairwise similarity matrix for the processed answers.
# Both the matrix and the loops are sized from the parsed docs instead of a
# hard-coded 100, so the cell still works if a different number of answers
# is loaded.
pipe = list(nlp.pipe(arr_a))
n_docs = len(pipe)
a_mat = np.zeros((n_docs, n_docs))
for i in range(n_docs):
    for j in range(n_docs):
        a_mat[i, j] = pipe[i].similarity(pipe[j])

In [26]:
# Similarity matrix for the StackOverflow answers: entry [i][j] is the
# similarity of processed answer i to processed answer j.
a_mat

array([[1.        , 0.47942228, 0.63165737, ..., 0.72565272, 0.73937004,
        0.71400207],
       [0.47942228, 1.        , 0.85548362, ..., 0.63792517, 0.59357432,
        0.69816077],
       [0.63165737, 0.85548362, 1.        , ..., 0.72990478, 0.70364044,
        0.78389852],
       ...,
       [0.72565272, 0.63792517, 0.72990478, ..., 1.        , 0.94712571,
        0.93534248],
       [0.73937004, 0.59357432, 0.70364044, ..., 0.94712571, 1.        ,
        0.93435556],
       [0.71400207, 0.69816077, 0.78389852, ..., 0.93534248, 0.93435556,
        1.        ]])

In [18]:
# The Singapore news articles share boilerplate words that show up in many of
# them; treating these as stop words keeps them out of the similarity score.
nlp.Defaults.stop_words |= {
    "singapore",
    "read",
    "bookmark",
    "comprehensive",
    "coverage",
    "developmentsdownload",
    "app",
    "subscribe",
    "telegram",
    "channel",
    "latest",
    "updates",
    "https://cna",
    "asia",
}

In [47]:
# Spot-check the full pre-processing on one article: textpipe clean, then process_text.
process_text(doc.Doc(files["1.txt"]).clean)

'voter recommend time band cast ballot polling day spread crowd polling hour election department eld say monday jun 8) time band range contingency plan announce monday eld ensure safe voting general election ge hold covid-19 outbreak advertisement advertisement measure include have polling station dedicate time band senior vote require voter sanitise hand wear glove measure protect health safety voter candidate election official ensure ge conduct safely hold covid-19 situation eld say press release voter allot recommend hour voting time band indicate hard copy poll card e poll card singpass mobile say authority advertisement advertisement covid-19 safety measure polling day voter need know choice general election timing say pm lee senior voter age 65 give time band morning 8 noon eld say elector bring non voter child polling station senior voter require help accompany household member eld add voter vote time polling station open encourage cast ballot allot recommend time band time band

In [19]:
# Pre-process every article (textpipe clean, then process_text), keeping the
# insertion order of the `files` dict so row i of the matrix maps to the
# i-th loaded file.
arr_art = [process_text(doc.Doc(article_text).clean) for article_text in files.values()]

In [51]:
# Processed article texts, one string per entry of `files` (same order).
arr_art

['voter recommend time band cast ballot polling day spread crowd polling hour election department eld say monday jun 8) time band range contingency plan announce monday eld ensure safe voting general election ge hold covid-19 outbreak advertisement advertisement measure include have polling station dedicate time band senior vote require voter sanitise hand wear glove measure protect health safety voter candidate election official ensure ge conduct safely hold covid-19 situation eld say press release voter allot recommend hour voting time band indicate hard copy poll card e poll card singpass mobile say authority advertisement advertisement covid-19 safety measure polling day voter need know choice general election timing say pm lee senior voter age 65 give time band morning 8 noon eld say elector bring non voter child polling station senior voter require help accompany household member eld add voter vote time polling station open encourage cast ballot allot recommend time band time ban

In [20]:
# Pairwise similarity matrix for the processed articles.
# Sized from the parsed docs rather than a hard-coded 100 so the cell keeps
# working when the article folder holds a different number of files.
pipe = list(nlp.pipe(arr_art))
n_docs = len(pipe)
art_mat = np.zeros((n_docs, n_docs))
for i in range(n_docs):
    for j in range(n_docs):
        art_mat[i, j] = pipe[i].similarity(pipe[j])

In [94]:
# Similarity matrix for the news articles: entry [i][j] compares processed
# article i with processed article j.
art_mat

array([[1.        , 0.96220397, 0.8361966 , ..., 0.9064206 , 0.86285406,
        0.90238165],
       [0.96220397, 1.        , 0.83720543, ..., 0.90165967, 0.84407513,
        0.88748273],
       [0.8361966 , 0.83720543, 1.        , ..., 0.83598445, 0.81722362,
        0.8613957 ],
       ...,
       [0.9064206 , 0.90165967, 0.83598445, ..., 1.        , 0.8744901 ,
        0.89560171],
       [0.86285406, 0.84407513, 0.81722362, ..., 0.8744901 , 1.        ,
        0.89065467],
       [0.90238165, 0.88748273, 0.8613957 , ..., 0.89560171, 0.89065467,
        1.        ]])

In [21]:
# Sanity-check the similarity scores before integrating them with the graph.

# Boolean mask that is False on the diagonal, so each article's similarity
# with itself is ignored when searching for the most similar pair.
mask = np.ones(art_mat.shape, dtype=bool)
np.fill_diagonal(mask, False)

lowest_score = np.amin(art_mat)
highest_offdiag_score = np.amax(art_mat[mask])

print(lowest_score)
print(np.where(art_mat == lowest_score))  # Articles that were most dissimilar from each other.
print(highest_offdiag_score)
print(np.where(art_mat == highest_offdiag_score))  # Articles that were the most similar to each other. (indeed similar)

0.5958938932780352
(array([59, 69], dtype=int64), array([69, 59], dtype=int64))
0.9943773943872979
(array([16, 24], dtype=int64), array([24, 16], dtype=int64))


In [22]:
# Translate matrix index 24 back into a file name via the directory listing.
# The ARTICLES_DIR environment variable can override the article folder; the
# default is the original path. (The discarded os.getcwd() call was removed:
# only the last expression of a cell is displayed, so it had no effect.)
os.chdir(os.environ.get(
    "ARTICLES_DIR",
    "C:\\Users\\Shirley Ow\\Documents\\Internships\\DSTA\\articles",
))
os.listdir()[24]

'30.txt'

In [103]:
# Since the arrays are zero-indexed, the most similar pair (indices 16 and 24)
# corresponds to the 17th and 25th files in listing order: '23.txt' and '30.txt'.
# Look at the processed text of the first of the two here.
process_text(doc.Doc(files['23.txt']).clean)

'report 157 new covid-19 infection noon tuesday jul 7 include child age 15 20 community case child student bedok view secondary school east spring primary school jurong west primary school jurong west secondary school advertisement advertisement 9 new location add list place visit covid-19 case infectious symptomatic unlinked covid-19 case community great concern number daily case fall gan kim yongall student link household infection say ministry education moe medium release moe say teacher assumption pathway school tuesday case student home quarantine order ministry health moh ringfencing measure later swab close contact household member advertisement advertisement 2-year old singaporean boy link previous case tuesday new infection bring total number case 45,140 10 singaporeans permanent resident work pass holder work permit holder new community case 20 case community pick result proactive surveillance screening 11 place quarantine earlier say moh case asymptomatic detect proactive te

In [104]:
# Processed text of '30.txt' (index 24 in the listing), to compare by eye with '23.txt'.
process_text(doc.Doc(files['30.txt']).clean)

'report 183 new covid-19 infection noon monday jul 6 include 23 case community say ministry health moh bring national tally 44,983 advertisement advertisement monday report represent high number new covid-19 case community apr 24 25 case singaporeans permanent resident new community case work pass holder 19 work permit holder new community case 18 pick result moh proactive surveillance screening place quarantine earlier 17 community case link advertisement advertisement seventeen community case link previous case cluster say moh 13 detect screening worker essential service work dormitory seven migrant worker reside temporary accommodation arrange company say moh pcr test result indicate low viral load detectable amplification cycle mean past infection say ministry serological test conduct worker determine current past infection say moh meantime ministry investigate possible exposure link appropriate action prevent transmission moh say link community case identify contact previously con

In [23]:
from neo4j import GraphDatabase

# Bolt endpoint of the local Neo4j instance.
uri = "bolt://localhost:7687"
# Credentials should not be committed in a notebook: the password is read from
# the NEO4J_PASSWORD environment variable. The original literal is kept only as
# a fallback so existing local setups keep working — NOTE(review): rotate it
# and drop the fallback once the variable is set everywhere.
driver = GraphDatabase.driver(
    uri,
    auth=("neo4j", os.environ.get("NEO4J_PASSWORD", "123abc")),
)

In [23]:
def createNodes(tx, art):
    """Ensure an ``Article`` node named ``art`` exists.

    Args:
        tx: transaction-like object exposing ``run(query, **params)``.
        art: value stored in the node's ``name`` property.

    Returns:
        None.
    """
    # MERGE instead of CREATE: CREATE adds a brand-new node on every call, so
    # re-running the notebook would leave several nodes with the same name
    # (and the later MATCH-by-name queries would then hit all of them).
    tx.run("MERGE (article:Article {name: $art})", art=art)

In [112]:
# One Article node per entry in the current directory, named after the file.
with driver.session() as session:
    for article_name in os.listdir():
        session.write_transaction(createNodes, article_name)

In [24]:
# Use values in similarity matrix as weights of the edges
# Create the relationship with the weight as property
def addWeight(tx, i, j, wt):
    """Link two Article nodes with a ``SIMILAR`` relationship carrying a score.

    The node names are looked up by position in ``os.listdir()`` of the current
    working directory, so ``i`` and ``j`` must be valid indices into that
    listing.

    Args:
        tx: transaction-like object exposing ``run(query, **params)``.
        i: listing index of the article the relationship starts from.
        j: listing index of the article the relationship points to.
        wt: similarity score stored in the relationship's ``score`` property.

    Returns:
        None. Nothing is written when ``i == j`` (no self-relationships).

    Raises:
        IndexError: if ``i`` or ``j`` is outside the directory listing.
    """
    if i == j:
        return
    # List the directory once so both names come from the same snapshot
    # (the listing used to be fetched twice per call).
    names = os.listdir()
    name1, name2 = names[i], names[j]
    # MERGE + SET instead of CREATE: re-running the notebook updates the score
    # of the existing relationship rather than stacking a duplicate edge.
    tx.run("MATCH (art1: Article {name: $name1}), (art2: Article {name: $name2}) "
           "MERGE (art1)-[rel:SIMILAR]->(art2) "
           "SET rel.score = $wt",
           name1 = name1, name2 = name2, wt = wt)

In [116]:
# Walk the whole 2D similarity matrix and write one SIMILAR relationship per
# (i, j) pair; addWeight itself skips the i == j diagonal.
# The bounds come from art_mat.shape instead of a hard-coded 100 so the loop
# always matches the matrix that was actually built.
n_rows, n_cols = art_mat.shape
with driver.session() as session:
    for i in range(n_rows):
        for j in range(n_cols):
            session.write_transaction(addWeight, i, j, art_mat[i, j])

In [26]:
# Add different properties to the Article nodes, say add the country of origin
def addProp(tx, name, cty):
    """Set the ``country`` property of the Article node called ``name``.

    Args:
        tx: transaction-like object exposing ``run(query, **params)``.
        name: value of the ``name`` property used to find the node.
        cty: value written to the node's ``country`` property.

    Returns:
        None.
    """
    query = (
        "MATCH (art: Article {name: $name}) "
        "SET art.country = $cty"
    )
    tx.run(query, name=name, cty=cty)

In [65]:
def delLab(tx, name):
    """Remove the ``Australia`` label from every node whose ``name`` matches.

    The match is on the ``name`` property only (no label filter), and the
    label being removed is fixed in the query.

    Args:
        tx: transaction-like object exposing ``run(query, **params)``.
        name: value of the ``name`` property used to find the node(s).

    Returns:
        None.
    """
    query = (
        "MATCH (art {name: $name}) "
        "REMOVE art:Australia"
    )
    tx.run(query, name=name)

In [62]:
# Tag the files at listing positions 0-32 with country "Singapore".
# Positions refer to os.listdir() order, not to the number in the file name.
with driver.session() as session:
    # List the directory once instead of on every iteration: cheaper, and all
    # indices are resolved against the same snapshot of the folder.
    names = os.listdir()
    for i in range(33):
        session.write_transaction(addProp, names[i], "Singapore")

In [38]:
# Tag the files at listing positions 33-66 with country "Thailand".
# Positions refer to os.listdir() order, not to the number in the file name.
with driver.session() as session:
    # List the directory once instead of on every iteration: cheaper, and all
    # indices are resolved against the same snapshot of the folder.
    names = os.listdir()
    for i in range(33, 67):
        session.write_transaction(addProp, names[i], "Thailand")

In [57]:
# Tag the files at listing positions 67-99 with country "Sri Lanka".
# Positions refer to os.listdir() order, not to the number in the file name.
with driver.session() as session:
    # List the directory once instead of on every iteration: cheaper, and all
    # indices are resolved against the same snapshot of the folder.
    names = os.listdir()
    for i in range(67, 100):
        session.write_transaction(addProp, names[i], "Sri Lanka")

In [26]:
# Colour based on node properties.
# The Cypher below is kept in a string so that this cell is valid Python
# (pasted raw, it raised a SyntaxError when the notebook was run top to bottom).
# NOTE(review): presumably the query is meant to be pasted into Neo4j Browser,
# and it calls apoc.* procedures, so the APOC plugin must be installed — confirm.
COLOUR_BY_COUNTRY_QUERY = """
MATCH (n:Article)
WITH DISTINCT n.country AS country, collect(DISTINCT n) AS articles
CALL apoc.create.addLabels(articles, [apoc.text.upperCamelCase(country)]) YIELD node
RETURN *
"""
# This works by adding each node's country as an additional label.
# The labels are added 'permanently', and each subgroup then gets its own colour.
# Open question: could APOC build a virtual subgraph instead, so that nothing
# is persisted and the grouping only exists for the query?

In [41]:
# Add different properties to the Article nodes, say add a numeric property
def addNumProp(tx, name, num):
    """Set the ``ranNum`` property of the Article node called ``name``.

    Args:
        tx: transaction-like object exposing ``run(query, **params)``.
        name: value of the ``name`` property used to find the node.
        num: value written to the node's ``ranNum`` property.

    Returns:
        None.
    """
    query = (
        "MATCH (art: Article {name: $name}) "
        "SET art.ranNum = $num"
    )
    tx.run(query, name=name, num=num)

In [43]:
# Give every article a numeric placeholder property (ranNum in 10..100).
import random

# Private, seeded generator: Restart & Run All now assigns the same numbers
# every time, and the global `random` state is left untouched.
rng = random.Random(42)
with driver.session() as session:
    # List the directory once instead of on each of the 100 iterations.
    names = os.listdir()
    for i in range(100):
        session.write_transaction(addNumProp, names[i], rng.randint(10, 100))

In [44]:
# Resize based on node's numeric properties --> don't think this is possible in neo4j.

In [54]:
# Adjust thickness of the edges
# Available on neovis: https://medium.com/neo4j/graph-visualization-with-neo4j-using-neovis-js-a2ecaaa7c379 (but this uses HTML & JS)

In [70]:
# Using py2neo, instead of neo4j
from py2neo import Graph

# Credentials should not be committed in a notebook: the password is read from
# the NEO4J_PASSWORD environment variable. The original literal is kept only as
# a fallback so existing local setups keep working — NOTE(review): rotate it
# and drop the fallback once the variable is set everywhere.
graph = Graph(password=os.environ.get("NEO4J_PASSWORD", "123abc"))
# Fetch every node in the database and show it as a table.
graph.run("MATCH (n) RETURN n").to_table()

n
"(_0:Article:Singapore {country: 'Singapore', name: '1.txt', ranNum: 90})"
"(_1:Article:Singapore {country: 'Singapore', name: '10.txt', ranNum: 19})"
"(_2:Article:Singapore {country: 'Singapore', name: '100.txt', ranNum: 17})"
"(_3:Article:Singapore {country: 'Singapore', name: '11.txt', ranNum: 52})"
"(_4:Article:Singapore {country: 'Singapore', name: '12.txt', ranNum: 43})"
"(_5:Article:Singapore {country: 'Singapore', name: '13.txt', ranNum: 27})"
"(_6:Article:Singapore {country: 'Singapore', name: '14.txt', ranNum: 55})"
"(_7:Article:Singapore {country: 'Singapore', name: '15.txt', ranNum: 64})"
"(_8:Article:Singapore {country: 'Singapore', name: '16.txt', ranNum: 65})"
"(_9:Article:Singapore {country: 'Singapore', name: '17.txt', ranNum: 60})"
