In [2]:
# Run the project's bootstrap.sh through the shell. In the recorded run below it requests the
# Simple English Wikipedia articles dump and reports the file as already fully retrieved.
# NOTE(review): anything else the script sets up is not visible from this notebook — confirm in bootstrap.sh.
! bash bootstrap.sh


--2021-12-11 14:25:54--  https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-articles-multistream.xml.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.7, 2620:0:861:1:208:80:154:7
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.7|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [3]:
import os
# Select the Java 8 JDK for this kernel and put its bin directory first on PATH.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = "{}/bin:{}".format(os.environ["JAVA_HOME"], os.environ["PATH"])

### Building Knowledge Graph
This application is about organizing information and making it easy to access by humans and computers alike.


### Implement the solution

 Import the libraries 

In [4]:
import json
import re
import pandas as pd
import sparknlp

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import lit, col,udf,explode , split

import sparknlp
from sparknlp import DocumentAssembler, Finisher
from sparknlp.annotator import *
from pyspark.sql.types import MapType, StringType, IntegerType, ArrayType 
import pandas as pd
import time
from neo4j import GraphDatabase, basic_auth
import time
from tqdm import tqdm
from urllib.request import urlopen
import urllib.request
from datetime import datetime

In [5]:
from utils.Neo4jConnection import Neo4jConnection
from utils.preprocess import cleanInfoBox,getPageIndexinCats,extractMovieEntity,extractPersonRelation

Start spark session

In [6]:
# Maven coordinates resolved when the session starts: Spark NLP and the spark-xml reader.
packages = [
    "com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.4",
    'com.databricks:spark-xml_2.12:0.9.0'
]

# Build (or reuse) a local session on all cores with a 12g driver heap.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Knowledge Graph")
    .config("spark.driver.memory", "12g")
    .config("spark.jars.packages", ",".join(packages))
    .getOrCreate()
)

In [7]:
# spark.sparkContext.getConf().getAll()

##### Read the data from the Wikipedia dump

In [8]:
# Wall-clock reference for the elapsed-time figures printed after processing.
data_start_time = time.time()
load_banner = "{} - started the data load into main dataframe"
print(load_banner.format(datetime.now()))

2021-12-11 14:26:22.749650 - started the data load into main dataframe


In [9]:
# Read every .bz2 file matched in the working directory through the XML reader, producing
# one row per <page> element under the <mediawiki> root, and persist the result for reuse.
df = (
    spark.read
    .format('xml')
    .options(rootTag="mediawiki", rowTag="page")
    .load("*.bz2")
    .persist()
)

In [10]:
# Spark UDF wrapper around cleanInfoBox; the string return type (udf's default) is spelled out.
getInfoboxUDF = udf(lambda page_text: cleanInfoBox(page_text), StringType())

Get the page indices from the categories of interest 

In [11]:
# Mark the moment the category index extraction begins (used in the timing summary).
index_start_time = time.time()
index_banner = "{} - started the index extraction"
print(index_banner.format(datetime.now()))

2021-12-11 14:29:38.255469 - started the index extraction


Specify the category and wiki flag 

Use a category that actually exists in the chosen Wikipedia edition (Simple English or English). By default, the flag is set to `simple` to reduce the amount of data.

Allowed values for WIKI_FLAG are 'simple' and 'en'.

In [12]:
# Parse category.config into a list of [category, wiki_flag] pairs.
# Each non-empty line holds comma-separated fields; double quotes around fields are dropped.
cat_config_list = []
with open('category.config', 'r') as cat_config_file:  # closed automatically, even on error
    for raw_line in cat_config_file:
        entry = raw_line.replace('"', '').strip()
        if not entry:
            # A blank or trailing line would otherwise add a bogus [''] category.
            continue
        # Strip each field so "Indian_cinema, en" and CRLF line endings parse cleanly.
        cat_config_list.append([field.strip() for field in entry.split(",")])

In [13]:
# Display the parsed [category, wiki_flag] pairs read from category.config.
cat_config_list

[['Movies_based_on_books', 'simple'],
 ['Indian_cinema', 'en'],
 ['Indian_films', 'en']]

In [15]:

# Turn the page ids returned for the configured categories into a one-column DataFrame.
# NOTE(review): the meaning of the "N", "" and [] arguments is defined in utils.preprocess — confirm there.
pageIndexdf = spark.createDataFrame(getPageIndexinCats( cat_config_list,"N", "",[]), StringType()).withColumnRenamed("value","PageIndex")

# Keep only the dump pages whose id was returned for one of the categories, with their wikitext.
pageContentdf = pageIndexdf.join(df,pageIndexdf.PageIndex == df.id,"inner").select('PageIndex','revision.text._VALUE')

print("Total pages: {}".format(pageContentdf.count()))

# Ids of pages whose wikitext contains an infobox template. distinct() keeps this in agreement
# with the later recomputation of pageidDF and avoids collecting the same id more than once
# (presumably a page can be returned for several categories).
pageidDF = pageContentdf.filter(pageContentdf._VALUE.contains('{{Infobox')).select('PageIndex').distinct()
pageidlist = [int(row.PageIndex) for row in pageidDF.collect()]



2021-12-11 14:30:59.266456 - Completed fetching page index for category : ['Movies_based_on_books', 'simple'], (Cumulative) Records found: 500
2021-12-11 14:30:59.348590 - No record found - Fetching page index for category : ['Indian_cinema', 'en'], (Cumulative) Records found: 500
2021-12-11 14:30:59.478816 - Completed fetching page index for category : ['Indian_films', 'en'], (Cumulative) Records found: 1000


In [43]:
# Distinct ids of the category pages whose wikitext contains an infobox template.
has_infobox = pageContentdf._VALUE.contains('{{Infobox')
pageidDF = (
    pageContentdf
    .filter(has_infobox)
    .select('PageIndex')
    .distinct()
)
pageidlist = [int(record.PageIndex) for record in pageidDF.collect()]

In [44]:
# Report how many page ids survived the infobox filter.
selected_page_count = len(pageidlist)
print("{} - Total pages selected after filter Count : {} ".format(datetime.now(), selected_page_count))


2021-12-11 14:48:25.677193 - Total pages selected after filter Count : 118 


Process the data for the category of interest


In [45]:
# Mark the start of the per-category processing stage (used in the timing summary).
data_process_start_time = time.time()
process_banner = "{} - started reading the data "
print(process_banner.format(datetime.now()))

2021-12-11 14:48:25.688665 - started reading the data 


In [46]:
# Non-redirect pages in namespace 0, reduced to id, title and wikitext.
# Both predicates run before the projection because `redirect` and `ns` are not among the
# selected columns, and the `redirect IS NULL` check is applied once instead of twice.
data = (
    df
    .filter('redirect IS NULL')
    .filter('ns == 0')
    .selectExpr('id', 'title', 'revision.text._VALUE AS text')
)

In [47]:
# Restrict the pages to the selected infobox page ids and cache the subset,
# since several extractions below read from it.
is_selected_page = col('id').isin(pageidlist)
InfoData = data.filter(is_selected_page).cache()

In [48]:
# InfoData.select(getInfoboxUDF('text').alias('info')).collect()

Extract the Movie entities 

In [49]:
# Spark UDF around extractMovieEntity, declared as returning a string -> string map.
movieEntityUDF = udf(
    lambda infobox: extractMovieEntity(infobox),
    MapType(StringType(), StringType()),
)

In [50]:
# Page title alongside the cleaned infobox text produced by getInfoboxUDF.
cleaned_infobox = getInfoboxUDF(col('text')).alias('info')
infodata = InfoData.select('title', cleaned_infobox)

In [51]:
# Apply the movie-entity UDF to each infobox and bring the result to the driver as pandas.
movie_entity_column = movieEntityUDF('info').alias('MovieEntity')
movieEntityNeodf = infodata.select('title', movie_entity_column).toPandas()


In [53]:
# Plain Python list of the per-movie entity values, in row order.
movieEntityDFlist = [entity for entity in movieEntityNeodf['MovieEntity']]

In [54]:
# Build the DataFrame used for the Neo4j Movie load from the extracted entity values.
movieEntityLoadDF = pd.DataFrame(data=movieEntityDFlist)

#### Extract relation between Movie and person

In [55]:
# Spark UDF around extractPersonRelation, declared as returning an array of strings.
PersonRelationUDF = udf(
    lambda infobox: extractPersonRelation(infobox),
    ArrayType(StringType()),
)

In [56]:
# Extract the relation strings per movie, then explode them into one row per (title, relation string).
person_relations = PersonRelationUDF('info').alias('extractPersonRelation')
relationPersonDF = (
    infodata
    .select('title', person_relations)
    .select('title', explode(col('extractPersonRelation')).alias('PersonRelation'))
)

In [57]:
# Split each PersonRelation string on '~': element 0 becomes the relation, element 1 the person.
relation_parts = split(col('PersonRelation'), '~')
movie_person_rel_df = relationPersonDF.select(
    'title',
    relation_parts[0].alias('relation'),
    relation_parts[1].alias('person'),
).toPandas()

In [58]:
# Preview the last rows of the (title, relation, person) frame as a quick sanity check.
movie_person_rel_df.tail()

Unnamed: 0,title,relation,person
1011,Eye of the Needle (movie),starring,Kate Nelligan
1012,Eye of the Needle (movie),starring,Christopher Cazenove
1013,Eye of the Needle (movie),producer,Stephen J. Friedman
1014,Eye of the Needle (movie),director,Richard Marquand
1015,Eye of the Needle (movie),music,Miklós Rózsa


Get details associated with Person

In [59]:
# One row per distinct person value found in the movie-person relations.
unique_people = movie_person_rel_df['person'].unique()
person_entity = pd.DataFrame(data=unique_people, columns=['name'])

In [60]:
# Mark the end of the data processing stage (used in the timing summary).
end_time = time.time()
done_banner = "{} - completed the data processing "
print(done_banner.format(datetime.now()))

2021-12-11 14:48:31.222533 - completed the data processing 


In [61]:
# Elapsed seconds, printed in this order:
#   1. data load start -> end of processing
#   2. data load start -> start of index extraction
#   3. start of category processing -> end of processing
#   4. data load start -> start of category processing
elapsed_seconds = (
    end_time - data_start_time,
    index_start_time - data_start_time,
    end_time - data_process_start_time,
    data_process_start_time - data_start_time,
)
for seconds in elapsed_seconds:
    print(seconds)

1328.4729092121124
195.50582766532898
5.533868312835693
1322.9390408992767


 Loading data into Neo4j

In [62]:
# Read the Neo4j connection settings from neo4j.config.
# The file is read positionally: line 1 -> uri, line 2 -> password, line 3 -> user,
# each taken as the text after the first '=' on that line.
with open('neo4j.config', 'r') as neo_config_file:  # closed automatically, even on error
    # splitlines() also handles CRLF files, so no stray '\r' ends up in a value.
    contents = neo_config_file.read().splitlines()

# Split on the first '=' only, so values that contain '=' (e.g. a password) are kept whole.
uri = contents[0].split('=', 1)[1]
pwd = contents[1].split('=', 1)[1]
user = contents[2].split('=', 1)[1]
conn = Neo4jConnection(uri, user, pwd)

Clear the existing data in the graph database

In [64]:
# WARNING: destructive — matches every node and deletes it together with its relationships,
# so each run starts from an empty graph.
delete_all_nodes = "MATCH (n) DETACH DELETE n;"

conn.query(query=delete_all_nodes) if False else conn.query(delete_all_nodes)

[]

Load the movie data into Movie nodes in Neo4j

In [65]:
# Create one Movie node per row of movieEntityLoadDF, sending the rows in batches.
query = '''
UNWIND $rows as row
 CREATE (e:Movie {  title : row.name, name : row.name ,budget : row.budget, released :row.released ,runtime :row.runtime } )
 '''
batch_size = 1000

# Walk the frame in positional slices of batch_size rows; each slice becomes the $rows parameter.
for batch_start in range(0, len(movieEntityLoadDF), batch_size):
    movie_batch = movieEntityLoadDF.iloc[batch_start:batch_start + batch_size]
    movie_rows = movie_batch.reset_index().to_dict('records')
    res = conn.query(query, parameters={'rows': movie_rows})

Load the person data into Person nodes in Neo4j

In [66]:
# Create one Person node per row of person_entity, sending the rows in batches.
query = '''
UNWIND $rows as row
 CREATE (e:Person { name : row.name } )
 '''

batch_size = 1000

# Walk the frame in positional slices of batch_size rows; each slice becomes the $rows parameter.
for batch_start in range(0, len(person_entity), batch_size):
    person_batch = person_entity.iloc[batch_start:batch_start + batch_size]
    person_rows = person_batch.reset_index().to_dict('records')
    res = conn.query(query, parameters={'rows': person_rows})

Load the relationship between Movie and person 

In [67]:
# For every (title, relation, person) row, match the Movie and Person nodes by name and
# create a relationship between them whose type is the row's `relation` value.
query = ''' 
UNWIND $rows as row
MATCH (entity1:Movie {name: row.title}),(entity2:Person {name: row.person})
CALL apoc.create.relationship(entity1, row.relation,NULL, entity2) YIELD rel
RETURN entity1.name, type(rel), entity2.name 
'''


batch_size = 10

# Batches start at row 0 so the first batch_size rows are loaded too; starting the batch
# counter at 1 would silently drop them. The final, possibly shorter, batch is included.
for batch_start in range(0, len(movie_person_rel_df), batch_size):
    relation_batch = movie_person_rel_df.iloc[batch_start:batch_start + batch_size]
    relation_rows = relation_batch.reset_index(drop=True).to_dict('records')
    res = conn.query(query, parameters={'rows': relation_rows})

In [68]:
# Timestamped marker for the end of the Neo4j load.
load_done_banner = "{} - completed the loading data to neo4j "
print(load_done_banner.format(datetime.now()))

2021-12-11 14:48:35.252685 - completed the loading data to neo4j 
