# Building a Knowledge Graph

In [1]:
import pyspark as sp
import nltk
import numpy as np
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string

### Set the environment variables PySpark requires

In [2]:
import os
# Point both the PySpark workers and the driver at the same Python 3 interpreter.
# NOTE(review): the interpreter path is hard-coded and machine-specific — confirm
# it matches the environment this notebook runs in.
os.environ.update({
    "PYSPARK_PYTHON": "/usr/local/bin/python3",
    "PYSPARK_DRIVER_PYTHON": "/usr/local/bin/python3",
})

### Set up the Spark session

In [3]:
from pyspark.sql import SparkSession

# Build the SparkSession used by every later cell (or reuse one that already exists).
# NOTE(review): the app name mentions "Sentiment Analysis" and the
# "spark.some.config.option" entry looks like a template placeholder — confirm
# whether both are intended for this knowledge-graph notebook.
spark = (
    SparkSession.builder
    .appName("Python Spark Sentiment Analysis example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

### Load datasets

In [4]:
# Keyword table: CSV whose first row supplies the column names.
df_keywords = spark.read.csv(path="./mag_cs_keywords.csv", header=True)
# arXiv metadata snapshot, loaded with Spark's JSON reader.
df_arxiv = spark.read.json(path="./arxiv-metadata-oai-snapshot.json")

In [45]:
# Keep only the abstract text and the normalized keyword name columns.
abstracts = df_arxiv.select(df_arxiv["abstract"])
keywords = df_keywords.select(df_keywords["normalizedName"])

### Cleaning & Normalization

In [36]:
from pyspark.sql.functions import udf, col
from pyspark.sql import Row
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType

# remove non ASCII characters & lowercase
def strip_non_ascii(data_str):
    '''Return ``data_str`` lowercased, keeping only code points 1-126.

    Args:
        data_str: Text to normalize, or ``None``. Spark hands ``None`` to a
            Python UDF for null column values, so it must be accepted here.

    Returns:
        The filtered, lowercased string, or ``None`` when ``data_str`` is
        ``None``.
    '''
    # A null cell would otherwise raise TypeError when iterated below.
    if data_str is None:
        return None

    # 0 < ord(c) < 127 drops NUL, DEL and every non-ASCII character.
    return ''.join(c.lower() for c in data_str if 0 < ord(c) < 127)
# Wrap the normalizer as a Spark UDF that yields a string column.
strip_non_ascii_udf = udf(f=strip_non_ascii, returnType=StringType())

In [46]:
# Build the normalized column straight from the loaded arXiv frame so that this
# cell can be re-run safely: the previous form reassigned `abstracts` from
# itself and dropped the 'abstract' column, so a second run failed to resolve it.
# The result is unchanged: a single string column named "normalized".
abstracts = df_arxiv.select(
    strip_non_ascii_udf(df_arxiv["abstract"]).alias("normalized")
)

1. Extract keywords from each document
    - https://stackoverflow.com/questions/48869922/how-to-efficiently-check-if-a-list-of-words-is-contained-in-a-spark-dataframe


2. Build co-occurrence matrix
    - https://stackoverflow.com/questions/48551900/spark-generate-occurrence-matrix