# Script to load conceptnet data with unique edges.

*Issue*: loading edges fails for all unique edges. check methods to resolve this.

## Clone the Data

In [51]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [52]:
cd '/content/drive/MyDrive'

/content/drive/MyDrive


In [53]:
#To-Do: Comment-out and clone the graph data(*.csv) once.

# !git clone https://github.com/sudha-vijayakumar/LanguageModel.git

## Peek into the data 

### ConceptNet - Word

In [54]:
import pandas as pd 
nodes = pd.read_csv('/content/drive/MyDrive/LanguageModel/Data_Processing/csv_imports/WN-nodes.csv')
nodes.head(1)

Unnamed: 0,uri,id,word,pos,definition,subject
0,http://wordnet-rdf.princeton.edu/id/00001740-a,00001740-a,able,adjective,(usually followed by `to') having the necessar...,adj.all


### ConceptNet - Synset

In [55]:
synsets = pd.read_csv('/content/drive/MyDrive/LanguageModel/Data_Processing/csv_imports/synsets.csv')
synsets.head(1)

Unnamed: 0,id:ID,pos:string,definition:string,:LABEL
0,able.a.01,a,(usually followed by `to') having the necessar...,Synset


In [76]:
synsets_enc = pd.read_csv('/content/drive/MyDrive/LanguageModel/Data_Processing/csv_imports/encoded/synsets-encoded.csv',index_col=0)
synsets_enc.head(1)

# def get_synset(id):
#     try:
#       s = id.split('.')
#       return s[0]
#     except:
#       return ""
        
# synsets_enc['name'] = synsets_enc.apply(lambda x: get_synset(x['id:ID']),axis=1)
# synsets_enc.to_csv('/content/drive/MyDrive/LanguageModel/Data_Processing/csv_imports/encoded/synsets-encoded.csv')

Unnamed: 0,id:ID,pos:string,definition:string,:LABEL,name
0,able.a.01,0,(usually followed by `to') having the necessar...,Synset,able


### ConceptNet - Edges

In [57]:
relationships = pd.read_csv('/content/drive/MyDrive/LanguageModel/Data_Processing/csv_imports/relationships.csv',index_col=[0])
relationships.head(5)

Unnamed: 0_level_0,:END_ID,dataset:string,weight:double,:TYPE
:START_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
able.a.01,ability.n.01,/d/wordnet/3.1,2.0,Attribute
able.a.01,ability.n.02,/d/wordnet/3.1,2.0,Attribute
able.a,able.a.01,/d/wordnet/3.1,2.0,InSynset
unable.a.01,ability.n.01,/d/wordnet/3.1,2.0,Attribute
unable.a,unable.a.01,/d/wordnet/3.1,2.0,InSynset


### ConceptNet - Part of Speech

In [58]:
relationships = pd.read_csv('/content/drive/MyDrive/LanguageModel/Data_Processing/csv_imports/encoded/pos_wn.csv',index_col=[0])
relationships.head()

Unnamed: 0,type,id
0,adjective,0
1,noun,3
2,adverb,2
3,verb,4
4,adjective_satellite,1


## Install pyTigerGraph

In [59]:
!pip install -U pyTigerGraph



## Add Imports and Establish Initial Connection

In [60]:
# Imports
import pyTigerGraph as tg
import json
import pandas as pd

# Connection parameters
hostName = "https://language.i.tgcloud.io/"
userName = "tigergraph"
password = "tigergraph"

conn = tg.TigerGraphConnection(host=hostName, username=userName, password=password)

print("Connected")

Connected


## Define and Publish the Schema

- Include the edges for the relation types included: **['Antonym','Synonym','RelatedTo']** to generate the gsql create edge query. 
- This will eliminate the effort to type in queries if there are too many relation types.

In [61]:
rel_type = pd.read_csv('/content/drive/MyDrive/LanguageModel/Data_Processing/csv_imports/encoded/type_rel.csv')
rel = rel_type['type']

rel_included = ['Antonym','Synonym','RelatedTo']

str_=""
# for r in rel:
for r in rel_included:
  if '/' in r:
    r = r.replace('/','_')
  str_+="CREATE DIRECTED EDGE "+r+"(FROM synset, TO synset|FROM synset, TO word|FROM word, TO synset|FROM word, TO word) WITH REVERSE_EDGE=\"reverse_"+r+"\""

  str_+="\n"
print(str_)

CREATE DIRECTED EDGE Antonym(FROM synset, TO synset|FROM synset, TO word|FROM word, TO synset|FROM word, TO word) WITH REVERSE_EDGE="reverse_Antonym"
CREATE DIRECTED EDGE Synonym(FROM synset, TO synset|FROM synset, TO word|FROM word, TO synset|FROM word, TO word) WITH REVERSE_EDGE="reverse_Synonym"
CREATE DIRECTED EDGE RelatedTo(FROM synset, TO synset|FROM synset, TO word|FROM word, TO synset|FROM word, TO word) WITH REVERSE_EDGE="reverse_RelatedTo"



- Copy & paste the above query within USE Global gsql block after EDGE:is_pos.

In [62]:
# DEFINE / CREATE ALL EDGES AND VERTICES 
results = conn.gsql('''
USE GLOBAL
CREATE VERTEX synset(PRIMARY_ID id STRING, definition STRING, pos STRING,name STRING) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"
CREATE VERTEX word(PRIMARY_ID id STRING, name STRING, pos STRING, conceptUri STRING, label STRING) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"
CREATE VERTEX part_of_speech(PRIMARY_ID id STRING, type STRING) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"
CREATE DIRECTED EDGE has_pos(FROM synset, TO part_of_speech|FROM word, TO part_of_speech) WITH REVERSE_EDGE="reverse_has_pos"
CREATE DIRECTED EDGE is_pos_of(FROM part_of_speech, TO synset|FROM part_of_speech, TO word) WITH REVERSE_EDGE="reverse_is_pos_of"


CREATE DIRECTED EDGE Antonym(FROM synset, TO synset|FROM synset, TO word|FROM word, TO synset|FROM word, TO word) WITH REVERSE_EDGE="reverse_Antonym"
CREATE DIRECTED EDGE Synonym(FROM synset, TO synset|FROM synset, TO word|FROM word, TO synset|FROM word, TO word) WITH REVERSE_EDGE="reverse_Synonym"
CREATE DIRECTED EDGE RelatedTo(FROM synset, TO synset|FROM synset, TO word|FROM word, TO synset|FROM word, TO word) WITH REVERSE_EDGE="reverse_RelatedTo"

  ''')
print(results)

Successfully created vertex types: [synset].
Successfully created vertex types: [word].
Successfully created vertex types: [part_of_speech].
Successfully created edge types: [has_pos].
Successfully created reverse edge types: [reverse_has_pos].
Successfully created edge types: [is_pos_of].
Successfully created reverse edge types: [reverse_is_pos_of].
Successfully created edge types: [Antonym].
Successfully created reverse edge types: [reverse_Antonym].
Successfully created edge types: [Synonym].
Successfully created reverse edge types: [reverse_Synonym].
Successfully created edge types: [RelatedTo].
Successfully created reverse edge types: [reverse_RelatedTo].


## ConceptNet

## Create ConceptNet Graph

- generate the edges for the relation types included: **['Antonym','Synonym','RelatedTo']** to be included in the graph.

In [63]:
rel = rel_type['type']

str_=""
# for r in rel:
for r in rel_included:
  if '/' in r:
    r=r.replace('/','_')
  str_+=r+","
print(str_.strip(','))

Antonym,Synonym,RelatedTo


- Copy & paste the above after EDGE: is_pos_of.

In [64]:
results = conn.gsql('CREATE GRAPH ConceptNet(synset, word, part_of_speech,has_pos,is_pos_of,Antonym,Synonym,RelatedTo)')
print(results)

The graph ConceptNet is created.


- Generate AuthToken to update the graph

In [65]:
conn.graphname="ConceptNet"
secret = conn.createSecret()
print(secret)
authToken = conn.getToken(secret)
authToken = authToken[0]
print(authToken)
# authToken = 'rc7reopbis1667ksgcppq5v5fb99p6s1'
conn = tg.TigerGraphConnection(host=hostName, graphname="ConceptNet", username=userName, password=password, apiToken=authToken)

def pprint(string):
  print(json.dumps(string, indent=2))

pftfgjc3fk90sa3g6b5uf5jv33gdb08f
jfkjtupbfj5nd2ecn1e4hfpfvlm18h8u


## Create Loading Jobs

- Generate edge queries for the relation types included: **['Antonym','Synonym','RelatedTo']** to include in the load job gsql.

- Here it's important to note that the `$0`, `$1` values line up with the columns of your data.
In this example:
  - `$0` is the `uri` column,
  - `$1` is `id`,
  - `$2` is `word`
  - and so on

In [66]:
rel_type = pd.read_csv('/content/drive/MyDrive/LanguageModel/Data_Processing/csv_imports/encoded/type_rel.csv')
rel = rel_type['type']


str_=""
# for r in rel:
for r in rel_included:
  orig=r
  if '/' in r:
    r = r.replace('/','_')
  str_+="LOAD MyDataSource TO EDGE  "+r+" VALUES($0 synset, $1 synset) WHERE $4 == \""+orig+"\" USING SEPARATOR=\",\""", HEADER=\"true\", EOL=""\"\\n"""
  str_+="\";\n"
  str_+="LOAD MyDataSource TO EDGE  "+r+" VALUES($0 synset, $1 word) WHERE $4 == \""+orig+"\" USING SEPARATOR=\",\""", HEADER=\"true\", EOL=""\"\\n"""
  str_+="\";\n"
  str_+="LOAD MyDataSource TO EDGE  "+r+" VALUES($0 word, $1 synset) WHERE $4 == \""+orig+"\" USING SEPARATOR=\",\""", HEADER=\"true\", EOL=""\"\\n"""
  str_+="\";\n"
  str_+="LOAD MyDataSource TO EDGE  "+r+" VALUES($0 word, $1 word) WHERE $4 == \""+orig+"\" USING SEPARATOR=\",\""", HEADER=\"true\", EOL=""\"\\n"""
  str_+="\";\n"
print(str_)

LOAD MyDataSource TO EDGE  Antonym VALUES($0 synset, $1 synset) WHERE $4 == "Antonym" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  Antonym VALUES($0 synset, $1 word) WHERE $4 == "Antonym" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  Antonym VALUES($0 word, $1 synset) WHERE $4 == "Antonym" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  Antonym VALUES($0 word, $1 word) WHERE $4 == "Antonym" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  Synonym VALUES($0 synset, $1 synset) WHERE $4 == "Synonym" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  Synonym VALUES($0 synset, $1 word) WHERE $4 == "Synonym" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  Synonym VALUES($0 word, $1 synset) WHERE $4 == "Synonym" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  Synonym VALUES($0 word, $1 word) WHERE $4 == "Synonym" USING 

- Copy & Paste the above inside load_job_relationships gsql block.

In [67]:
results = conn.gsql('''
  USE GRAPH ConceptNet
  BEGIN

  CREATE LOADING JOB load_job_relationships FOR GRAPH ConceptNet {
        DEFINE FILENAME MyDataSource;

LOAD MyDataSource TO EDGE  Antonym VALUES($0 synset, $1 synset) WHERE $4 == "Antonym" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  Antonym VALUES($0 synset, $1 word) WHERE $4 == "Antonym" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  Antonym VALUES($0 word, $1 synset) WHERE $4 == "Antonym" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  Antonym VALUES($0 word, $1 word) WHERE $4 == "Antonym" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  Synonym VALUES($0 synset, $1 synset) WHERE $4 == "Synonym" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  Synonym VALUES($0 synset, $1 word) WHERE $4 == "Synonym" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  Synonym VALUES($0 word, $1 synset) WHERE $4 == "Synonym" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  Synonym VALUES($0 word, $1 word) WHERE $4 == "Synonym" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  RelatedTo VALUES($0 synset, $1 synset) WHERE $4 == "RelatedTo" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  RelatedTo VALUES($0 synset, $1 word) WHERE $4 == "RelatedTo" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  RelatedTo VALUES($0 word, $1 synset) WHERE $4 == "RelatedTo" USING SEPARATOR=",", HEADER="true", EOL="\n";
LOAD MyDataSource TO EDGE  RelatedTo VALUES($0 word, $1 word) WHERE $4 == "RelatedTo" USING SEPARATOR=",", HEADER="true", EOL="\n";

      }


  CREATE LOADING JOB load_job_pos_wn FOR GRAPH ConceptNet {
        DEFINE FILENAME MyDataSource;
        LOAD MyDataSource TO VERTEX part_of_speech VALUES($2, $1) USING SEPARATOR=",", HEADER="true", EOL="\n";
      }


  CREATE LOADING JOB load_job_synsets_encoded FOR GRAPH ConceptNet {
        DEFINE FILENAME MyDataSource;
        LOAD MyDataSource TO VERTEX synset VALUES($0, $2, $1, $4) USING SEPARATOR=",", HEADER="true", EOL="\n", QUOTE="double";
      }

  CREATE LOADING JOB load_job_words_encoded FOR GRAPH ConceptNet {
        DEFINE FILENAME MyDataSource;
        LOAD MyDataSource TO VERTEX word VALUES($0, $1, $2, $3, $4) USING SEPARATOR=",", HEADER="true", EOL="\n", QUOTE="double";
        LOAD MyDataSource TO EDGE has_pos VALUES($0 word, $2 part_of_speech) USING SEPARATOR=",", HEADER="true", EOL="\n", QUOTE="double";
        LOAD MyDataSource TO EDGE is_pos_of VALUES($2 part_of_speech, $0 word) USING SEPARATOR=",", HEADER="true", EOL="\n", QUOTE="double";
        LOAD MyDataSource TO EDGE has_pos VALUES($0 synset, $2 part_of_speech) USING SEPARATOR=",", HEADER="true", EOL="\n", QUOTE="double";
        LOAD MyDataSource TO EDGE is_pos_of VALUES($2 part_of_speech, $0 synset) USING SEPARATOR=",", HEADER="true", EOL="\n", QUOTE="double";
      }


  END
  ''')
print(results)

Using graph 'ConceptNet'
Successfully created loading jobs: [load_job_relationships].
Successfully created loading jobs: [load_job_pos_wn].
Successfully created loading jobs: [load_job_synsets_encoded].
Successfully created loading jobs: [load_job_words_encoded].


## Load Data

### load relationships

In [68]:
# Load the posts file wiht the 'load_words' job
load_words = '/content/drive/MyDrive/LanguageModel/Data_Processing/csv_imports/relationships.csv'
results = conn.uploadFile(load_words, timeout=500000,fileTag='MyDataSource', jobName='load_job_relationships')
print(json.dumps(results, indent=2))

[
  {
    "sourceFileName": "Online_POST",
    "statistics": {
      "validLine": 3406450,
      "rejectLine": 0,
      "failedConditionLine": 0,
      "notEnoughToken": 0,
      "invalidJson": 0,
      "oversizeToken": 0,
      "vertex": [],
      "edge": [
        {
          "typeName": "Antonym",
          "validObject": 18020,
          "noIdFound": 0,
          "invalidAttribute": 0,
          "invalidVertexType": 0,
          "invalidPrimaryId": 0,
          "invalidSecondaryId": 0,
          "incorrectFixedBinaryLength": 0,
          "passedCondition": 18020,
          "failedCondition": 3388430,
          "failedConditionLines": [
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10,
            11,
            12,
            13,
            14,
            15,
            16,
            17,
            18,
            19,
            20,
            21,
            22,
         

### load pos

In [69]:
# Load the posts file wiht the 'load_words' job
load_words = '/content/drive/MyDrive/LanguageModel/Data_Processing/csv_imports/encoded/pos_wn.csv'
results = conn.uploadFile(load_words, timeout=100000,fileTag='MyDataSource', jobName='load_job_pos_wn')
print(json.dumps(results, indent=2))

[
  {
    "sourceFileName": "Online_POST",
    "statistics": {
      "validLine": 6,
      "rejectLine": 0,
      "failedConditionLine": 0,
      "notEnoughToken": 0,
      "invalidJson": 0,
      "oversizeToken": 0,
      "vertex": [
        {
          "typeName": "part_of_speech",
          "validObject": 6,
          "noIdFound": 0,
          "invalidAttribute": 0,
          "invalidVertexType": 0,
          "invalidPrimaryId": 0,
          "invalidSecondaryId": 0,
          "incorrectFixedBinaryLength": 0
        }
      ],
      "edge": [],
      "deleteVertex": [],
      "deleteEdge": []
    }
  }
]


### load words and synsets

In [70]:
# Load the posts file wiht the 'load_edges' job
load_edges = '/content/drive/MyDrive/LanguageModel/Data_Processing/csv_imports/encoded/synsets-encoded.csv'
results = conn.uploadFile(load_edges, timeout=100000, fileTag='MyDataSource', jobName='load_job_synsets_encoded')
print(json.dumps(results, indent=2))

[
  {
    "sourceFileName": "Online_POST",
    "statistics": {
      "validLine": 117660,
      "rejectLine": 0,
      "failedConditionLine": 0,
      "notEnoughToken": 0,
      "invalidJson": 0,
      "oversizeToken": 0,
      "vertex": [
        {
          "typeName": "synset",
          "validObject": 117659,
          "noIdFound": 1,
          "invalidAttribute": 0,
          "invalidVertexType": 0,
          "invalidPrimaryId": 0,
          "invalidSecondaryId": 0,
          "incorrectFixedBinaryLength": 0
        }
      ],
      "edge": [],
      "deleteVertex": [],
      "deleteEdge": []
    }
  }
]


In [71]:
# Load the posts file wiht the 'load_edges' job
load_edges = '/content/drive/MyDrive/LanguageModel/Data_Processing/csv_imports/encoded/words_encoded.csv'
results = conn.uploadFile(load_edges, timeout=100000, fileTag='MyDataSource', jobName='load_job_words_encoded')
print(json.dumps(results, indent=2))

[
  {
    "sourceFileName": "Online_POST",
    "statistics": {
      "validLine": 1530137,
      "rejectLine": 0,
      "failedConditionLine": 0,
      "notEnoughToken": 0,
      "invalidJson": 0,
      "oversizeToken": 0,
      "vertex": [
        {
          "typeName": "word",
          "validObject": 1530135,
          "noIdFound": 2,
          "invalidAttribute": 0,
          "invalidVertexType": 0,
          "invalidPrimaryId": 0,
          "invalidSecondaryId": 0,
          "incorrectFixedBinaryLength": 0
        }
      ],
      "edge": [
        {
          "typeName": "has_pos",
          "validObject": 1530135,
          "noIdFound": 2,
          "invalidAttribute": 0,
          "invalidVertexType": 0,
          "invalidPrimaryId": 0,
          "invalidSecondaryId": 0,
          "incorrectFixedBinaryLength": 0
        },
        {
          "typeName": "has_pos",
          "validObject": 1530135,
          "noIdFound": 2,
          "invalidAttribute": 0,
          "invalidVe

## Exploring the Graph

### Get Vertex and Edge Schema

In [72]:
results = conn.getVertexTypes()
print(f"Verticies: {results}")
vertices = results

results = conn.getEdgeTypes()
print(f"Edges: {results}")
edges = results

Verticies: ['synset', 'word', 'part_of_speech']
Edges: ['has_pos', 'is_pos_of', 'Antonym', 'Synonym', 'RelatedTo']


In [73]:

print(f"Results for Post vertex")
pprint(conn.getVertexType("word"))

print("-----------------")
print(f"Results for liked edge")
pprint(conn.getEdgeType("type"))


Results for Post vertex
{
  "Config": {
    "TAGGABLE": false,
    "STATS": "OUTDEGREE_BY_EDGETYPE",
    "PRIMARY_ID_AS_ATTRIBUTE": true
  },
  "Attributes": [
    {
      "AttributeType": {
        "Name": "STRING"
      },
      "IsPartOfCompositeKey": false,
      "PrimaryIdAsAttribute": false,
      "AttributeName": "name",
      "HasIndex": false,
      "internalAttribute": false,
      "IsPrimaryKey": false
    },
    {
      "AttributeType": {
        "Name": "STRING"
      },
      "IsPartOfCompositeKey": false,
      "PrimaryIdAsAttribute": false,
      "AttributeName": "pos",
      "HasIndex": false,
      "internalAttribute": false,
      "IsPrimaryKey": false
    },
    {
      "AttributeType": {
        "Name": "STRING"
      },
      "IsPartOfCompositeKey": false,
      "PrimaryIdAsAttribute": false,
      "AttributeName": "conceptUri",
      "HasIndex": false,
      "internalAttribute": false,
      "IsPrimaryKey": false
    },
    {
      "AttributeType": {
        "Nam

## Counting Data

In [74]:
print("Vertex Counts")
for vertex in vertices:
  print(f"There are {conn.getVertexCount(vertex)} {vertex} vertices in the graph")

print("--------------")
print("Edge Counts")
for edge in edges:
  print(f"There are {conn.getEdgeCount(edge)} {edge} edges in the graph")

Vertex Counts
There are 1074988 synset vertices in the graph
There are 1317511 word vertices in the graph
There are 8 part_of_speech vertices in the graph
--------------
Edge Counts
There are 926323 has_pos edges in the graph
There are 3060270 is_pos_of edges in the graph
There are 72080 Antonym edges in the graph
There are 860264 Synonym edges in the graph
There are 6301092 RelatedTo edges in the graph


# Clear the Whole Graph
DANGER ZONE

In [77]:
conn.gsql('''
USE GLOBAL
DROP ALL
''')

'Dropping all, about 1 minute ...\nAbort all active loading jobs\nTry to abort all loading jobs on graph ConceptNet, it may take a while ...\n[ABORT_SUCCESS] No active Loading Job to abort.\nResetting GPE...\nSuccessfully reset GPE and GSE\nStopping GPE GSE\nSuccessfully stopped GPE GSE in 0.003 seconds\nClearing graph store...\nSuccessfully cleared graph store\nStarting GPE GSE RESTPP\nSuccessfully started GPE GSE RESTPP in 0.057 seconds\nEverything is dropped.'