### Script to create numerical encoding for categorical columns and create separate files for the encoded categories.

- Outputs:

    csv_imports/encoded

In [1]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import collections

import pandas as pd 

# Integer-encode the ':TYPE' column of the WordNet edge list.
# Produces two files under csv_imports/encoded:
#   type_wn.csv           label -> id lookup table
#   WN-edges-encoded.csv  the edge list with ':TYPE' replaced by its id
edge_col = ':TYPE'

df = pd.read_csv('csv_imports/WN-edges.csv', index_col=[0])
print('Reading edges:\n')
print(df.head(1), end='\n\n')

# Distinct labels in order of first appearance. Taken before the categorical
# conversion so the result is a plain array of the raw values.
labels = df[edge_col].unique()
print(labels)

# Category codes, also in order of first appearance, so position i of
# `codes` belongs to position i of `labels`.
df[edge_col] = df[edge_col].astype('category')
codes = df[edge_col].cat.codes.unique()
print('type code generated =>type_wn.csv')
print(codes, end='\n\n')

# Side-by-side lookup table: one row per label with its numeric id.
lookup = pd.concat(
    [pd.Series(labels, name='type'), pd.Series(codes, name='id')],
    axis=1,
)
lookup.to_csv('csv_imports/encoded/type_wn.csv')

# Replace the labels in the edge list with their integer encoding.
df[edge_col] = LabelEncoder().fit_transform(df[edge_col])
df.to_csv('csv_imports/encoded/WN-edges-encoded.csv')
print('Numerical encoding completed: Edges(TYPE) => wn-edges-encoded.csv\n')
print(df.head(1), end='\n\n')

Reading edges:

               :END_ID      :TYPE
:START_ID                        
00001740-a  05207437-n  attribute

['attribute' 'hyponym' 'entails' 'similar' 'hypernym' 'domain_topic'
 'mero_part' 'exemplifies' 'has_domain_topic' 'also' 'mero_substance'
 'domain_region' 'holo_part' 'holo_member' 'causes' 'instance_hyponym'
 'instance_hypernym' 'mero_member' 'is_exemplified_by' 'holo_substance'
 'has_domain_region' 'antonym']
type code generated =>type_wn.csv
[ 2 14  6 21 13  5 19  7  9  0 20  4 11 10  3 16 15 18 17 12  8  1]

Numerical encoding completed: Edges(TYPE) => wn-edges-encoded.csv

               :END_ID  :TYPE
:START_ID                    
00001740-a  05207437-n      2



In [2]:
# Integer-encode the 'pos' column of the WordNet node list.
# Produces two files under csv_imports/encoded:
#   pos_wn.csv            label -> id lookup table
#   WN-nodes-encoded.csv  the node list with 'pos' replaced by its id
pos_col = 'pos'

df = pd.read_csv('csv_imports/WN-nodes.csv', index_col=[0])
print('Reading nodes:\n')
print(df.head(1), end='\n\n')

# Distinct labels in order of first appearance (read before the column is
# converted to a categorical, so this is a plain array).
pos_labels = df[pos_col].unique()
print(pos_labels)

# Matching category codes, in the same first-appearance order.
df[pos_col] = df[pos_col].astype('category')
pos_codes = df[pos_col].cat.codes.unique()
print('type code generated =>pos_wn.csv')
print(pos_codes, end='\n\n')

# Lookup table pairing each label with its id, row by row.
pos_lookup = pd.concat(
    [pd.Series(pos_labels, name='type'), pd.Series(pos_codes, name='id')],
    axis=1,
)
pos_lookup.to_csv('csv_imports/encoded/pos_wn.csv')

# Swap the labels in the node list for their integer encoding and save.
df[pos_col] = LabelEncoder().fit_transform(df[pos_col])
df.to_csv('csv_imports/encoded/WN-nodes-encoded.csv')
print('Numerical encoding completed: Nodes(pos) => wn-nodes-encoded.csv\n')
print(df.head(1), end='\n\n')

Reading nodes:

                                                        id  word        pos  \
uri                                                                           
http://wordnet-rdf.princeton.edu/id/00001740-a  00001740-a  able  adjective   

                                                                                       definition  \
uri                                                                                                 
http://wordnet-rdf.princeton.edu/id/00001740-a  (usually followed by `to') having the necessar...   

                                                subject  
uri                                                      
http://wordnet-rdf.princeton.edu/id/00001740-a  adj.all  

['adjective' 'noun' 'adverb' 'verb' 'adjective_satellite']
type code generated =>pos_wn.csv
[0 3 2 4 1]

Numerical encoding completed: Nodes(pos) => wn-nodes-encoded.csv

                                                        id  word  pos  \
uri                    

In [3]:
# Integer-encode the 'subject' column of the already pos-encoded WordNet nodes.
# Reads csv_imports/encoded/WN-nodes-encoded.csv, writes the label -> id lookup
# to subject_wn.csv, and overwrites WN-nodes-encoded.csv with 'subject' encoded.
df = pd.read_csv('csv_imports/encoded/WN-nodes-encoded.csv',index_col=[0])
print('Reading nodes:\n')
print(df.head(1))
print()

# This cell overwrites the file it reads. If it runs a second time without the
# file being regenerated, 'subject' already holds integers and the lookup file
# would be rewritten as an int -> int table, losing the original label names.
# Refuse to continue in that case instead of silently clobbering the lookup.
if pd.api.types.is_numeric_dtype(df['subject']):
    raise ValueError(
        "'subject' in csv_imports/encoded/WN-nodes-encoded.csv is already numeric; "
        "regenerate that file from csv_imports/WN-nodes.csv before encoding "
        "'subject' again."
    )

# extract type codes (unique for every type).
# Labels are collected before the categorical conversion, in order of first appearance.
TYPE_CAT = df['subject'].unique()
print(TYPE_CAT)

# Category codes in the same first-appearance order, so TYPE[i] pairs with TYPE_CAT[i].
df['subject'] = df['subject'].astype('category')
TYPE = df['subject'].cat.codes.unique()
print('type code generated =>subject_wn.csv')
print(TYPE)
print()

df1=pd.DataFrame(TYPE_CAT,columns=['type'])
df2=pd.DataFrame(TYPE,columns=['id'])

# Lookup table: one row per label with its numeric id.
df_concat = pd.concat([df1,df2], axis=1)
df_concat.to_csv('csv_imports/encoded/subject_wn.csv')

#encode the column type
# integer encode
label_encoder = LabelEncoder()
df['subject'] = label_encoder.fit_transform(df['subject'])
df.to_csv('csv_imports/encoded/WN-nodes-encoded.csv')
print('Numerical encoding completed: Nodes(Subject) => wn-nodes-encoded.csv\n')
print(df.head(1))
print()

Reading nodes:

                                                        id  word  pos  \
uri                                                                     
http://wordnet-rdf.princeton.edu/id/00001740-a  00001740-a  able    0   

                                                                                       definition  \
uri                                                                                                 
http://wordnet-rdf.princeton.edu/id/00001740-a  (usually followed by `to') having the necessar...   

                                                subject  
uri                                                      
http://wordnet-rdf.princeton.edu/id/00001740-a  adj.all  

['adj.all' 'noun.Tops' 'adv.all' 'verb.body' 'noun.act' 'verb.change'
 'verb.cognition' 'verb.communication' 'verb.competition'
 'verb.consumption' 'verb.contact' 'noun.animal' 'verb.creation'
 'verb.emotion' 'verb.motion' 'verb.perception' 'verb.possession'
 'verb.social' 'adj.pert' 

In [4]:
# Integer-encode the 'pos:string' column of the synset node file.
# Produces two files under csv_imports/encoded:
#   pos_synset.csv       label -> id lookup table
#   synsets-encoded.csv  the synsets with 'pos:string' replaced by its id
synset_col = 'pos:string'

df = pd.read_csv('csv_imports/synsets.csv', index_col=[0])
print('Reading nodes:\n')
print(df.head(1), end='\n\n')

# Distinct labels in order of first appearance (read before the column is
# converted to a categorical, so this is a plain array).
synset_labels = df[synset_col].unique()
print(synset_labels)

# Matching category codes, in the same first-appearance order.
df[synset_col] = df[synset_col].astype('category')
synset_codes = df[synset_col].cat.codes.unique()
print('type code generated =>pos_synset.csv')
print(synset_codes, end='\n\n')

# Show both halves of the lookup table, then store them side by side.
label_frame = pd.DataFrame({'type': synset_labels})
code_frame = pd.DataFrame({'id': synset_codes})
print(label_frame)
print(code_frame)
pd.concat([label_frame, code_frame], axis=1).to_csv('csv_imports/encoded/pos_synset.csv')

# Swap the labels in the synset table for their integer encoding and save.
df[synset_col] = LabelEncoder().fit_transform(df[synset_col])
df.to_csv('csv_imports/encoded/synsets-encoded.csv')
print('Numerical encoding completed: Synset(pos) => synsets-encoded.csv\n')
print(df.head(1), end='\n\n')

Reading nodes:

          pos:string                                  definition:string  \
id:ID                                                                     
able.a.01          a  (usually followed by `to') having the necessar...   

           :LABEL  
id:ID              
able.a.01  Synset  

['a' 's' 'r' 'n' 'v']
type code generated =>pos_synset.csv
[0 3 2 1 4]

  type
0    a
1    s
2    r
3    n
4    v
   id
0   0
1   3
2   2
3   1
4   4
Numerical encoding completed: Synset(pos) => synsets-encoded.csv

           pos:string                                  definition:string  \
id:ID                                                                      
able.a.01           0  (usually followed by `to') having the necessar...   

           :LABEL  
id:ID              
able.a.01  Synset  



In [5]:
# Integer-encode the 'dataset:string' column of the relationship (edge) file.
# Writes the label -> id lookup to dataset_rel.csv and the encoded relationships
# to relationships_encoded.csv, both under csv_imports/encoded.
df = pd.read_csv('csv_imports/relationships.csv',index_col=[0])
# This file holds relationships, not nodes; label the preview accordingly.
print('Reading relationships:\n')
print(df.head(1))
print()

# extract type codes (unique for every type).
# Labels are collected before the categorical conversion, in order of first appearance.
TYPE_CAT = df['dataset:string'].unique()
print(TYPE_CAT)

# Category codes in the same first-appearance order, so TYPE[i] pairs with TYPE_CAT[i].
df['dataset:string'] = df['dataset:string'].astype('category')
TYPE = df['dataset:string'].cat.codes.unique()
print('type code generated =>dataset_rel.csv')
print(TYPE)
print()

df1=pd.DataFrame(TYPE_CAT,columns=['type'])
df2=pd.DataFrame(TYPE,columns=['id'])
print(df1)
print(df2)

# Lookup table: one row per label with its numeric id.
df_concat = pd.concat([df1,df2], axis=1)
df_concat.to_csv('csv_imports/encoded/dataset_rel.csv')


#encode the column type
# integer encode
label_encoder = LabelEncoder()
df['dataset:string'] = label_encoder.fit_transform(df['dataset:string'])
df.to_csv('csv_imports/encoded/relationships_encoded.csv')
# The message names the column actually encoded here (it previously said "Synset(pos)").
print('Numerical encoding completed: relationships(dataset:string) => relationships_encoded.csv\n')
print(df.head(1))
print()

Reading nodes:

                :END_ID  dataset:string  weight:double      :TYPE
:START_ID                                                        
able.a.01  ability.n.01  /d/wordnet/3.1            2.0  Attribute

['/d/wordnet/3.1' '/d/wiktionary/fr' '/d/wiktionary/en' '/d/verbosity'
 '/d/conceptnet/4/en' '/d/dbpedia/en' '/d/wiktionary/de' '/d/opencyc']
type code generated =>dataset_rel.csv
[7 6 5 3 0 1 4 2]

                 type
0      /d/wordnet/3.1
1    /d/wiktionary/fr
2    /d/wiktionary/en
3        /d/verbosity
4  /d/conceptnet/4/en
5       /d/dbpedia/en
6    /d/wiktionary/de
7          /d/opencyc
   id
0   7
1   6
2   5
3   3
4   0
5   1
6   4
7   2
Numerical encoding completed: Synset(pos) => relationships_encoded.csv

                :END_ID  dataset:string  weight:double      :TYPE
:START_ID                                                        
able.a.01  ability.n.01               7            2.0  Attribute



In [6]:
# Integer-encode the ':TYPE' column of the already dataset-encoded relationships.
# Reads csv_imports/encoded/relationships_encoded.csv, writes the label -> id
# lookup to type_rel.csv, and overwrites relationships_encoded.csv with ':TYPE' encoded.
df = pd.read_csv('csv_imports/encoded/relationships_encoded.csv',index_col=[0])
# This file holds relationships, not nodes; label the preview accordingly.
print('Reading relationships:\n')
print(df.head(1))
print()

# This cell overwrites the file it reads. If it runs a second time without the
# file being regenerated, ':TYPE' already holds integers and the lookup file
# would be rewritten as an int -> int table, losing the original label names.
# Refuse to continue in that case instead of silently clobbering the lookup.
if pd.api.types.is_numeric_dtype(df[':TYPE']):
    raise ValueError(
        "':TYPE' in csv_imports/encoded/relationships_encoded.csv is already numeric; "
        "regenerate that file from csv_imports/relationships.csv before encoding "
        "':TYPE' again."
    )

# extract type codes (unique for every type).
# Labels are collected before the categorical conversion, in order of first appearance.
TYPE_CAT = df[':TYPE'].unique()
print(TYPE_CAT)

# Category codes in the same first-appearance order, so TYPE[i] pairs with TYPE_CAT[i].
df[':TYPE'] = df[':TYPE'].astype('category')
TYPE = df[':TYPE'].cat.codes.unique()
print('type code generated =>type_rel.csv')
print(TYPE)
print()

df1=pd.DataFrame(TYPE_CAT,columns=['type'])
df2=pd.DataFrame(TYPE,columns=['id'])
print(df1)
print(df2)

# Lookup table: one row per label with its numeric id.
df_concat = pd.concat([df1,df2], axis=1)
df_concat.to_csv('csv_imports/encoded/type_rel.csv')


#encode the column type
# integer encode
label_encoder = LabelEncoder()
df[':TYPE'] = label_encoder.fit_transform(df[':TYPE'])
df.to_csv('csv_imports/encoded/relationships_encoded.csv')
print('Numerical encoding completed: relationships(:TYPE) => relationships_encoded.csv\n')
print(df.head(1))
print()

Reading nodes:

                :END_ID  dataset:string  weight:double      :TYPE
:START_ID                                                        
able.a.01  ability.n.01               7            2.0  Attribute

['Attribute' 'InSynset' 'Domain' 'SimilarTo' 'AlsoSee' 'IsA' 'PartOf'
 'Entailment' 'VerbGroup' 'Cause' 'Antonym' 'AtLocation' 'CapableOf'
 'Causes' 'CausesDesire' 'CreatedBy' 'DefinedAs' 'DerivedFrom' 'Desires'
 'DistinctFrom' 'Entails' 'EtymologicallyDerivedFrom'
 'EtymologicallyRelatedTo' 'FormOf' 'HasA' 'HasContext' 'HasFirstSubevent'
 'HasLastSubevent' 'HasPrerequisite' 'HasProperty' 'HasSubevent'
 'InstanceOf' 'LocatedNear' 'MadeOf' 'MannerOf' 'MotivatedByGoal'
 'NotCapableOf' 'NotDesires' 'NotHasProperty' 'ReceivesAction' 'RelatedTo'
 'SymbolOf' 'Synonym' 'UsedFor' 'dbpedia/capital' 'dbpedia/field'
 'dbpedia/genre' 'dbpedia/genus' 'dbpedia/influencedBy' 'dbpedia/knownFor'
 'dbpedia/language' 'dbpedia/leader' 'dbpedia/occupation'
 'dbpedia/product']
type code generated

In [7]:
# Integer-encode the 'pos' column of the word node file.
# Writes the label -> id lookup to pos_words.csv and the encoded words to
# words_encoded.csv, both under csv_imports/encoded.
df = pd.read_csv('csv_imports/words.csv',index_col=[0])
print('Reading nodes:\n')
print(df.head(1))
print()

# extract type codes (unique for every type).
# Labels are collected before the categorical conversion, in order of first
# appearance; missing values show up here as NaN.
TYPE_CAT = df['pos'].unique()
print(TYPE_CAT)

# Category codes in the same first-appearance order, so TYPE[i] pairs with
# TYPE_CAT[i]. pandas reports a missing value as code -1.
df['pos'] = df['pos'].astype('category')
TYPE = df['pos'].cat.codes.unique()
print('type code generated =>pos_words.csv')
print(TYPE)
print()

df1=pd.DataFrame(TYPE_CAT,columns=['type'])
df2=pd.DataFrame(TYPE,columns=['id'])
print(df1)
print(df2)

# Lookup table: one row per label (including NaN -> -1) with its numeric id.
df_concat = pd.concat([df1,df2], axis=1)
df_concat.to_csv('csv_imports/encoded/pos_words.csv')


#encode the column type
# integer encode
# 'pos' contains missing values. The lookup table above records them as -1, so
# the data column must use the same categorical codes. A separately fitted
# LabelEncoder does not use -1 for NaN (depending on the scikit-learn version it
# rejects NaN or gives it its own non-negative class), which would leave the
# encoded file disagreeing with pos_words.csv. For non-missing labels the codes
# are unchanged, since both approaches number the sorted labels from 0.
df['pos'] = df['pos'].cat.codes
df.to_csv('csv_imports/encoded/words_encoded.csv')
# The message names the column actually encoded here (it previously said ":TYPE").
print('Numerical encoding completed: words(pos) => words_encoded.csv\n')
print(df.head(1))
print()

Reading nodes:

        name pos    conceptUri         :LABEL
id:ID                                        
able.a  able   a  /c/en/able/a  Lemma;Concept

['a' 's' 'r' 'n' 'v' nan]
type code generated =>pos_words.csv
[ 0  3  2  1  4 -1]

  type
0    a
1    s
2    r
3    n
4    v
5  NaN
   id
0   0
1   3
2   2
3   1
4   4
5  -1
Numerical encoding completed: words(:TYPE) => words_encoded.csv

        name  pos    conceptUri         :LABEL
id:ID                                         
able.a  able    0  /c/en/able/a  Lemma;Concept

