In [1]:
from stackapi import StackAPI
import os
import json
import time
import pandas as pd
import numpy as np

# 2 - Preprocess meanings

In [2]:
# CSV file holding the tag / meaning table that the next cell reads.
path = '../Dati wiki/pulito_nodes.csv'

In [3]:
# Load the tag / meaning table; malformed rows are skipped with a warning
# instead of aborting the read.
# `on_bad_lines="warn"` is the replacement for `error_bad_lines=False`, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv(path, on_bad_lines="warn")

In [4]:
# Show the loaded frame as the cell output.
df

Unnamed: 0,Tag,Significati
0,fridaysforfuture,"School Strike for Climate Swedish, also known ..."
1,climatestrike,"School Strike for Climate Swedish, also known ..."
2,climateaction,Climate action (or climate change action) refe...
3,climatecrisis,Climate crisis is a term describing global war...
4,climatechange,Contemporary climate change includes both glo...
...,...,...
94,animals,"Animals are multicellular, eukaryotic organism..."
95,governo,Governo is a winemaking technique reportedly i...
96,emobility,EVs first came into existence in the mid-19th ...
97,chile,"Chile, officially the Republic of Chile, is a ..."


In [5]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [6]:
# Pull the two columns used downstream: the tag names and their textual meanings.
tags, meaning = df['Tag'], df['Significati']

In [7]:
def clean_text(corpus):
    """Normalise every phrase of ``corpus`` so it can be split on whitespace.

    Steps applied to each phrase, in order:
      1. every URL (anything starting with ``http``/``https``) becomes ``url``;
      2. the HTML entities ``&quot;``, ``&#39;`` (with or without the trailing
         semicolon), ``&amp;``, ``&lt;`` and ``&gt;`` are removed;
      3. line breaks become a single space so adjacent words are not glued;
      4. punctuation characters become spaces;
      5. every run of digits becomes ``number ``;
      6. the text is lower-cased;
      7. runs of two or more whitespace characters collapse to one space.

    Args:
        corpus: iterable of strings.

    Returns:
        A list with one single-element list ``[cleaned_phrase]`` per input
        phrase, in input order.
    """
    # Patterns are compiled once, outside the loop. Raw strings avoid the
    # invalid-escape warnings that "\S" / "\d" / "\s" trigger in plain literals.
    url_pattern = re.compile(r"https*\S+")
    # The previous pattern contained `\n\|&#39`: the escaped pipe turned two
    # alternatives into one literal, so neither a newline nor `&#39` matched alone.
    entity_pattern = re.compile(r"&quot;|&#39;?|&amp;|&lt;|&gt;")
    line_break_pattern = re.compile(r"[\r\n]+")
    punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    number_pattern = re.compile(r"\d+")
    extra_space_pattern = re.compile(r"\s{2,}")

    clear_text = []
    for phrase in corpus:
        # Replace URLs with the tag "url"
        phrase = url_pattern.sub("url", phrase)
        # Drop HTML entities before punctuation removal would split them up
        phrase = entity_pattern.sub("", phrase)
        # A line break separates words, so it is replaced by a space
        phrase = line_break_pattern.sub(" ", phrase)
        # Punctuation removal
        phrase = punctuation_pattern.sub(' ', phrase)
        # Replace numbers with the tag "number"
        phrase = number_pattern.sub("number ", phrase)
        # Convert to lowercase
        phrase = phrase.lower()
        # Collapse repeated whitespace
        phrase = extra_space_pattern.sub(" ", phrase)
        clear_text.append([phrase])
    return clear_text

In [8]:
# Clean every meaning; the result is what clean_text returns for the whole column.
cleared_text = clean_text(meaning)

In [9]:
import networkx as nx
import itertools
from operator import itemgetter

In [10]:
# Load the network from its GraphML file.
G = nx.read_graphml("../DataSet FFF/Graph_data/Real_Network.graphml")

In [11]:
# Degree of every node, stored on the graph under the 'degree' attribute and
# also ranked as (node, degree) pairs from the highest degree to the lowest.
degree_dict = dict(G.degree(G.nodes()))
nx.set_node_attributes(G, degree_dict, 'degree')
sorted_degree = sorted(
    degree_dict.items(),
    key=lambda node_and_degree: node_and_degree[1],
    reverse=True,
)

In [12]:
# Nodes whose degree is strictly greater than 50, kept in descending-degree order.
# NOTE(review): the echoed count below is 257, not the 597 suggested by the name.
sample_nodes_597 = [node for node, degree in sorted_degree if degree > 50]
print(len(sample_nodes_597))

257


In [13]:
# Nodes whose degree is strictly greater than 30, kept in descending-degree order.
# NOTE(review): the echoed count below is 511, not the 1057 suggested by the name.
sample_nodes_1057 = [node for node, degree in sorted_degree if degree > 30]
print(len(sample_nodes_1057))

511


In [14]:
def select_sample_meaning(sample, meaning, tag_list=None):
    """Pair every sampled node with the meaning(s) recorded for the same tag.

    Args:
        sample: iterable of node names, processed in order.
        meaning: sequence aligned by position with ``tag_list``; each entry is
            itself indexable and its element ``[0]`` is the meaning text.
        tag_list: iterable of tag names aligned with ``meaning``. When omitted,
            the notebook-level ``tags`` variable is used, as before.

    Returns:
        ``(sampled_meaning, sample_nodes)``: two parallel lists. For each node
        of ``sample`` and each position whose tag equals it, ``sampled_meaning``
        gets ``"<node> <meaning text>"`` and ``sample_nodes`` gets the node.
        Nodes without a matching tag are skipped.
    """
    if tag_list is None:
        # Backward-compatible default: fall back to the notebook global.
        tag_list = tags

    # Index the tags once (tag -> positions in original order) instead of
    # rescanning the whole tag list for every sampled node.
    # Positions are positional offsets, which coincide with the labels of a
    # default-indexed Series.
    positions_by_tag = {}
    for position, tag in enumerate(tag_list):
        positions_by_tag.setdefault(tag, []).append(position)

    sampled_meaning = []
    sample_nodes = []
    for node in sample:
        for position in positions_by_tag.get(node, ()):
            sampled_meaning.append(node + " " + meaning[position][0])
            sample_nodes.append(node)
    return sampled_meaning, sample_nodes

In [15]:
# Attach the cleaned meaning to each sampled node.
# Both node lists are rebound to the node list returned by select_sample_meaning.
sampled_meaning_597, sample_nodes_597 = select_sample_meaning(sample_nodes_597, cleared_text)
sampled_meaning_1057, sample_nodes_1057 = select_sample_meaning(sample_nodes_1057, cleared_text)

In [16]:
"""
for i in range(len(sample_nodes_1057)):
    if not isNaN(tags[i]):
        sampled_meaning_1057[i] = sample_nodes_1057[i] + " " + sampled_meaning_1057[i][0]
    else:
        sampled_meaning_1057[i] = "nan " + sampled_meaning_1057[i][0]
"""

'\nfor i in range(len(sample_nodes_1057)):\n    if not isNaN(tags[i]):\n        sampled_meaning_1057[i] = sample_nodes_1057[i] + " " + sampled_meaning_1057[i][0]\n    else:\n        sampled_meaning_1057[i] = "nan " + sampled_meaning_1057[i][0]\n'

In [17]:
# Split each sampled phrase on whitespace, then drop the English stop words.
word_tokens = [phrase.split() for phrase in sampled_meaning_597]
stop_words = stopwords.words('english')
no_sw_597 = [
    [token for token in tokens if token not in stop_words]
    for tokens in word_tokens
]

In [18]:
# Split each sampled phrase on whitespace, then drop the English stop words.
word_tokens = [phrase.split() for phrase in sampled_meaning_1057]
stop_words = stopwords.words('english')
no_sw_1057 = [
    [token for token in tokens if token not in stop_words]
    for tokens in word_tokens
]

In [19]:
# Re-join each token list into one string. Every token is preceded by a space,
# so a non-empty result starts with a space and an empty token list gives "".
cleared_meaning_597 = [
    "".join(" " + token for token in tokens)
    for tokens in no_sw_597
]

In [20]:
# Re-join each token list into one string. Every token is preceded by a space,
# so a non-empty result starts with a space and an empty token list gives "".
cleared_meaning_1057 = [
    "".join(" " + token for token in tokens)
    for tokens in no_sw_1057
]

In [21]:
# Write the sampled tags and their stop-word-free meanings to CSV (no index column).
# NOTE(review): this rebinds `df`, replacing the frame read from `path`; its columns
# are now "Tags"/"Meaning", so the earlier df['Tag'] lookup no longer works on it.
df = pd.DataFrame({"Tags": sample_nodes_597, "Meaning" : cleared_meaning_597})
df.to_csv("nodes_597_meaning.csv", index=False)

In [22]:
# Write the sampled tags and their stop-word-free meanings to CSV (no index column).
# NOTE(review): this rebinds `df` again; the frame read from `path` is not kept.
df = pd.DataFrame({"Tags": sample_nodes_1057, "Meaning" : cleared_meaning_1057})
df.to_csv("nodes_1057_meaning.csv", index=False)

In [23]:
from gensim.models import Word2Vec

In [24]:
# Train a Word2Vec model on the stop-word-free token lists of the first sample.
# min_count=1 is passed so that tokens seen only once are not discarded.
# NOTE(review): no seed/workers are set here — confirm whether runs must be reproducible.
word2vec_597 = Word2Vec(no_sw_597, min_count=1)

In [25]:
# Number of entries held by the trained model's `wv` container.
len(word2vec_597.wv)

1323

In [26]:
def check_isin(word2vec, data):
    """Print how many entries of ``data`` have no vector in ``word2vec.wv``.

    Args:
        word2vec: object exposing a ``wv`` attribute that supports ``wv[key]``
            and raises ``KeyError`` for an unknown key.
        data: iterable of keys to look up.

    Returns:
        None. The number of missing keys is printed.
    """
    not_in = []
    for key in data:
        try:
            word2vec.wv[key]
        except KeyError:
            # Only a failed lookup counts as "missing". Any other error
            # (including KeyboardInterrupt) now propagates instead of being
            # silently counted, as the former bare `except:` did.
            not_in.append(key)
    print(len(not_in))

In [27]:
# Train a Word2Vec model on the stop-word-free token lists of the second sample.
# min_count=1 is passed so that tokens seen only once are not discarded.
# NOTE(review): no seed/workers are set here — confirm whether runs must be reproducible.
word2vec_1057 = Word2Vec(no_sw_1057, min_count=1)

In [28]:
# Number of entries held by the trained model's `wv` container.
len(word2vec_1057.wv)

1538

# 3 - Store word2vec embedding

In [29]:
from linkpred.evaluation import Pair
import pickle

In [30]:
# Subgraphs of G restricted to the nodes of each sample.
H_597 = G.subgraph(sample_nodes_597)
H_1057 = G.subgraph(sample_nodes_1057)

In [31]:
# For every edge of H_597, look up the Word2Vec vector of both endpoints and
# store the two vector columns as a pickled DataFrame.
# The file is written with to_pickle even though its name ends in ".csv".
first, second = [], []
for source, target in H_597.edges():
    first.append(word2vec_597.wv[source])
    second.append(word2vec_597.wv[target])
new_df = pd.DataFrame({"first": first, "second": second}).reset_index(drop=True)
new_df.to_pickle("embedding_pickle_597.csv")

In [32]:
# For every edge of H_1057, look up the Word2Vec vector of both endpoints and
# store the two vector columns as a pickled DataFrame.
# The file is written with to_pickle even though its name ends in ".csv".
first, second = [], []
for source, target in H_1057.edges():
    first.append(word2vec_1057.wv[source])
    second.append(word2vec_1057.wv[target])
new_df = pd.DataFrame({"first": first, "second": second}).reset_index(drop=True)
new_df.to_pickle("embedding_pickle_1057.csv")

In [33]:
# Persist the model trained on the first sample.
word2vec_597.save("word2vec_597.model")

In [34]:
# Persist the model trained on the second sample.
word2vec_1057.save("word2vec_1057.model")

In [35]:
# Candidate-pair universe for H_597: one linkpred Pair for every ordered
# combination of two different nodes, collected into a set.
nodes = list(H_597.nodes())
universe_597 = {
    Pair(couple)
    for couple in itertools.product(nodes, nodes)
    if couple[0] != couple[1]
}

In [36]:
# Candidate-pair universe for H_1057: one linkpred Pair for every ordered
# combination of two different nodes, collected into a set.
nodes = list(H_1057.nodes())
universe_1057 = {
    Pair(couple)
    for couple in itertools.product(nodes, nodes)
    if couple[0] != couple[1]
}

In [37]:
# Serialise both pair universes with pickle, one file per sample.
with open('universe_1057.pickle', 'wb') as f:
    pickle.dump(universe_1057, f)
with open('universe_597.pickle', 'wb') as f:
    pickle.dump(universe_597, f)

In [38]:
# Split the edges of H_597 into two parallel lists of endpoints.
first, second = [], []
for source, target in H_597.edges():
    first.append(source)
    second.append(target)

In [39]:
# Store the endpoint lists as a two-column frame.
# The file is written with to_pickle even though its name ends in ".csv".
new_df = pd.DataFrame({"first": first, "second": second}).reset_index(drop=True)
new_df.to_pickle("edges_original_597.csv")

In [40]:
# Split the edges of H_1057 into two parallel lists of endpoints.
first, second = [], []
for source, target in H_1057.edges():
    first.append(source)
    second.append(target)

In [41]:
# Store the endpoint lists as a two-column frame.
# The file is written with to_pickle even though its name ends in ".csv".
new_df = pd.DataFrame({"first": first, "second": second}).reset_index(drop=True)
new_df.to_pickle("edges_original_1057.csv")

In [42]:
# Export both sampled subgraphs to GraphML files.
nx.write_graphml(H_597,"H_597.graphml")
nx.write_graphml(H_1057,"H_1057.graphml")

-------------------------------------