In [None]:
import networkx as nx
import pandas as pd
import numpy as np
import re
import seaborn as sns
import word2number
import community
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib.colors import ListedColormap
from nltk.corpus import wordnet as wn
from word2number import w2n
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords #<- For calling the know stopwords in english (e.g, articles, connectors)
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# First apply cleansing

Read in the network and apply the following cleansing:

1. Remove numbers
2. Apply stemming to reduce words to their root. This collapses variants such as "compression" and "compressed", which are fundamentally the same word as "compress".
3. Remove stopwords such as "the" and "a".

In [None]:
# Load the raw synonyms table. Later cells read its "lemma",
# "part_of_speech" and "synonyms" columns.
df = pd.read_csv("synonyms.csv")

# Preview the first rows.
df.head()

In [None]:
def numbers(x):
    """Return the numeric value of a spelled-out number, or NaN.

    The conversion is delegated to ``w2n.word_to_num``. When that call
    raises ``ValueError``, ``TypeError`` or ``IndexError`` the input is
    treated as "not a number" and ``np.nan`` is returned instead.
    """
    try:
        parsed = w2n.word_to_num(x)
    except (ValueError, TypeError, IndexError):
        return np.nan
    return parsed

In [None]:
# Numeric value of each lemma when it spells out a number, NaN otherwise.
df["number"] = df["lemma"].apply(numbers)

In [None]:
# Number of distinct lemma values before any cleansing
# (displayed as the shape of the unique-values array).
df["lemma"].unique().shape

In [None]:
# Ordinal-style tokens ("0th", "1rd", "2nd", "3st", ...) for 0-999, one list per suffix.
th_rows, rd_rows, nd_rows, st_rows = (
    [f"{value}{suffix}" for value in range(1000)]
    for suffix in ("th", "rd", "nd", "st")
)
number_rows = ["number " + str(value) for value in range(1000)]

# These two lists are built here but are not added to final_rows in this cell.
years_rows = [f"{decade}s" for decade in range(0, 2021, 10)]
days_rows = [f"{day} days" for day in range(1, 367)]

# Full list of number-like tokens, in the order:
# fixed tokens, "th", "number N", "rd", "nd", "st".
final_rows = [
    "24-hour",
    "-karat",
    *th_rows,
    *number_rows,
    *rd_rows,
    *nd_rows,
    *st_rows,
]

In [None]:
# Keep only the rows whose lemma is not number-like in any recognised form.
lemma_col = df["lemma"]
number_like_pattern = "|".join(final_rows)  # one regex alternation over all tokens

keep_row = (
    lemma_col.notna()                                           # missing lemmas are dropped explicitly
    & lemma_col.str.isnumeric().eq(False)                       # not a digit-only string
    & df["number"].isna()                                       # numbers() found no spelled-out number
    & lemma_col.str.contains(number_like_pattern).eq(False)     # contains none of the listed tokens
)

# .copy() makes no_numbers an independent frame, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning.
no_numbers = df[keep_row].copy()

In [None]:
# Number of distinct lemma values left after the number-like rows were removed.
no_numbers["lemma"].unique().shape

In [None]:
ps = PorterStemmer()

# Stem every lemma; values that are not strings get None.
# assign() returns a new frame instead of writing a column into a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
no_numbers = no_numbers.assign(
    stemmed=no_numbers["lemma"].apply(
        lambda w: ps.stem(w) if isinstance(w, str) else None
    )
)

In [None]:
# Shape of the rows that repeat an earlier (stemmed, part_of_speech) combination.
is_repeat = no_numbers.duplicated(subset=["stemmed", "part_of_speech"])
no_numbers[is_repeat].shape

In [None]:
# Number of distinct lemmas among the rows that repeat an earlier
# (stemmed, part_of_speech) combination.
is_repeat = no_numbers.duplicated(subset=["stemmed", "part_of_speech"])
no_numbers.loc[is_repeat, "lemma"].unique().shape

In [None]:
# Keep only the first row for every (stemmed, part_of_speech) combination.
remove_stemming = no_numbers.drop_duplicates(subset=["stemmed", "part_of_speech"])

In [None]:
# Number of distinct lemma values left after dropping stem duplicates.
remove_stemming["lemma"].unique().shape

In [None]:
# Turn each synonyms string into a list of tokens; both "|" and ";" act as
# separators. Rows without a synonyms string get None.
# regex=False: "|" must be replaced literally, independent of the pandas
# version's default for `regex`.
# assign() returns a new frame instead of writing a column into a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
remove_stemming = remove_stemming.assign(
    split=remove_stemming["synonyms"]
    .str.replace("|", ";", regex=False)
    .apply(lambda x: x.split(";") if isinstance(x, str) else None)
)

Need to pair up words that are synonyms.

In [None]:
# Rows and columns of the de-duplicated table.
remove_stemming.shape

In [None]:
# English stop-word list from the NLTK stopwords corpus.
stops = stopwords.words('english')

In [None]:
# filter out obvious stop words such as "have" "and"
# Build the combined stop-word set once (wordcloud's STOPWORDS plus the NLTK
# list) instead of converting STOPWORDS to a list and scanning it per row.
stop_words = set(STOPWORDS).union(stops)
no_stops = remove_stemming[~remove_stemming["lemma"].isin(stop_words)]

In [None]:
# Number of distinct lemma values left after removing stop words.
no_stops["lemma"].unique().shape

In [None]:
# Rows that actually have a synonyms entry.
has_synonyms = no_stops["synonyms"].notna()
with_syns = no_stops[has_synonyms]

In [None]:
# Number of distinct lemma values that have a synonyms entry.
with_syns["lemma"].unique().shape

In [None]:
# create pairs of words that are synonymous for edges file (which will be used in network analysis)

# Index every synonym token to the lemmas whose synonym list contains it.
# Each lemma is then resolved with one dictionary lookup instead of
# rescanning every row of `with_syns` (twice) per lemma.
lemmas_by_synonym = {}
for comp_lem, syn_list in zip(with_syns["lemma"], with_syns["split"]):
    # dict.fromkeys drops repeated tokens, so a row contributes at most once
    # per token, matching a row-level "token in list" test.
    for token in dict.fromkeys(syn_list):
        lemmas_by_synonym.setdefault(token, []).append(comp_lem)

# For each lemma (in table order) emit [lemma, other_lemma] for every row
# listing it as a synonym (in row order), skipping self-pairs.
pairs = [
    [lem, comp_lem]
    for lem in no_stops["lemma"]
    for comp_lem in lemmas_by_synonym.get(lem, ())
    if lem != comp_lem
]

In [None]:
# reshape(-1, 2) is a no-op for a non-empty list of pairs, and guarantees two
# columns (0 and 1) when `pairs` is empty, so later column lookups still work.
np_pairs = np.array(pairs).reshape(-1, 2)
edges_raw = pd.DataFrame(np_pairs)

In [None]:
# Number of raw (lemma, synonym-lemma) pairs, before de-duplication.
edges_raw.shape

Now that all pairs are in a dataframe, duplicates must be removed: a pair (a, b) and its reverse (b, a) describe the same edge.

In [None]:
# Sort the two words inside every pair so (a, b) and (b, a) become the same row.
sorted_pairs = np.sort(edges_raw[[0, 1]].to_numpy(), axis=1)
df1 = pd.DataFrame(sorted_pairs)

# Keep only the first occurrence of each unordered pair.
is_repeated_pair = df1.duplicated()
edges_raw1 = edges_raw[~is_repeated_pair]

In [None]:
# Number of distinct words appearing in the first column of the de-duplicated pairs.
edges_raw1[0].unique().shape

In [None]:
# Preview the de-duplicated pairs.
edges_raw1.head()

In [None]:
# Name the columns for the edge list and write it to disk.
# rename() without inplace returns a new frame; renaming a filtered slice
# in place can raise pandas' SettingWithCopyWarning.
edges_raw1 = edges_raw1.rename(columns={0: "Source", 1: "Target"})
edges_raw1.to_csv("edges.csv", index=False)

# With edges created, create network of synonyms and perform analysis.

Analysis includes:

1. Degree distribution analysis
2. Analysis of ego network for highest degree node
3. Community detection for ego networks

In [None]:
# NOTE(review): this path differs from the "edges.csv" written earlier in the
# notebook, and `edges_raw` is rebound here to a frame with Source/Target
# columns - confirm the input file is the same edge list.
edges_raw = pd.read_csv("../input/network-edges/edges.csv")

# Convert columns into list of tuples
tup_edges = list(zip(edges_raw["Source"], edges_raw["Target"]))

# list of unique edges, in file order.
# dict.fromkeys de-duplicates while preserving order. A set's iteration order
# depends on string hashing, which changes between interpreter sessions and
# would make the graph's node order (and order-sensitive steps built on it)
# non-reproducible.
edges = list(dict.fromkeys(tup_edges))

In [None]:
# Build the undirected synonym graph directly from the edge list.
Gx = nx.Graph(edges)


In [None]:
# One row per node with its degree, ordered from lowest to highest degree.
# Naming the columns at construction (instead of renaming positional labels
# afterwards) keeps this working when the graph has no nodes.
degrees = (
    pd.DataFrame(list(nx.degree(Gx)), columns=["Node ID", "Degrees"])
    .sort_values("Degrees")
    .reset_index(drop=True)
)
degrees.head()

In [None]:
# Average number of connections per node.
degrees["Degrees"].mean()

In [None]:
fig, ax = plt.subplots(figsize=(20,12)) #<-- Create the subplots

# histplot replaces the deprecated sns.distplot. stat="probability" makes each
# bar the share of nodes in that bin, which is what the percent-formatted
# y axis claims to show (a density is not a share of nodes).
dist = sns.histplot(degrees["Degrees"], stat="probability", kde=True, ax=ax)

ax.set_xlabel('Degree', fontsize=18)
ax.set_ylabel('Percent of Nodes', fontsize=16)
ax.set_title("Degree Distribution", fontsize=16)
# Values are fractions of 1.0; display them as percentages.
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

In [None]:
# Degree centrality per node, as a one-column frame indexed by node.
degree = nx.degree_centrality(Gx)
degree_df = pd.Series(degree, name="Degree Centrality").to_frame()

In [None]:
# The five nodes with the highest degree centrality.
degree_df.sort_values("Degree Centrality",ascending=False).head(5)

The low centrality values indicate that, given the large number of nodes, the network is largely disconnected: it consists of many separate families of synonyms.

In [None]:
# Sub-network of every node within two steps of the node "pass".
ego_graph = nx.ego_graph(Gx, "pass", radius=2)

In [None]:
# spring_layout starts from random positions; a fixed seed makes the figure
# (and the later plot that reuses `layout`) reproducible across runs.
layout = nx.spring_layout(ego_graph, seed=42)
fig, ax = plt.subplots(figsize=(20,12)) #<-- Create the subplots

# Draw explicitly on `ax` rather than relying on the current pyplot axes.
nx.draw_networkx_nodes(ego_graph, layout, node_size=10, alpha=0.5, ax=ax)
nx.draw_networkx_edges(ego_graph, layout, alpha=0.5, ax=ax)
# Highlight the ego node.
nx.draw_networkx_nodes(ego_graph, layout, nodelist=["pass"], node_color='r', node_size=100, ax=ax)
ax.axis('off')

In [None]:
# community detection on words directly related to "pass"
depth1 = nx.ego_graph(Gx, "pass", radius=1)
# Louvain is randomised; fixing random_state keeps the community labels
# stable between runs, which the per-community commentary below relies on.
partition = community.best_partition(depth1, resolution=3, random_state=42)

In [None]:
# Show community distribution
# One column per community id, holding the number of nodes assigned to it.
communities = pd.DataFrame(Counter(partition.values()), index=[0])
# Transpose to one row per community with a single "Count" column.
trans = communities.T.rename(columns={0: "Count"})

# The ten largest communities.
top_communities = trans.sort_values(by=["Count"], ascending=False).head(10)

fig, ax = plt.subplots(figsize=(15,7)) #<-- Create the subplots

sns.barplot(x=top_communities.index, y=top_communities["Count"])

plt.title("Louvain's Community Detection for Pass Ego Network (depth 1)")
ax.set_xlabel('Community', fontsize=18)
ax.set_ylabel('Count', fontsize=16)

Currently these communities do not mean a lot, so further analysis is performed to potentially explain meanings.

In [None]:
# Node -> community id, as a one-column frame indexed by node.
community_df = pd.Series(partition, name="Community").to_frame()

# First ten members of community 0.
community_df.loc[community_df["Community"].eq(0)].head(10)

Community 0 appears to be related to dying. This makes sense as a family of words connected to "pass", as one may "pass away".

In [None]:
# First twenty members of community 1.
community_df.loc[community_df["Community"].eq(1)].head(20)

Community 1 is less clear. Expanding the preview from 10 to 20 rows suggests it may relate to passing by something, with words such as "clear" and "make pass"; however, this interpretation is tenuous.

Potentially, the community detection algorithm parameters need tuning to split this community further.

In [None]:
# First ten members of community 2.
community_df.loc[community_df["Community"].eq(2)].head(10)

Community 2 appears to be related to time passing.

Plot the communities on the original ego network.

In [None]:
# plot network with communities highlighted
community_ids = np.unique(list(partition.values()))
n_clusters = len(community_ids)

vmin = min(partition.values())
vmax = max(partition.values())

# One discrete colour per community.
cmap = plt.get_cmap('viridis', n_clusters)


fig, ax = plt.subplots(figsize=(20,12)) #<-- Create the subplots
ax.axis("off")
# Faint backdrop: the whole radius-2 ego network.
nx.draw_networkx_nodes(ego_graph, layout, node_size=5, alpha=0.2, ax=ax)
nx.draw_networkx_edges(ego_graph, layout, alpha=0.5, ax=ax)


# Nodes that were assigned a community, coloured by community id.
# vmin/vmax are passed so node colours use the same scale as the colorbar.
nx.draw_networkx_nodes(ego_graph, layout, nodelist=list(partition.keys()), node_size=50,
                       cmap=cmap, node_color=list(partition.values()),
                       vmin=vmin, vmax=vmax, ax=ax)
# Highlight the ego node.
nx.draw_networkx_nodes(ego_graph, layout, nodelist=["pass"], node_color='r', node_size=100, ax=ax)

sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=vmin, vmax=vmax))
sm.set_array([])
# `sm` is not attached to any Axes, so the Axes to take space from must be
# given explicitly; recent matplotlib raises an error otherwise.
cbar = plt.colorbar(sm, ax=ax, ticks=community_ids)

# One label per tick (i.e. per community). The previous code produced one
# label per node, which does not match the number of ticks.
cbar.set_ticklabels(community_ids)
cbar.set_label('Community', rotation=90)