In [None]:
import json
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy

In [None]:
### load the input JSON
# Input directory and output directory. save_path is defined here but is not
# read by any of the cells shown in this notebook.
path = "./data/raw/sem_eval_task6/"
save_path = "./data/converted/sem_eval_task6/"

# Select the input file by leaving exactly one assignment uncommented.
# filename = "dev_set_task1.json"
filename = "dev_set_task2.json"
# filename = "training_set_task1.json"
# filename = "training_set_task2.json"

# The conversion cells only process entries while this flag is True.
text_only = True

# Read the file as UTF-8 text, then decode the JSON once the handle is closed.
with open(path + filename, mode='r', encoding="utf-8") as input_file:
    data = input_file.read()
structure = json.loads(data)

# Number of top-level entries in the decoded JSON.
len(structure)

In [None]:
# Load the label inventory: one category name per line of the text file.
label_path = "./data/raw/sem_eval_task6/valid_label.txt"
with open(label_path, mode='r', encoding="utf-8") as input_file:
    label_text = input_file.read()
# Strip surrounding whitespace first so a trailing newline does not yield an empty label.
categories = label_text.strip().split("\n")

In [None]:
# Load the "en_core_web_trf" spaCy pipeline; the model package must already be installed.
nlp = spacy.load("en_core_web_trf")

In [None]:
### text classification data
# Parse every entry, attach a 0/1 score per category to the Doc, and tally how
# often each label occurs.
doc_list = []
label_rows = []  # one item per (entry, label) pair; turned into a frame once, below
categories_dict = {key: 0 for key in categories}
for entry in structure:
    if not text_only:
        continue
    doc = nlp(entry["text"])
    doc.cats = {category: 0 for category in categories}

    for label in entry["labels"]:
        # Labels are either plain strings or span records that carry the
        # label under "technique" (the format the span cell reads). A record
        # is a dict and cannot be used as a dictionary key, so unwrap it.
        if isinstance(label, dict):
            label = label["technique"]
        doc.cats[label] = 1
        categories_dict[label] = categories_dict[label] + 1
        label_rows.append(label)
    doc_list.append(doc)

# Build the frame in one step instead of enlarging it row by row with .loc.
data_frame = pd.DataFrame(label_rows, columns=['class'], dtype=object)


In [None]:
### spancat data
# Turn every annotated character range into a token span and record the span's
# label and token length for the distribution plots.
# NOTE: this cell rebinds `nlp` to a blank English pipeline and replaces
# `data_frame`; cells run afterwards see these new objects.
nlp = spacy.blank("en")
span_key = "sc"
span_rows = []      # (technique, span length in tokens), one tuple per usable span
skipped_spans = 0   # annotations whose offsets could not be mapped to tokens
for entry in structure:
    if not text_only:
        continue
    doc = nlp(entry["text"])
    span_list = []
    for label in entry["labels"]:
        # char_span returns None when the offsets do not line up with token
        # boundaries. "expand" snaps such offsets outward to the enclosing
        # tokens; exactly aligned offsets give the same span as before.
        span = doc.char_span(label["start"], label["end"], label["technique"],
                             alignment_mode="expand")
        if span is None:
            # Still unmappable: skip it rather than crash on len(None) or on
            # the Doc.spans assignment.
            skipped_spans += 1
            continue
        span_list.append(span)
        span_rows.append((label["technique"], len(span)))

    doc.spans[span_key] = span_list

if skipped_spans:
    print("skipped", skipped_spans, "span(s) that could not be aligned to tokens")

# Build the frame in one step instead of enlarging it row by row with .loc.
data_frame = pd.DataFrame(span_rows, columns=['class', 'size'])




In [None]:
# Display whichever frame the most recently executed data cell bound to data_frame.
data_frame

In [None]:
# Distinct values of the 'size' column; that column is only created by the span data cell.
np.unique(data_frame['size'])

In [None]:
%matplotlib inline
f, ax = plt.subplots(figsize=(15, 7))
g=sns.barplot(data=data_frame.groupby('class').sum(), x='size', y='class', hue='class', legend=False)
ax = g
for c in ax.containers:
    labels = [f'{v.get_width():.0f}' for v in c]
    ax.bar_label(c, labels=labels, label_type='edge')
# plt.xticks(rotation = 75)
plt.title("Span Class distribution (total token)")
plt.show()


In [None]:
%matplotlib inline
# Histogram of the 'size' column (span length as recorded by the span data cell).
g=sns.histplot(data=data_frame, x ="size", legend=False)

plt.title("Span size distribution")
plt.show()

In [None]:
# Display the rows ordered by class label; the result is not assigned, so data_frame keeps its order.
data_frame.sort_values('class')

In [None]:
%matplotlib inline
g=sns.catplot(data=data_frame.sort_values('class'), y="class", kind="count", hue='class', legend=False)
g.fig.set_size_inches(65,5)
ax = g.facet_axis(0, 0)
for c in ax.containers:
    labels = [f'{v.get_width()}' for v in c]
    ax.bar_label(c, labels=labels, label_type='edge')
plt.title("Span Class distribution (total span)")
plt.show()

In [None]:
# Per-label tallies followed by their grand total. A notebook only echoes the
# last expression of a cell, so the dictionary has to be printed explicitly.
print(categories_dict)
sum(categories_dict.values())

In [None]:
# All tokens of the first parsed document.
print(list(doc_list[0]))
# Lemmas of the same document, leaving out stop words, punctuation and whitespace-only tokens.
[
    token.lemma_
    for token in doc_list[0]
    if not (token.is_stop or token.is_punct or token.text.isspace())
]

### Distribution check

In [None]:
%matplotlib inline
g=sns.catplot(data=data_frame, y="class", kind="count", hue='class', legend=False)
g.fig.set_size_inches(65,5)
ax = g.facet_axis(0, 0)
for c in ax.containers:
    labels = [f'{v.get_width()}' for v in c]
    ax.bar_label(c, labels=labels, label_type='edge')
plt.title("Class distribution")
plt.show()

In [None]:
def _is_content_token(token):
    """Return True unless the token is a stop word, punctuation or whitespace-only."""
    return not (token.is_stop or token.is_punct or token.text.isspace())


# NOTE(review): data_frame_2 is created empty and is not filled anywhere in this cell.
data_frame_2 = pd.DataFrame(columns=["text", "word_count", "sentiment_direction", "sentiment_scale", "date"])
word_corpus = []     # original text of every entry
token_corpus = []    # per-document lemma lists (what the vectorizer cell consumes)
token_corpus_2 = []  # per-document surface-form lists
counter = Counter()  # corpus-wide frequency of surface forms
for i, doc in enumerate(doc_list):
    # Filter the document once and derive every per-token view from that
    # single list, instead of re-running the same filter for each attribute.
    content_tokens = [token for token in doc if _is_content_token(token)]

    words = [token.text for token in content_tokens]
    counter.update(words)
    lemma_list = [token.lemma_ for token in content_tokens]

    # The lists below are rebuilt for every document and are not read in this
    # cell; after the loop they hold the values of the last document only.
    vecs = [token.has_vector for token in content_tokens]
    pos_list = [token.pos_ for token in content_tokens]
    tag_list = [token.tag_ for token in content_tokens]
    vocab_list = [token.vocab for token in content_tokens]
    ent_list = [token.ent_type_ for token in content_tokens]
    ### data frame

    # Relies on doc_list[i] having been built from structure[i].
    word_corpus.append(structure[i]["text"])
    token_corpus.append(lemma_list)
    token_corpus_2.append(words)
# Five most frequent surface forms across the corpus.
counter.most_common(5)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# token_corpus already holds token lists, so tokenizer and preprocessor are
# identity functions. token_pattern=None states that the default regex is
# intentionally unused, which avoids scikit-learn's
# "token_pattern will not be used since tokenizer is not None" warning.
vectorizer = TfidfVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x, token_pattern=None)
X = vectorizer.fit_transform(token_corpus) ### use lemma

names = vectorizer.get_feature_names_out()
print(names)
print(X.shape) # (688, 2784)
# X holds TF-IDF weights, not raw counts. The frame keeps the name
# count_matrix because the following cells refer to it by that name.
count_matrix = pd.DataFrame(X.toarray(), columns = names)
### next, get the correlation matrix and plot the network
print(len(count_matrix), " rows")
count_matrix.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx

In [None]:
# Pairwise correlation between all columns of count_matrix (DataFrame.corr
# uses Pearson correlation unless told otherwise).
corr = count_matrix.corr()


In [None]:
# Flatten the square correlation matrix into one row per (row label, column label) pair.
links = corr.stack().reset_index().set_axis(['var1', 'var2', 'value'], axis=1)
print("all links shape: ", links.shape)


In [None]:
# How many pairs survive increasingly strict correlation cut-offs,
# ending with the pairs whose correlation is exactly 1.
print(links.shape)
for cutoff in (0.1, 0.2, 0.5, 0.75, 0.9, 0.95, 0.99):
    print(links.loc[links['value'] > cutoff].shape)
print(links.loc[links['value'] == 1].shape)

In [None]:
import numpy as np

In [None]:
# Keep pairs whose correlation exceeds the threshold, dropping each word's pairing with itself.
threshold = 0.8
is_strong = links['value'] > threshold
is_distinct_pair = links['var1'] != links['var2']
links_filtered = links.loc[is_strong & is_distinct_pair]
print("links_filtered shape: ", links_filtered.shape)

In [None]:
# Histogram (100 bins) of the filtered correlation values; rows whose value is
# exactly 1 are left out of the plot.
print("correlation distribution")
sns.displot(links_filtered.loc[links_filtered['value'] != 1], x='value', bins=100)

#### Making correlation network

In [None]:
def get_word_list(count_matrix: pd.DataFrame, print_output=False, k: int = 15):
    '''
        Map every word (column) to the rows in which it occurs.

        input:
            count_matrix: dataframe with one column per word and one row per document;
                a word counts as present in a row when its cell value is > 0
            print_output: boolean option indicating whether to draw a bar chart of the
                k words that occur in the largest number of rows
            k: number of words shown in that bar chart (default 15)
        output: a dictionary with key = word, value = list of index labels of the rows
            where that word shows up
        ex. "bless": [35,47,49,87] means that word "bless" shows up on rows 35,47,49,87
        note: the values are the dataframe's own index labels; no offset is added
    '''
    count_dict = {
        word: count_matrix.index[count_matrix[word] > 0].tolist()
        for word in count_matrix.columns.values
    }
    if print_output:
        # Number of distinct rows per word: a word used 10 times across
        # 4 different rows counts as 4.
        unique_count_dict = {word: len(rows) for word, rows in count_dict.items()}
        df = pd.DataFrame.from_dict(unique_count_dict, orient='index')
        df.columns = ['unique_count']
        ax = df.nlargest(k, 'unique_count').plot.bar(title=f"top {k} most frequent word", rot=30)
        # Write each bar's height at the bar's top-left corner.
        for p in ax.patches:
            ax.annotate(str(p.get_height()), xy=(p.get_x(), p.get_height()))
        #plt.show()

    return count_dict

In [None]:
# Build the word -> row-index lookup; print_output=True also draws the bar chart of the top words.
count_dict = get_word_list(count_matrix, print_output=True)

In [None]:
deg_as_size = False
plot_type = 'spectral'

print("pair_corr_plot: making network")
# One node per word, one edge per row of links_filtered; the correlation is
# stored on each edge under 'value'.
G = nx.from_pandas_edgelist(links_filtered, 'var1', 'var2', edge_attr='value')
def assign_color(correlation):
    """Return the edge colour (hex string) for a correlation value.

    Non-positive values, a value of exactly 1, and everything else each get
    their own colour.
    """
    if correlation <= 0:
        return "#ffa09b"
    elif correlation == 1:
        return "#00e541"
    else:
        return "#9eccb7"

def assign_thickness(correlation, benchmark_thickness=6, scaling_factor=2):
    """Return the edge width: benchmark_thickness * |correlation| ** scaling_factor."""
    return benchmark_thickness * abs(correlation)**scaling_factor

# Edge styling, in the graph's own edge order.
edge_values = list(nx.get_edge_attributes(G, 'value').values())
edge_color = [assign_color(value) for value in edge_values]
edge_width = [assign_thickness(value) for value in edge_values]

word_dict = get_word_list(count_matrix)
print("pair_corr_plot: adjusting node size...")
if deg_as_size:
    print("using degree as node size...")
    ### node size grows linearly with the degree of the word
    scaling_factor = 50
    node_size = [degree * scaling_factor for _, degree in G.degree]
else:
    # Validate plot_type once, before sizing any node, and raise a normal
    # exception: exit(-1) would end the interpreter session instead of
    # reporting the problem.
    scaling_by_plot_type = {'spectral': 100, 'circular': 8}
    if plot_type not in scaling_by_plot_type:
        raise ValueError(f"plot type is not recognized. received {plot_type}")
    scaling_factor = scaling_by_plot_type[plot_type]
    ### node size grows with the square of the number of rows containing the word
    node_size = [2*(len(word_dict[word])**2) * scaling_factor for word in G.nodes]

### trying to draw with networkx
# if plot_type == 'spectral':
#     plt.figure(figsize=(70,70))
#     pos = nx.spring_layout(G, k=0.18, iterations=30)
# elif plot_type == 'circular':
#     plt.figure(figsize=(12,8))
#     pos = nx.circular_layout(G)
# else:
#     print("plot type is not recognized. received ", plot_type)
#     exit(-1)
# print("pair_corr_plot: drawing network")
# nx.draw(G, with_labels=True, pos = pos,node_size=node_size, linewidths=0.5, font_size=15, edge_color=edge_color, width=edge_width)

In [None]:
# Draw the network for several canvas sizes and spring-layout spacings (k)
# and save each variant as a PNG.
output_dir = "plot"
os.makedirs(output_dir, exist_ok=True)  # savefig does not create a missing directory
layout_seed = 42  # fixed seed so repeated runs produce the same layouts

for size in [70,100,150]:
# for size in [10]:
    # for k in [0.15, 0.18, 0.2, 0.22, 0.25]:
    for k in [0.05,0.08,0.1,0.12]:
        out_file = "%s/plot_%.3f_in_%dx%d.png" % (output_dir, k, size, size)
        print("pair_corr_plot: making " + out_file)
        fig = plt.figure(figsize=(size,size))
        pos = nx.spring_layout(G, k=k, iterations=30, seed=layout_seed)
        nx.draw(G, with_labels=True, pos = pos,node_size=node_size, linewidths=0.5, font_size=15, edge_color=edge_color, width=edge_width)
        fig.savefig(out_file, dpi=10)
        # Close this specific figure so the large canvases do not pile up in memory.
        plt.close(fig)
        # plt.savefig("plot/plot_%.3f_in_%dx%d.png" % (k, size, size), dpi=100)