### <center>Co-Occurrence Network</center>

+  In this notebook we build the co-occurrence network of the data and define helpers to plot and inspect it

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from warnings import filterwarnings
from collections import Counter
from pylab import rcParams
filterwarnings(action='ignore', category=DeprecationWarning)
from utils.utils import *

In [2]:
# Importing another notebook
import import_ipynb
import lda_topic_modelling as lda

# Networkx
import networkx as nx

import operator
import seaborn as sns
import nltk
from functools import reduce

rcParams['figure.figsize'] = 15, 15

importing Jupyter notebook from lda_topic_modelling.ipynb


  from imp import reload
  from scipy.linalg.special_matrices import triu
  from .autonotebook import tqdm as notebook_tqdm


Creating a Co-Occurrence Network and plotting it

In [4]:
class HOCNetwork:
    """Build, inspect and plot word co-occurrence networks.

    The class groups helpers that:

    * plot the most frequent words of a word list,
    * build a ``networkx`` graph linking consecutive words,
    * look up and plot the neighbourhood of a word in that graph,
    * rank nodes by degree / closeness centrality, and
    * score the overlap between two neighbour lists.

    Every plotting method saves a PNG under ``base_path`` and then shows the
    figure.
    """

    # Prefix (relative to the working directory) prepended to every saved figure name.
    base_path = r'./assets/images/'

    @staticmethod
    def _save_figure(fig, name, **savefig_kwargs):
        """Save ``fig`` to ``base_path + name + '.png'``, creating the folder first.

        fig - matplotlib figure to write
        name - file name without extension
        savefig_kwargs - forwarded unchanged to ``fig.savefig``
        """
        # Function-scope import so the helper does not rely on the notebook's import cell.
        import os

        target = HOCNetwork.base_path + name + '.png'
        folder = os.path.dirname(target)
        if folder:
            # Without this, savefig fails when the output folder is missing.
            os.makedirs(folder, exist_ok=True)
        fig.savefig(target, **savefig_kwargs)

    def get_word_frequency(self, words, n, name):
        """ Plot the counts of the ``n`` most common words as a bar chart with a log y-axis,
            save it as ``<base_path><name>.png`` and show it.
            words - list of words
            n - number of top common words to plot
            name - used for the plot title, the x-axis label and the saved file name"""

        # Counts of the n most common words, indexed by word
        top_counts = pd.Series(dict(nltk.FreqDist(words).most_common(n)))

        # Explicit figure/axes so nothing depends on pyplot's "current figure" state
        fig, ax = plt.subplots(figsize=(20, 10))
        sns.barplot(x=top_counts.index, y=top_counts.values, ax=ax, palette="YlOrBr_r")
        ax.set_yscale('log')
        plt.xticks(rotation=45)
        ax.set_title(f"Word Frequency of {name}", fontsize=35)
        ax.set_xlabel(name, fontsize=16)
        ax.set_ylabel('Counts', fontsize=16)
        HOCNetwork._save_figure(fig, name)
        plt.show()

    def co_occurence_network(self, list):
        """ Build an undirected co-occurrence network in which every pair of consecutive
            words inside a sub-list is joined by an edge, and return it.
            list - list of word lists (one sub-list per document); the name shadows the
                   builtin but is kept so existing keyword callers keep working"""

        G = nx.Graph()
        for sublist in list:
            # Pair each word with its successor; sub-lists shorter than 2 contribute nothing.
            # nx.Graph keeps a single unweighted edge per pair, so repeats are not counted.
            G.add_edges_from(zip(sublist, sublist[1:]))
        return G

    def get_neighbouring_words(self, word, G):
        """ Return the words adjacent to ``word`` in the network, followed by ``word`` itself.
            word - word to look up (must be a node of G; networkx raises otherwise)
            G - Network """

        return [*G.neighbors(word), word]

    def plot_neighbhour_words(self, word, G, neighbour_words):
        """ Plot the sub-network induced by ``neighbour_words``, save it as
            ``<base_path><word>.png`` and show it.
            word - word used in the title and the saved file name
            G - Network
            neighbour_words - nodes to keep in the plotted sub-network """

        H = G.subgraph(neighbour_words)
        fig, ax = plt.subplots(figsize=(20, 10))
        ax.set_title(f"Co-Occurence Network of {word} word", fontsize=35)
        nx.draw(H, ax=ax, with_labels=True, font_size=20, node_size=400, node_color='orange')
        HOCNetwork._save_figure(fig, word, dpi=300, bbox_inches='tight')
        plt.show()

    def deg_closness_word(self, G):
        """ Compute degree and closeness centrality for every node and return them as a
            dataframe (index = node, columns "Degree" and "Closeness") sorted by
            descending closeness.
            G - Network """

        degree_scores = nx.degree_centrality(G)
        closeness_scores = nx.closeness_centrality(G)

        # One row per measure -> transpose so that each node becomes a row
        centrality = pd.DataFrame([degree_scores, closeness_scores]).transpose()
        centrality.columns = ["Degree", "Closeness"]

        # Highest closeness first; returns a new frame instead of sorting in place
        return centrality.sort_values(by=['Closeness'], ascending=False)

    def get_reduced_network(self, data_lemmatized, min_count=2):
        """ Keep, per document, only the distinct words that occur at least ``min_count``
            times (in order of first appearance), build the co-occurrence network from
            those reduced lists, print its size and return it.
            data_lemmatized - list of lemmatized word lists (one per document)
            min_count - minimum occurrences within a document for a word to be kept;
                        the default of 2 drops words that appear only once"""

        # Each kept word appears once per document, so edges link consecutive *kept* words
        reduced_list = [
            [token for token, count in Counter(sublist).items() if count >= min_count]
            for sublist in data_lemmatized
        ]

        # Creating co-occurence network
        G = self.co_occurence_network(reduced_list)
        print(f"No of nodes {G.number_of_nodes()}")
        print(f"No of edges {G.number_of_edges()}")
        return G

    def plot_co_occurence_network(self, G, title_text):
        """ Plot the whole co-occurence network, save it as ``<base_path><title_text>.png``
            and show it.
            G - network
            title_text - title of the plot, also used as the saved file name """

        fig, ax = plt.subplots(figsize=(20, 14))
        ax.set_title(title_text, fontsize=25)
        nx.draw(G, ax=ax, with_labels=True, font_size=10, node_size=300, node_color='orange')
        HOCNetwork._save_figure(fig, title_text, dpi=300, bbox_inches='tight')
        plt.show()

    def check_similarity(self, word_previous, word_current):
        """ Return the percentage (0-100) of entries in ``word_previous`` that also appear
            in ``word_current``. Duplicates in ``word_previous`` are each counted.
            An empty ``word_previous`` yields 0.0 instead of a division by zero.
            word_previous - neighbouring words in previous year
            word_current - neighbouring words in current year """

        if len(word_previous) == 0:
            # Nothing to compare against: define the overlap as 0 %
            return 0.0

        # Set lookup avoids rescanning word_current for every previous word
        current_lookup = set(word_current)
        shared = sum(1 for item in word_previous if item in current_lookup)
        return shared / len(word_previous) * 100
