### <center>Semantic Similarity of Speeches</center>

+ In this file we measure the semantic similarity of speeches using document embeddings from the Cr5 model

In [1]:
import pandas as pd
import numpy as np
import operator
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from warnings import filterwarnings
from pylab import rcParams
# Suppress DeprecationWarning and FutureWarning messages so they do not
# clutter the notebook output.
filterwarnings(action='ignore', category=DeprecationWarning)
filterwarnings(action='ignore', category=FutureWarning)
# NOTE(review): wildcard import; get_speech_by_heading, used by
# HOCSemanticSimilarity below, presumably comes from here — confirm.
from utils.utils import *

# Default figure size (width, height) for plots created after this point.
rcParams['figure.figsize'] = 15, 10

%matplotlib inline

In [2]:
import nltk

import sys
sys.path.append("./models/Cr5-master/src")

from gensim.utils import tokenize
from cr5 import Cr5_Model
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity



In [4]:
class HOCSemanticSimilarity:
    """
         A class used to measure the semantic similarity between speeches of two years
         (2021 and 2020). Speeches are embedded with the Cr5 document-embedding model and,
         for every heading present in both years, the pairwise cosine similarity between
         the 2021 and the 2020 speech embeddings is computed and can be plotted.

         base_path - directory prefix under which the plots are saved
    """

    base_path = r'./assets/images/'

    def __init__(self, df_head_dt_gp_21, df_head_dt_gp_20):
        """ Store the two dataframes that are compared
            df_head_dt_gp_21 - Dataframe of the 2021 speeches (passed to get_speech_by_heading)
            df_head_dt_gp_20 - Dataframe of the 2020 speeches (passed to get_speech_by_heading)
        """
        self.df_head_dt_gp_21 = df_head_dt_gp_21
        self.df_head_dt_gp_20 = df_head_dt_gp_20

    def get_model(self, path):
        """ The function loads and returns the Cr5 model
            path - path to the model

            Returns the Cr5_Model instance with the English ('en') language loaded.
        """
        # NOTE(review): 'joint_28' is presumably the name of the pretrained
        # Cr5 model variant found under `path` — confirm against the Cr5 repo.
        model = Cr5_Model(path, 'joint_28')
        # Only English embeddings are needed
        model.load_langs(['en'])

        return model

    def get_cr5_embeddings(self, df, model):
        """ The function returns the Cr5 embeddings of the speeches in a dataframe
            df - Dataframe with a `speech_processed` column holding the speech texts
            model - Cr5 model (as returned by get_model)

            Returns a list with one document embedding per row of df, in row order.
        """
        # Tokenize every speech, then embed each token list as an English document
        tokenized_speeches = [list(tokenize(speech)) for speech in df.speech_processed]

        return [model.get_document_embedding(tokens, 'en') for tokens in tokenized_speeches]

    def get_similarity_data(self, common_heading, model):
        """ The function calculates the pairwise cosine similarity of Cr5 embeddings for all the common headings
            and returns the similarity data along with the median value of similarity score for each heading
            common_heading - List of common headings
            model - Cr5 model

            Returns a tuple (similarities, median_dict):
            similarities - one entry per heading (same order as common_heading); each entry is a
                           list with one array per 2020 speech holding its similarity to every
                           2021 speech of that heading
            median_dict  - maps each heading to the median over all of its pairwise scores
        """
        similarities = []
        median_dict = {}
        for heading in common_heading:
            # Get the speeches with the common heading from both dataframes
            df_common_21 = get_speech_by_heading(self.df_head_dt_gp_21, heading)
            df_common_20 = get_speech_by_heading(self.df_head_dt_gp_20, heading)

            # Get the Cr5 embeddings for the speeches
            embeddings_21 = self.get_cr5_embeddings(df_common_21, model)
            embeddings_20 = self.get_cr5_embeddings(df_common_20, model)

            # Rows of the similarity matrix are 2021 speeches, columns are 2020 speeches
            similarity_matrix = cosine_similarity(embeddings_21, embeddings_20)
            # Transpose so that each item groups the scores of one 2020 speech
            per_speech_20 = list(similarity_matrix.T)

            # np.median without an axis flattens its input, so this is the
            # median over every (2021, 2020) pair of the heading
            median_dict[heading] = np.median(similarity_matrix)
            similarities.append(per_speech_20)

        return similarities, median_dict

    def create_df(self, data, common_heading):
        """ The function creates a dataframe of the similarity score and returns the dataframe
            data - Similarity Score data, one row per heading with a single value each
            common_heading - List of common headings

            Returns a dataframe indexed by heading with the columns
            "Similarity Score" and "Heading".
        """
        df = pd.DataFrame(data=data, columns=["Similarity Score"], index=common_heading)
        # Keep the heading as a regular column as well as the index
        df['Heading'] = common_heading

        return df

    def plot_similarity_data(self, df, semantic_dict, name):
        """ The function plots a Violin graph for the top or the least semantically similar headings,
            saves it as `base_path + name + '.png'` and shows it
            df - Dataframe with a "Similarity Score" column; each entry is drawn as one violin
            semantic_dict - Dictionary of semantic similarity; its keys are used as x tick labels
            name - file name (without extension); 'least_semantic_change' selects the "Least" title,
                   any other value the "Top" title
        """
        # This changes the global default figure size, so it also applies to
        # figures created after this call.
        rcParams['figure.figsize'] = 8, 6

        fig = plt.figure()
        ax = fig.add_subplot(111)

        # Plot the violin plot on the axes created above
        sns.violinplot(data=list(df['Similarity Score'].values), ax=ax)
        ax.set_xticklabels(list(semantic_dict))

        # Add xticks and labels
        plt.xticks(rotation=20, ha='center')
        plt.xlabel("Speeches in 2020",  fontsize=15)
        plt.ylabel("Similarity Score with Speeches in 2021", fontsize=15)

        text = "Least" if name == 'least_semantic_change' else "Top"
        plt.title(f"{text} Similar Speeches in 2020 and 2021", fontsize=25)
        # Save the plot
        plt.savefig(HOCSemanticSimilarity.base_path+name+'.png')
        plt.show()

