# Mood Metatagging Script

A script that extracts mood metatags for a given list of movies. It identifies all adjectives in blocks of text taken from scraped HTML files and applies an iterative synonym-matching process based on manually coded word vectors. The HTML files in question are stored on Google Cloud Platform.

In [None]:
import gcp_access
from bs4 import BeautifulSoup
import re
import spacy
import pandas as pd
from sklearn.metrics import pairwise_distances

In [None]:
# Load the spaCy pipeline used for POS tagging and word similarity below.
# The model must be installed once before it can be loaded, e.g. from a notebook cell:
#!python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")

In [None]:
class mood_tag:
    """Extract mood tags for movies from scraped HTML pages.

    Adjectives (plus any coded genre keywords) are pulled from the ``<p>``
    text of each page and expanded into mood tags through a cosine
    similarity matrix built from a manually coded word-vector table.
    """

    # Neighbours with at least this cosine similarity count as synonyms.
    SYNONYM_THRESHOLD = 0.50
    # If fewer than this many words share an identical coding, fall back to
    # the looser SYNONYM_THRESHOLD neighbourhood.
    MIN_EXACT_MATCHES = 10
    # Minimum spaCy similarity for an uncoded word to inherit the coding of
    # its closest mood word.  The original cut-off was 4.0, which a cosine
    # similarity (bounded by 1) can never exceed, so the branch was dead code.
    NEW_WORD_THRESHOLD = 0.40
    # Tolerance when testing for "similarity == 1": identical vectors can come
    # out as 0.9999999999999998 after floating-point rounding.
    EXACT_MATCH_TOL = 1e-9

    def __init__(self, imdb_id, source, coded_word_list, irrelevant_word_list):
        """
        Inputs:
        - imdb_id: a list of imdb id's in string format
        - source: a list of web sources to extract tags from, such as 'wikipedia' or 'fandom'
        - coded_word_list: manually coded word list in csv format, with the second column named 'keyword'
          (the first column is used as the row index, i.e. the words themselves)
        - irrelevant_word_list: csv file whose first column lists words excluded from the tagging process
        """
        # get keyword and mood words from coded_word_list
        self.df = pd.read_csv(coded_word_list, header=0, index_col=0)
        is_keyword = self.df['keyword']
        self.keyword = self.df.index[is_keyword == 1].tolist()
        self.mood = sorted(self.df.index[is_keyword == 0].tolist())

        # spaCy docs of every mood word, computed once for find_knn
        self.pretrained_vectors = {word: nlp(word) for word in self.mood}

        # get list of irrelevant words
        self.irrelevant_words = pd.read_csv(irrelevant_word_list, header=None)[0].tolist()

        # initialize imdb_id and source
        self.imdb_id = imdb_id
        self.source = source

        # initialize similarity matrix
        self.sim = self.build_sim()

        # initialize gcp_access
        self.gcp = gcp_access.gcp()

    def build_sim(self):
        """Build a cosine similarity matrix based on coded word vectors.

        Returns a square DataFrame whose rows and columns are both labelled
        with the words of the coded word list.
        """
        features = self.df[self.df.columns.difference(['keyword'])].values
        similarity = 1 - pairwise_distances(features, metric="cosine")
        # One row/column per coded word; labelling with the full index keeps
        # the labels aligned with the rows of ``features``.
        return pd.DataFrame(similarity, index=self.df.index, columns=self.df.index)

    def process_text(self, imdb_id, filepath):
        """Download and process text from GCP.

        Inputs:
        - imdb_id: id of the movie being processed (currently unused)
        - filepath: file path(s) handed to ``gcp.read_html_by_filepath``

        Returns a sorted list of unique lower-cased adjectives and keyword
        matches, or 0 if the pages could not be read or yielded no words.
        """
        try:
            html = ' '.join(self.gcp.read_html_by_filepath(filepath))
            soup = BeautifulSoup(html, "html.parser")
            text = ' '.join(content.text for content in soup.find_all('p'))
        except Exception:
            # Best effort: an unreadable page just means "no words" for this
            # movie.  KeyboardInterrupt/SystemExit are deliberately not caught.
            return 0

        # extract adjectives, skipping the excluded words (set for O(1) lookup)
        irrelevant = set(self.irrelevant_words)
        adj = []
        for token in nlp(text):
            lowered = token.text.lower()
            if token.pos_ == 'ADJ' and lowered not in irrelevant:
                adj.append(lowered)

        # search for genre "key" words; an empty pattern would match at every
        # position, so only search when there is at least one keyword
        noun = []
        if self.keyword:
            pattern = '|'.join(re.escape(str(key)) for key in self.keyword)
            noun = re.findall(pattern, text.lower())

        # sorted so the result is reproducible between runs
        word_list = sorted(set(adj + noun))
        return word_list if word_list else 0

    def extract_adj(self, output_filepath):
        """Find final list of tags for each movie and append it to a csv file.

        Each movie with at least one extracted word gets one line of the form
        ``imdb_id,tag1|tag2|...``; movies without words are reported on stdout.
        """
        master = self.gcp.read_master_csv()
        # use the ids and sources this instance was configured with
        wanted = master['imdb_id'].isin(self.imdb_id) & master['source'].isin(self.source)
        pages = master.loc[wanted, ['imdb_id', 'filepath']]

        for movie_id in pages['imdb_id'].unique().tolist():
            filepaths = pages.loc[pages['imdb_id'] == movie_id, 'filepath'].tolist()
            words = self.process_text(movie_id, filepaths)
            if not words:
                print('no adj for ' + movie_id)
                continue

            tags = sorted({tag for word in words for tag in self.find_knn(word)})
            # append per movie so that earlier results are already on disk if
            # a later movie fails
            with open(output_filepath, "a") as file:
                file.write(movie_id + "," + '|'.join(tags) + "\n")

    def find_knn(self, word):
        """Find relevant synonyms based on manually coded word vectors and the cosine similarity matrix.

        For a coded word, returns the words with an identical coding when
        there are at least MIN_EXACT_MATCHES of them, otherwise every word
        with similarity >= SYNONYM_THRESHOLD.  For an uncoded word, falls back
        to the closest mood word according to spaCy (see _match_unknown_word).
        """
        try:
            scores = self.sim[word]
        except KeyError:
            # word has not been manually coded
            return self._match_unknown_word(word)

        matches = scores.index[scores >= 1 - self.EXACT_MATCH_TOL].tolist()
        if len(matches) < self.MIN_EXACT_MATCHES:
            # take neighbors with cosine similarity >= threshold
            return scores.index[scores >= self.SYNONYM_THRESHOLD].tolist()
        return matches

    def _match_unknown_word(self, word):
        """Map an uncoded word onto its most similar mood word, if close enough."""
        if not self.mood:
            return []

        vector = nlp(word)
        similarities = [vector.similarity(self.pretrained_vectors[mood]) for mood in self.mood]
        top_sim_value = max(similarities)
        top_sim = self.mood[similarities.index(top_sim_value)]
        if top_sim_value <= self.NEW_WORD_THRESHOLD:
            return []

        # keep record of new words (words that haven't been manually coded)
        # and the coding they inherit; the module-level ``new_word`` dict is
        # created on demand so the class also works when imported
        globals().setdefault('new_word', {})[word] = self.df.loc[top_sim].values.tolist()

        if top_sim not in self.sim.columns:
            # guard against endless recursion on a mood word missing from sim
            return [word]
        return [word] + self.find_knn(top_sim)

In [None]:
if __name__ == '__main__':
    # dictionary of uncoded words and the coding they inherit; mood_tag.find_knn
    # writes to this module-level name
    new_word = dict()

    # specify input parameters (fill in the None placeholders before running)
    imdb_id = None                # list of imdb ids in string format
    source = ['fandom', 'wikipedia']
    coded_word_list = None        # path to the manually coded word list csv
    irrelevant_word_list = None   # path to the csv of words to exclude
    output_filepath = None        # csv file the tags are appended to

    # instantiate a mood_tag object; the constructor requires all four inputs
    a = mood_tag(imdb_id, source, coded_word_list, irrelevant_word_list)

    # write mood tags for each movie to a csv file
    a.extract_adj(output_filepath)