### imports

In [1]:
from preprocessing_functions import *
from word_embedding_functions import *

import pandas as pd
import numpy as np
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')

- Clean and preprocess (lemmatise) a list of documents (e.g. paragraphs).
- Get your topics through unsupervised clustering (LDA topic modelling).
- Use the resulting topic words as input for the word-embedding algorithm below.

# Topic Words

In [209]:
# Seed words per topic. Removing words with a double meaning helps
# (e.g. 'bear' and 'degree' were replaced in the education list).

# Disabled topic:
# fashion_words = ['fashion', 'model', 'vogue', 'store', 'designer', 'couture', 'catwalk', 'runway', 'modeling', 'clothing', 'cosmetic', 'brand', 'retail', 'advertising', 'perfume']

diplomacy_words = [
    'War', 'Embassy', 'Army', 'Diplomatic', 'Ambassador',
    'Treaty', 'Protest', 'Force', 'Mission', 'Arrest',
    'Government', 'Police', 'Attack', 'Party', 'Minister',
]
entertainment_words = [
    'Opera', 'Festival', 'Perform', 'Orchestra', 'Symphony',
    'Concert', 'Music', 'Film', 'Sing', 'Theatre',
    'Performance', 'Role', 'Premiere', 'Tour', 'Band',
]
art_words = [
    'Exhibition', 'Art', 'Museum', 'Gallery', 'Exhibit',
    'Painting', 'Collection', 'Paint', 'Portrait', 'Artist',
    'Sculpture', 'Fashion', 'Design', 'Contemporary', 'Painter',
]

# Disabled topic ('bear' was replaced with 'Educate' and 'degree' with 'Lecture'):
# education_words = ['Study', 'School', 'Professor', 'University', 'Graduate', 'Educate', 'Lecture', 'Research', 'College', 'Teach', 'Science', 'Education', 'Philosophy', 'Doctorate', 'Faculty']

transportation_words = [
    'Railway', 'Route', 'Line', 'Operate', 'Flight',
    'Station', 'Service', 'Airline', 'Airport', 'Train',
    'Passenger', 'Speed', 'Aircraft', 'Rail', 'Network',
]
sport_words = [
    'Final', 'Win', 'Team', 'Match', 'Game',
    'Goal', 'Club', 'League', 'Champion', 'Championship',
    'Season', 'Score', 'Round', 'Tournament', 'Football',
]

# Topic name -> seed words; only the active topics are listed.
lda_topic_words = dict(
    # fashion=fashion_words,
    diplomacy=diplomacy_words,
    entertainment=entertainment_words,
    art=art_words,
    # education=education_words,
    transportation=transportation_words,
    sport=sport_words,
)

### 1. Loading GloVe Word Embedding

In [48]:
def load_glove_word_embeddings(GLOVE_PATH='.../.../.../.../../glove.42B.300d.txt', embedding_dim=300):
    """
    --> Function that loads glove word embeddings.

        Parameters:
        -----------
            GLOVE_PATH: Str -> Path to the GloVe file
            embedding_dim: Int (default = 300) -> Expected number of values per vector.

        Returns:
        --------
            embeddings_dict: Dict -> token -> float32 vector of length `embedding_dim`.
            discarded_dict: Dict -> token -> vector with an unexpected length, or None
                                    when the values could not be parsed as floats.

        Raises:
        -------
            FileNotFoundError -> when GLOVE_PATH does not exist.

    """
    # Local import: the notebook only receives `os` implicitly through its star-imports.
    import os

    if not os.path.exists(GLOVE_PATH):
        raise FileNotFoundError("The given PATH to the GloVe file doesn't exist.")

    # Use the notebook's progress bar when one is in scope; otherwise iterate silently.
    try:
        progress_bar = tqdm
    except NameError:
        progress_bar = None

    embeddings_dict = {}
    discarded_dict = {}
    print('This will take approximately ~ 4 minutes...')

    # First pass only counts the lines for the progress bar; `with` closes the handle again.
    with open(GLOVE_PATH, 'r', encoding="utf-8") as f:
        num_lines = sum(1 for _ in f)

    with open(GLOVE_PATH, 'r', encoding="utf-8") as f:
        lines = f if progress_bar is None else progress_bar(f, total=num_lines)
        for line in lines:
            # Split from the right so that a token which itself contains whitespace
            # stays in one piece instead of leaking into the vector.
            fields = line.rstrip().rsplit(' ', embedding_dim)
            token = fields[0]
            if not token:
                # Blank line: nothing to load.
                continue

            try:
                vector = np.asarray(fields[1:], "float32")
            except ValueError:
                # At least one value is not a number.
                discarded_dict[token] = None
                continue

            if vector.shape[0] == embedding_dim:
                embeddings_dict[token] = vector
            else:
                discarded_dict[token] = vector

    return embeddings_dict, discarded_dict

In [75]:
%%time

# Load the vectors from the glove.840B.300d file (overrides the function's default path).
# `discarded_dict` collects the tokens whose line could not be turned into a 300-d vector.
embeddings_dict, discarded_dict = load_glove_word_embeddings(GLOVE_PATH="../../../../../glove.840B.300d.txt")

This will take approximately ~ 4 minutes...


  0%|          | 0/2196017 [00:00<?, ?it/s]

CPU times: total: 3min 26s
Wall time: 4min 10s


### 2. Get Mean of the words from each Topic

In [218]:
# Represent every topic by the mean of the embeddings of its seed words.
mean_vectors_dict = {}
for topic, topic_words in lda_topic_words.items():
    # Only words that are in the embedding vocabulary have a vector to average.
    known_vectors = [embeddings_dict[word] for word in topic_words if word in embeddings_dict]
    if not known_vectors:
        # np.mean of an empty list would silently give NaN (warnings are filtered in this notebook).
        raise ValueError(f"None of the words of topic '{topic}' are in the embedding vocabulary.")
    mean_vectors_dict[topic] = np.mean(known_vectors, axis=0)

#### Show closest words to topic vector

In [53]:
from scipy import spatial
from preprocessing_functions import *

def find_closest_embeddings(embedding, cutoff=None):
    """
    --> Function that ranks the vocabulary of `embeddings_dict` by Euclidean distance to a vector.

        Parameters:
        -----------
            embedding: Array -> Reference vector the tokens are compared with.
            cutoff: Int or None (default = None) -> Number of closest tokens to return;
                    None returns the whole ranked vocabulary. (The parameter used to be
                    ignored, so calls without `cutoff` behave exactly as before.)

        Returns:
        --------
            List of tokens, closest first.

    """
    if cutoff is not None and cutoff < 0:
        raise ValueError("cutoff must be a non-negative integer or None.")

    def distance_to_reference(token):
        return spatial.distance.euclidean(embeddings_dict[token], embedding)

    ranked_tokens = sorted(embeddings_dict, key=distance_to_reference)
    return ranked_tokens if cutoff is None else ranked_tokens[:cutoff]

In [130]:
# 'fashion' is disabled in lda_topic_words, so mean_vectors_dict has no entry for it and
# the lookup below would raise a KeyError; inspect one of the active topics instead.
topic = 'art'

# Keep the 2000 tokens closest to the topic vector.
# (A single word vector, e.g. embeddings_dict['diplomacy'] or embeddings_dict['fashion'],
# can be passed instead of the topic mean.)
words = find_closest_embeddings(embedding=mean_vectors_dict[topic])[:2000]

CPU times: total: 0 ns
Wall time: 0 ns


In [114]:
# Load the reference English word list (presumably used to filter `words` in the
# disabled line below; confirm against preprocessing_functions).
english_words = get_english_words(path='../../../input/english_words_alpha_370k.txt')
# print(remove_non_existing_words_from_wordlist(words, english_words))

### 3. Load Documents

In [210]:
from ast import literal_eval

# Read the classified paragraphs. 'merged_POS' is parsed back from its string form with
# literal_eval (presumably a list of lemmas per paragraph; confirm against the CSV).
df = pd.read_csv("../../../../../data/clean/classified_435_citypairs_311k_paragraphs.csv").assign(
    merged_POS=lambda frame: frame['merged_POS'].apply(literal_eval)
)

In [13]:
# def categorize_text(lemmatized_wordlist, mean_vectors_dict, keywords, embeddings_dict, bottom_threshold=0.1, verbose1=False, verbose2=False):
#     """
#     --> Function that assigns a list of lemmatised words to the topic category it is most similar to.

#         Parameters:
#         -----------
#             lemmatized_words: List -> List of words.
#             keywords: nested list -> List of lists of keywords that represent categories.
#             number_of_keywords: Int (default = 1) -> number of keywords to use from a category (setting it to 0 will use them all!)
#             bottom_threshold: Float (default = 0.1) -> Lowest allowed similarity value between a word and dominant category.
#             verbose1: Bool (default = False) -> Shows similarity calculations between a word and each keyword.
#             verbose2: Bool (default = False) -> Shows similarity calculations between a word and each category.

#     """
#     similar_categories = []
    
#     for word in lemmatized_wordlist:
#         try:
#             word_vector = embeddings_dict[word]
#         except:
#             continue
            
#         if verbose1 or verbose2:
#             print(f"word: \t\t'{word}'")

#         closeness = []

#         for category in keywords:
#             try:
#                 keyword_vector = mean_vectors_dict[category]
#             except:
#                 keyword_vector = embeddings_dict[category]
                
#             similarity = 1 - cosine(keyword_vector, word_vector)

#             closeness.append((similarity, category))

#             if verbose1:
#                 # print('___________________________')
#                 print('===>', '\t\tcategory:', category, '\n\t\tsimilarity:', similarity, f"\n")


#         similar_category = max(closeness)

#         sortedcat = sorted(closeness, key=lambda item: item[0], reverse=True)
#         if (sortedcat[0][0] - 0.05) > sortedcat[2][0]:
#             allowed = True
#         else:
#             allowed = False



#         if similar_category[0] > bottom_threshold and allowed:
#             similar_categories.append((word, similar_category))
#             if verbose2:
#                 #print('category similarity:')
#                 # pprint(sorted(closeness, key=lambda x: x[0], reverse=True))
#                 print(f"choice: \tkept")
#                 print(f"\n==> \tcategory:, {similar_category[1]}, \n\tsimilarity score: {similar_category[0]}")
#         elif verbose2:
#             print(f"choice: \tdiscarded")
#             reason = 'ambiguity' if not allowed else 'low similarity score'
#             print(f"reasoning: \t{reason}")

#         if verbose2:
#             # print(f"analysis: {'not' if not allowed else ''} enough difference\nscores:")
#             print("\nscores:\n\t----category----          ----score----")
#             for i in sortedcat:
#                 print(f"\t{i[1]:<10s} \t\t{i[0]}")
#             print()

#             print('='*100)
#             print('')
    
#     categories_dict = {key: 0 for key in keywords}

#     for x in similar_categories:
#         categories_dict[x[1][1]] += x[1][0] #print(x[1])
    

#     nonsorted_results = list(sorted(categories_dict.items(), key=lambda item: item[0], reverse=False))
#     results = list(sorted(categories_dict.items(), key=lambda item: item[1], reverse=True))
#     #pprint(results)

#     # print(f"\nThe dominant category is: '{results[0][0]}'", end='')
#     #if (results[0][1] - (float(results[0][1])/5)) <= results[1][1]:  
#     #    print(f", closely followed by: '{results[1][0]}'.")
#     if verbose2:
#         print('\n')
#         pprint(similar_categories)
#     # print('\n --------------------------------------------------------------------')
    
#     prediction_dict = {'category_similarities': nonsorted_results, 'prediction': results[0][0]} 

#     return prediction_dict

In [323]:
# First 5000 paragraphs whose LDA topic ('idxmax') is not 'Education'.
sample = df.loc[df['idxmax'].ne('Education')].head(5000)

### 3. Classify Paragraphs (by Word Embedding Algorithm)

#### 3.1 Select right parameters

In [324]:
# Settings handed to categorize_text in the classification loop below.
# NOTE(review): meanings taken from the commented-out reference implementation in this
# notebook; confirm against the imported categorize_text.
# Lowest allowed similarity between a word and its dominant category.
bottom_threshold = 0.20
# Print the similarity between a word and each keyword.
verbose1 = False
# Print the similarity between a word and each category.
verbose2 = False

In [325]:
%time

# One score column per topic, in alphabetical order, framed by the row index and the prediction.
topics = sorted(mean_vectors_dict)
nested_l = [['index'] + topics + ['outcome']]

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for idx, row in tqdm(sample['merged_POS'].items(), total=len(sample['merged_POS'])):
    output = categorize_text(
        lemmatized_wordlist=row,
        mean_vectors_dict=mean_vectors_dict,
        keywords=topics,
        embeddings_dict=embeddings_dict,
        bottom_threshold=bottom_threshold,
        verbose1=verbose1,
        verbose2=verbose2,
    )

    # NOTE(review): assumes 'category_similarities' lists (category, score) pairs in the
    # same alphabetical order as `topics`; confirm against categorize_text.
    temp_l = [idx] + [result[1] for result in output['category_similarities']] + [output['prediction']]

    # Guard against a row that does not line up with the header.
    if len(nested_l[0]) != len(temp_l):
        raise Exception('Not the same size!')

    nested_l.append(temp_l)


prediction_df = pd.DataFrame(nested_l[1:], columns=nested_l[0]).set_index('index')

CPU times: total: 0 ns
Wall time: 1.98 ms


  0%|          | 0/5000 [00:00<?, ?it/s]

In [326]:
# Preview: one score column per topic plus the predicted topic ('outcome').
prediction_df.head(5)

Unnamed: 0_level_0,art,diplomacy,entertainment,sport,transportation,outcome
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0,2.326078,0.301628,0.768217,0.609732,diplomacy
1,0.26583,1.243361,0.32219,0.425173,0.0,diplomacy
2,0.0,0.893548,0.0,0.21785,3.286044,transportation
3,0.75122,0.405813,0.301628,0.752151,0.0,sport
4,0.0,0.0,0.0,0.0,0.335578,transportation


### 4. Insert classification into dataset

In [327]:
# Attach the embedding-based scores and prediction to the sampled paragraphs (joined on the index).
updated_df = sample.join(prediction_df)

# True where both methods agree. The LDA labels are capitalised ('Art'), the embedding
# labels lower-case ('art'), hence the lower-casing before comparing.
updated_df['same_categorisation'] = updated_df['idxmax'].str.lower() == updated_df['outcome']

#### Similarity between lda topic model and word embedding algorithm output

In [328]:
# Number of paragraphs on which the LDA topic and the embedding prediction agree.
updated_df['same_categorisation'].sum()

4102

In [329]:
# Number of lemmatised words per paragraph, binned in steps of 10 (up to 40), for the
# paragraphs whose 'max' value exceeds 0.9.
(
    updated_df
    .loc[updated_df['max'] > 0.9, 'merged_POS']
    .str.len()
    .value_counts(bins=[0, 10, 20, 30, 40])
)

(10.0, 20.0]      611
(20.0, 30.0]      544
(30.0, 40.0]      426
(-0.001, 10.0]    338
Name: merged_POS, dtype: int64

In [338]:
# Agreement between the two methods per LDA topic, for paragraphs with 'max' above 0.9
# and more than 5 lemmatised words: absolute counts first, then shares.
is_confident = updated_df['max'] > 0.9
is_long_enough = updated_df['merged_POS'].str.len() > 5
agreement_by_topic = updated_df[is_confident & is_long_enough].groupby('idxmax')['same_categorisation']

print(agreement_by_topic.value_counts())
print('----------------------------------------------------')
print(agreement_by_topic.value_counts(normalize=True))

idxmax          same_categorisation
Art             True                   471
                False                   86
Diplomacy       True                   419
                False                   18
Entertainment   True                   957
                False                   23
Sport           True                   176
                False                    5
Transportation  True                   201
                False                   83
Name: same_categorisation, dtype: int64
----------------------------------------------------
idxmax          same_categorisation
Art             True                   0.845601
                False                  0.154399
Diplomacy       True                   0.958810
                False                  0.041190
Entertainment   True                   0.976531
                False                  0.023469
Sport           True                   0.972376
                False                  0.027624
Transportation  True 

### 5. Aggregate paragraphs Classification into City Pair Classification

#### 5.1 Select right parameters

In [339]:
# Filters applied before aggregating paragraphs per city pair:
# a paragraph is kept only when its 'max' column exceeds this value ...
bottom_lda_threshold = 0.9
# ... and it has more than this many lemmatised words.
minimal_paragraph_length = 5

In [340]:
# City pairs to aggregate; narrow the slice to restrict the run to a subset.
selected_city_pairs = updated_df['city_pair'].unique()[:]

# Keep the sufficiently confident and sufficiently long paragraphs of those city pairs.
temp_df = updated_df[
    updated_df['city_pair'].isin(selected_city_pairs)
    & (updated_df['max'] > bottom_lda_threshold)
    & (updated_df['merged_POS'].str.len() > minimal_paragraph_length)
]

In [342]:
# temp_df[temp_df['idxmax'] != 'Education']['same_categorisation'].value_counts(normalize=True)

In [343]:
# One group of filtered paragraphs per city pair.
grouped_df = temp_df.groupby('city_pair')

In [344]:
# Header row of the city-pair summary table: general columns first, then an
# (lda_<category>, embedding_<category>) pair of count columns per category.
nested_list2 = [
    [
        'city_pair',
        'paragraphs',
        'lemmatised_paragraph_length',
        'same_categorisation_raw',
        'same_categorisation_percentage',
        'lda_dominant_category',
        'embedding_dominant_category',
        *(
            f'{method}_{category}'
            for category in ('art', 'diplomacy', 'entertainment', 'sport', 'transportation')
            for method in ('lda', 'embedding')
        ),
    ]
]

In [345]:
from IPython.display import display

categories = topics

# Keep only the header row, so re-running this cell does not append every city pair again.
nested_list2 = [nested_list2[0]]

for city_pair, sub_df in tqdm(grouped_df):
    paragraph_count = sub_df['paragraph'].count()
    same_categorisation_raw = sub_df['same_categorisation'].sum()
    same_categorisation_percentage = same_categorisation_raw / sub_df['same_categorisation'].count()
    lemmatised_paragraph_len = sub_df['merged_POS'].str.len().mean()

    # Number of paragraphs per category according to each method.
    lda_prediction = sub_df['idxmax'].value_counts()
    embedding_prediction = sub_df['outcome'].value_counts()

    # Most frequent category per method.
    lda_dominant_category = lda_prediction.idxmax()
    embedding_dominant_category = embedding_prediction.idxmax()

    lda_prediction = lda_prediction.to_dict()
    embedding_prediction = embedding_prediction.to_dict()

    # Categories that never occur for this city pair count as 0. This replaces the former
    # `len(...) != 6` check, which did not match the number of active topics.
    # The LDA labels are capitalised, the embedding labels lower-case.
    for category in categories:
        lda_prediction.setdefault(category.capitalize(), 0)
        embedding_prediction.setdefault(category, 0)

    temp_l2 = [city_pair, paragraph_count, lemmatised_paragraph_len, same_categorisation_raw, same_categorisation_percentage,
               lda_dominant_category, embedding_dominant_category,
               lda_prediction['Art'], embedding_prediction['art'],
               lda_prediction['Diplomacy'], embedding_prediction['diplomacy'],
               lda_prediction['Entertainment'], embedding_prediction['entertainment'],
               lda_prediction['Sport'], embedding_prediction['sport'],
               lda_prediction['Transportation'], embedding_prediction['transportation']]

    # Guard against a row that does not line up with the header.
    if len(nested_list2[0]) != len(temp_l2):
        raise Exception('Not the same size!')

    nested_list2.append(temp_l2)

  0%|          | 0/2 [00:00<?, ?it/s]

In [374]:
# The first entry of nested_list2 is the header, the remaining entries are one row per city pair.
summary_header, *summary_rows = nested_list2
final_df = pd.DataFrame(summary_rows, columns=summary_header)

# Normalize category outcomes: divide the per-category counts (column 8 onwards) by the
# number of paragraphs of the city pair. The result is displayed only, not stored.
final_df.iloc[:, 7:].div(final_df['paragraphs'], axis=0)

Unnamed: 0,lda_art,embedding_art,lda_diplomacy,embedding_diplomacy,lda_entertainment,embedding_entertainment,lda_sport,embedding_sport,lda_transportation,embedding_transportation
0,0.192771,0.174699,0.106426,0.134538,0.47992,0.481928,0.068273,0.090361,0.15261,0.118474
1,0.237506,0.200412,0.197836,0.235446,0.381762,0.385368,0.075734,0.091705,0.107161,0.087069


In [375]:
# Show the city-pair summary with the raw (un-normalised) category counts.
final_df

Unnamed: 0,city_pair,paragraphs,lemmatised_paragraph_length,same_categorisation_raw,same_categorisation_percentage,lda_dominant_category,embedding_dominant_category,lda_art,embedding_art,lda_diplomacy,embedding_diplomacy,lda_entertainment,embedding_entertainment,lda_sport,embedding_sport,lda_transportation,embedding_transportation
0,berlin_milan,498,29.405622,458,0.919679,Entertainment,entertainment,96,87,53,67,239,240,34,45,76,59
1,london_berlin,1941,32.223596,1766,0.90984,Entertainment,entertainment,461,389,384,457,741,748,147,178,208,169
