<div style="background-color: lightgreen; border-radius: 5px; padding: 10px;">
    <h4>Word Embedding Categorisation</h4>
    <p>...</p>
</div>

### imports

In [1]:
from preprocessing_functions import *
from word_embedding_functions import *

import pandas as pd
import numpy as np
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')

- Clean and preprocess (lemmatise) a list of documents (e.g. paragraphs).
- Get your topics through unsupervised clustering (LDA topic modelling).
- Use the resulting topic words as input for the word embedding algorithm.

### 1. Set Topic Words

In [7]:
# Seed words per topic. Words with a double meaning are avoided because they
# help less (e.g. bear, degree).

#fashion_words = ['fashion', 'model', 'vogue', 'store', 'designer', 'couture', 'catwalk', 'runway', 'modeling', 'clothing', 'cosmetic', 'brand', 'retail', 'advertising', 'perfume']
diplomacy_words = [
    'War', 'Embassy', 'Army', 'Diplomatic', 'Ambassador',
    'Treaty', 'Protest', 'Force', 'Mission', 'Arrest',
    'Government', 'Police', 'Attack', 'Party', 'Minister',
]
entertainment_words = [
    'Opera', 'Festival', 'Perform', 'Orchestra', 'Symphony',
    'Concert', 'Music', 'Film', 'Sing', 'Theatre',
    'Performance', 'Role', 'Premiere', 'Tour', 'Band',
]
art_words = [
    'Exhibition', 'Art', 'Museum', 'Gallery', 'Exhibit',
    'Painting', 'Collection', 'Paint', 'Portrait', 'Artist',
    'Sculpture', 'Fashion', 'Design', 'Contemporary', 'Painter',
]
# "bear" was replaced with "Educate" and "degree" with "Lecture"
education_words = [
    'Study', 'School', 'Professor', 'University', 'Graduate',
    'Educate', 'Lecture', 'Research', 'College', 'Teach',
    'Science', 'Education', 'Philosophy', 'Doctorate', 'Faculty',
]
transportation_words = [
    'Railway', 'Route', 'Line', 'Operate', 'Flight',
    'Station', 'Service', 'Airline', 'Airport', 'Train',
    'Passenger', 'Speed', 'Aircraft', 'Rail', 'Network',
]
sport_words = [
    'Final', 'Win', 'Team', 'Match', 'Game',
    'Goal', 'Club', 'League', 'Champion', 'Championship',
    'Season', 'Score', 'Round', 'Tournament', 'Football',
]

# Topics that take part in the categorisation; fashion and education are
# currently switched off.
lda_topic_words = {
    # 'fashion': fashion_words,
    'diplomacy': diplomacy_words,
    'entertainment': entertainment_words,
    'art': art_words,
    # 'education': education_words,
    'transportation': transportation_words,
    'sport': sport_words,
}

### 2. Loading GloVe Word Embedding

In [5]:
%%time

embeddings_dict, discarded_dict = load_glove_word_embeddings(GLOVE_PATH="../../../../../glove.840B.300d.txt")

This will take approximately ~ 4 minutes...


  0%|          | 0/2196017 [00:00<?, ?it/s]

CPU times: total: 3min 2s
Wall time: 3min 11s


### 3. Get Mean of the words from each Topic

In [71]:
# Build one representative vector per topic: the element-wise mean of the
# embeddings of all topic words that exist in the embedding vocabulary.
mean_vectors_dict = {}
for topic, topic_words in lda_topic_words.items():
    # Lower-case BEFORE the vocabulary check so that the membership test and the
    # lookup use the same key. Previously the cased word was tested but the
    # lower-cased word was looked up, which could raise KeyError or silently
    # drop words whose lower-cased form is in the vocabulary.
    known_words = [word.lower() for word in topic_words if word.lower() in embeddings_dict]

    # An empty list would make np.mean return NaN and poison every later
    # similarity score for this topic, so fail loudly instead.
    if not known_words:
        raise ValueError(f"No word of topic '{topic}' was found in the embedding vocabulary")

    mean_vectors_dict[topic] = np.mean([embeddings_dict[word] for word in known_words], axis=0)

### 4. Load Documents

In [16]:
from ast import literal_eval

# Older data set, kept for reference:
# df = pd.read_csv("../../../../../data/clean/classified_435_citypairs_311k_paragraphs.csv")
df = pd.read_csv("../../../../../data/clean/classified_50cities_740citypairs_400k_paragraphs.csv")

# 'merged_POS' is read from the CSV as text; literal_eval turns every cell
# back into the Python literal it represents.
df['merged_POS'] = df['merged_POS'].map(literal_eval)

In [40]:
from ast import literal_eval

# Second data set (435 city pairs); loaded the same way as df.
df2 = pd.read_csv("../../../../../data/clean/classified_435_citypairs_311k_paragraphs.csv")

# Parse the textual 'merged_POS' cells back into Python literals.
df2['merged_POS'] = df2['merged_POS'].map(literal_eval)

In [42]:
# Show the column labels of df.
df.columns

Index(['paragraph_id', 'city_pair', 'paragraph', 'merged_POS', 'lda_sport',
       'lda_art', 'lda_entertainment', 'lda_diplomacy', 'lda_transportation',
       'lda_education', 'lda_dominant', 'lda_dominant_score'],
      dtype='object')

### Select number of paragraphs to process

In [65]:
# Only the first 20,000 rows of df2 are classified below.
sample = df2.iloc[:20000]

### 5. Classify Paragraphs (by Word Embedding Algorithm)

#### 5.1 Select right parameters

In [66]:
# Passed to categorize_text as `bottom_threshold`.
bottom_threshold = 0.20

# Both verbosity switches of categorize_text are off.
verbose1 = verbose2 = False

In [69]:
%time

topics = [key for key in list(sorted(mean_vectors_dict.keys()))]
nested_l = [['index']+topics+['embedding_dominant']]

for idx, row in tqdm(sample['merged_POS'].iteritems(), total=len(sample['merged_POS'])):
    output = categorize_text(lemmatized_wordlist=row, mean_vectors_dict=mean_vectors_dict, keywords=topics, embeddings_dict=embeddings_dict, bottom_threshold=bottom_threshold, verbose1=verbose1, verbose2=verbose2)
    
    temp_l = [idx] +[result[1] for result in output['category_similarities']] + [output['prediction']]
    
    #print(row, output['category_similarities'])
    
    if len(nested_l[0]) != len(temp_l):
        raise Exception('Not the same size!')
        
    nested_l.append(temp_l)


prediction_df = pd.DataFrame(nested_l[1:],columns=nested_l[0]).set_index('index')

CPU times: total: 0 ns
Wall time: 0 ns


  0%|          | 0/20000 [00:00<?, ?it/s]

In [27]:
# Display the per-paragraph topic scores and the predicted topic.
prediction_df

Unnamed: 0_level_0,art,diplomacy,entertainment,sport,transportation,embedding_dominant
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.000000,2.326078,0.301628,0.768217,0.609732,diplomacy
1,0.265830,1.243361,0.322190,0.425173,0.000000,diplomacy
2,0.000000,0.893548,0.000000,0.217850,3.286044,transportation
3,0.751220,0.405813,0.301628,0.752151,0.000000,sport
4,0.000000,0.000000,0.000000,0.000000,0.335578,transportation
...,...,...,...,...,...,...
4995,0.564840,2.052759,0.235481,0.342959,0.000000,diplomacy
4996,3.496995,0.232795,0.903156,0.000000,0.207021,art
4997,0.681283,2.127343,1.813117,0.498108,0.253583,diplomacy
4998,1.605937,1.992433,2.456318,0.893511,0.211430,entertainment


### 6. Insert classification into dataset

In [37]:
# Attach the embedding scores and prediction to the sampled paragraphs
# (joined on the index).
updated_df = sample.join(prediction_df)

# The two models agree when the LDA label (e.g. 'lda_art') ends with the
# embedding label (e.g. 'art'). Zipping the two columns avoids a row-wise
# DataFrame.apply, which is slow and misbehaves on an empty frame.
updated_df['same_categorisation'] = [
    lda_label.endswith(embedding_label)
    for lda_label, embedding_label in zip(updated_df['lda_dominant'], updated_df['embedding_dominant'])
]

In [50]:
# Same join as above, but agreement is measured against the 'idxmax' column,
# whose values are lower-cased before the comparison.
updated_df2 = sample.join(prediction_df)
lda_label_lower = updated_df2['idxmax'].str.lower()
updated_df2['same_categorisation'] = lda_label_lower.eq(updated_df2['embedding_dominant'])

#### Similarity between the LDA topic model and the word embedding algorithm output

In [52]:
# Number of paragraphs for which 'same_categorisation' is True.
updated_df['same_categorisation'].sum()

3216

In [329]:
# Number of lemmatised words in the paragraphs
updated_df[(updated_df['max'] > 0.9)]['merged_POS'].str.len().value_counts(bins=[0, 10, 20, 30, 40])

(10.0, 20.0]      611
(20.0, 30.0]      544
(30.0, 40.0]      426
(-0.001, 10.0]    338
Name: merged_POS, dtype: int64

In [61]:
# Agreement per 'idxmax' topic, for rows with 'max' above 0.8 and more than
# 10 lemmatised words: absolute counts first, then shares.
confident_long_rows = (updated_df2['max'] > 0.8) & (updated_df2['merged_POS'].str.len() > 10)
agreement_per_topic = updated_df2[confident_long_rows].groupby('idxmax')['same_categorisation']

print(agreement_per_topic.value_counts())
print('----------------------------------------------------')
print(agreement_per_topic.value_counts(normalize=True))

idxmax          same_categorisation
Art             True                   408
                False                   79
Diplomacy       True                   448
                False                   20
Education       False                  378
Entertainment   True                   948
                False                   18
Sport           True                   168
                False                    1
Transportation  True                   193
                False                   75
Name: same_categorisation, dtype: int64
----------------------------------------------------
idxmax          same_categorisation
Art             True                   0.837782
                False                  0.162218
Diplomacy       True                   0.957265
                False                  0.042735
Education       False                  1.000000
Entertainment   True                   0.981366
                False                  0.018634
Sport           True      

In [60]:
# Agreement per 'lda_dominant' topic, for rows with 'lda_dominant_score' above
# 0.8 and more than 10 lemmatised words: absolute counts first, then shares.
confident_long_rows = (updated_df['lda_dominant_score'] > 0.8) & (updated_df['merged_POS'].str.len() > 10)
agreement_per_topic = updated_df[confident_long_rows].groupby('lda_dominant')['same_categorisation']

print(agreement_per_topic.value_counts())
print('----------------------------------------------------')
print(agreement_per_topic.value_counts(normalize=True))

lda_dominant        same_categorisation
lda_art             True                   227
                    False                  201
lda_diplomacy       True                   409
                    False                    7
lda_education       False                  413
lda_entertainment   True                   979
                    False                   20
lda_sport           True                   173
                    False                    2
lda_transportation  True                   192
                    False                   68
Name: same_categorisation, dtype: int64
----------------------------------------------------
lda_dominant        same_categorisation
lda_art             True                   0.530374
                    False                  0.469626
lda_diplomacy       True                   0.983173
                    False                  0.016827
lda_education       False                  1.000000
lda_entertainment   True                   0.97998

In [68]:
threshold = 0.7

# Embedding predictions for the paragraphs that LDA labels 'lda_art' with a
# score above `threshold`.
lda_art_rows = (updated_df['lda_dominant'] == 'lda_art') & (updated_df['lda_dominant_score'] > threshold)
embedding_labels = updated_df.loc[lda_art_rows, 'embedding_dominant']

print(embedding_labels.value_counts())
print()
print(embedding_labels.value_counts(normalize=True))

art               349
diplomacy         176
entertainment      47
sport              46
transportation     14
Name: embedding_dominant, dtype: int64

art               0.552215
diplomacy         0.278481
entertainment     0.074367
sport             0.072785
transportation    0.022152
Name: embedding_dominant, dtype: float64


### 7. Aggregate Paragraph Classifications into City Pair Classification

#### 7.1 Select right parameters

In [339]:
# Row filters for the aggregation: minimum 'max' score (exclusive) and
# minimum number of lemmatised words (exclusive).
bottom_lda_threshold, minimal_paragraph_length = 0.9, 5

In [340]:
# `[:]` selects every city pair; narrow the slice to aggregate only a subset.
selected_city_pairs = updated_df['city_pair'].unique()[:]

# Keep rows of the selected city pairs that pass both thresholds.
rows_to_keep = (
    updated_df['city_pair'].isin(selected_city_pairs)
    & (updated_df['max'] > bottom_lda_threshold)
    & (updated_df['merged_POS'].str.len() > minimal_paragraph_length)
)
temp_df = updated_df[rows_to_keep]

In [342]:
# temp_df[temp_df['idxmax'] != 'Education']['same_categorisation'].value_counts(normalize=True)

In [343]:
# One group of filtered paragraphs per city pair.
grouped_df = temp_df.groupby('city_pair')

In [344]:
# Header row of the city-pair summary table; the aggregation loop appends one
# row of the same length per city pair.
city_pair_header = [
    'city_pair',
    'paragraphs',
    'lemmatised_paragraph_length',
    'same_categorisation_raw',
    'same_categorisation_percentage',
    'lda_dominant_category',
    'embedding_dominant_category',
    'lda_art',
    'embedding_art',
    'lda_diplomacy',
    'embedding_diplomacy',
    'lda_entertainment',
    'embedding_entertainment',
    'lda_sport',
    'embedding_sport',
    'lda_transportation',
    'embedding_transportation',
]
nested_list2 = [city_pair_header]

In [345]:
from IPython.display import display

# Aggregate the paragraph-level labels of every city pair into one summary row.
categories = topics
for city_pair, sub_df in tqdm(grouped_df):
    paragraph_count = sub_df['paragraph'].count()
    same_categorisation_raw = sub_df['same_categorisation'].sum()
    # Share (0-1) of paragraphs on which both models agree.
    same_categorisation_percentage = same_categorisation_raw / sub_df['same_categorisation'].count()
    lemmatised_paragraph_len = sub_df['merged_POS'].str.len().mean()

    # The classification cell names the embedding prediction
    # 'embedding_dominant'; fall back to the older name 'outcome' only when
    # that column is absent.
    embedding_column = 'embedding_dominant' if 'embedding_dominant' in sub_df.columns else 'outcome'

    # Paragraph counts per predicted category for each model.
    lda_prediction = sub_df['idxmax'].value_counts()
    embedding_prediction = sub_df[embedding_column].value_counts()

    # Most frequent category per model.
    lda_dominant_category = lda_prediction.idxmax()
    embedding_dominant_category = embedding_prediction.idxmax()

    lda_prediction = lda_prediction.to_dict()
    embedding_prediction = embedding_prediction.to_dict()

    # Categories that never occur for this city pair count as 0. LDA labels
    # are capitalised ('Art'), embedding labels are lower case ('art').
    # Filling unconditionally replaces the former `len(...) != 6` guard.
    for category in categories:
        lda_prediction.setdefault(category.capitalize(), 0)
        embedding_prediction.setdefault(category, 0)

    temp_l2 = [city_pair, paragraph_count, lemmatised_paragraph_len, same_categorisation_raw, same_categorisation_percentage,
               lda_dominant_category, embedding_dominant_category,
               lda_prediction['Art'], embedding_prediction['art'],
               lda_prediction['Diplomacy'], embedding_prediction['diplomacy'],
               lda_prediction['Entertainment'], embedding_prediction['entertainment'],
               lda_prediction['Sport'], embedding_prediction['sport'],
               lda_prediction['Transportation'], embedding_prediction['transportation']]

    # Every row must line up with the header row of nested_list2.
    if len(nested_list2[0]) != len(temp_l2):
        raise ValueError(
            f'Not the same size! Expected {len(nested_list2[0])} values, got {len(temp_l2)} for {city_pair}'
        )

    nested_list2.append(temp_l2)

  0%|          | 0/2 [00:00<?, ?it/s]

In [374]:
# First entry of nested_list2 is the header, the remaining entries are rows.
summary_header, *summary_rows = nested_list2
final_df = pd.DataFrame(summary_rows, columns=summary_header)

# Normalize category outcomes: divide every count column (position 7 onwards)
# by the number of paragraphs. The result is only displayed, not stored.
final_df.iloc[:, 7:].div(final_df['paragraphs'], axis=0) # .count()

Unnamed: 0,lda_art,embedding_art,lda_diplomacy,embedding_diplomacy,lda_entertainment,embedding_entertainment,lda_sport,embedding_sport,lda_transportation,embedding_transportation
0,0.192771,0.174699,0.106426,0.134538,0.47992,0.481928,0.068273,0.090361,0.15261,0.118474
1,0.237506,0.200412,0.197836,0.235446,0.381762,0.385368,0.075734,0.091705,0.107161,0.087069


In [375]:
# Display the city-pair summary table (raw counts).
final_df

Unnamed: 0,city_pair,paragraphs,lemmatised_paragraph_length,same_categorisation_raw,same_categorisation_percentage,lda_dominant_category,embedding_dominant_category,lda_art,embedding_art,lda_diplomacy,embedding_diplomacy,lda_entertainment,embedding_entertainment,lda_sport,embedding_sport,lda_transportation,embedding_transportation
0,berlin_milan,498,29.405622,458,0.919679,Entertainment,entertainment,96,87,53,67,239,240,34,45,76,59
1,london_berlin,1941,32.223596,1766,0.90984,Entertainment,entertainment,461,389,384,457,741,748,147,178,208,169


# Extras

#### Show closest words to topic vector

In [14]:
from scipy import spatial
from preprocessing_functions import *

def find_closest_embeddings(embedding, cutoff=25):
    """Rank every token of ``embeddings_dict`` by distance to ``embedding``.

    Parameters
    ----------
    embedding : array-like
        Query vector that each stored vector is compared with.
    cutoff : int, optional
        Not applied by this function: the complete ranking is returned and
        callers slice it themselves.

    Returns
    -------
    list
        All keys of the global ``embeddings_dict``, ordered from the smallest
        to the largest Euclidean distance to ``embedding``.
    """
    def distance_to_query(token):
        # Euclidean distance between the token's vector and the query vector.
        return spatial.distance.euclidean(embeddings_dict[token], embedding)

    return sorted(embeddings_dict, key=distance_to_query)

In [15]:
topic = 'art'

# Query vector: the topic's mean vector. A single word vector such as
# embeddings_dict['diplomacy'] or embeddings_dict['fashion'] can be used instead.
query_vector = mean_vectors_dict[topic]

# Keep the 2000 tokens closest to the query vector.
words = find_closest_embeddings(embedding=query_vector)[:2000]


In [None]:
# Load the English word list and print the candidate words after filtering.
# The path string and the call were previously left unterminated, which made
# the cell a syntax error.
# NOTE(review): both helpers come from the star imports; their exact semantics
# are inferred from their names - confirm.
english_words = get_english_words(path='../../../input/english_words_alpha_370k.txt')
print(remove_non_existing_words_from_wordlist(words, english_words))