### Imports

In [33]:
from preprocessing_functions import *
from word_embedding_functions import *

import pandas as pd
import numpy as np
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')

- Clean and preprocess (lemmatise) a list of documents (e.g. paragraphs).
- Derive the topics through unsupervised clustering (LDA topic modelling).
- Use these topics as input for the word-embedding algorithm.

In [27]:
# Keyword lists, one per topic. Removing words with a double meaning helps
# (e.g. bear, degree), so ambiguous keywords were replaced by hand.
diplomacy_words = [
    'War', 'Embassy', 'Army', 'Diplomatic', 'Ambassador',
    'Treaty', 'Protest', 'Force', 'Mission', 'Arrest',
    'Government', 'Police', 'Attack', 'Party', 'Minister',
]
entertainment_words = [
    'Opera', 'Festival', 'Perform', 'Orchestra', 'Symphony',
    'Concert', 'Music', 'Film', 'Sing', 'Theatre',
    'Performance', 'Role', 'Premiere', 'Tour', 'Band',
]
art_words = [
    'Exhibition', 'Art', 'Museum', 'Gallery', 'Exhibit',
    'Painting', 'Collection', 'Paint', 'Portrait', 'Artist',
    'Sculpture', 'Fashion', 'Design', 'Contemporary', 'Painter',
]
# Currently disabled topic (bear was replaced with Educate and degree with Lecture):
# education_words = ['Study', 'School', 'Professor', 'University', 'Graduate', 'Educate', 'Lecture', 'Research', 'College', 'Teach', 'Science', 'Education', 'Philosophy', 'Doctorate', 'Faculty']
transportation_words = [
    'Railway', 'Route', 'Line', 'Operate', 'Flight',
    'Station', 'Service', 'Airline', 'Airport', 'Train',
    'Passenger', 'Speed', 'Aircraft', 'Rail', 'Network',
]
sport_words = [
    'Final', 'Win', 'Team', 'Match', 'Game',
    'Goal', 'Club', 'League', 'Champion', 'Championship',
    'Season', 'Score', 'Round', 'Tournament', 'Football',
]

# Topic name -> keyword list; the insertion order of the topics is kept.
lda_topic_words = dict(
    diplomacy=diplomacy_words,
    entertainment=entertainment_words,
    art=art_words,
    # education=education_words,
    transportation=transportation_words,
    sport=sport_words,
)

### 1. Loading GloVe Word Embedding

In [34]:
%%time

# Load the GloVe vectors into `embeddings_dict`; later cells index it by word
# (embeddings_dict[word]) to obtain that word's vector.
# `load_glove_word_embeddings` is not defined in this notebook -- presumably it comes
# from the star import of word_embedding_functions (TODO confirm).
# NOTE(review): the relative path is machine-specific; adjust it to wherever
# glove.840B.300d.txt is stored.
embeddings_dict = load_glove_word_embeddings(GLOVE_PATH="../../../../../glove.840B.300d.txt")

This will take approximately ~ 4 minutes...


  0%|          | 0/2196017 [00:00<?, ?it/s]

. ['.', '.', '-0.1573', '-0.29517']
at ['name@domain.com', '0.0061218', '0.39595', '-0.22079']
. ['.', '.', '.', '.']
to ['name@domain.com', '0.33865', '0.12698', '-0.16885']
. ['.', '0.035974', '-0.024421', '0.71402']
. ['.', '.', '.', '0.033459']
email ['name@domain.com', '0.33529', '0.32949', '0.2646']
or ['name@domain.com', '0.48374', '0.49669', '-0.25089']
contact ['name@domain.com', '0.016426', '0.13728', '0.18781']
Email ['name@domain.com', '0.37344', '0.024573', '-0.12583']
on ['name@domain.com', '0.037295', '-0.15381', '-0.045189']
At ['Killerseats.com', '-0.13854', '-0.01706', '-0.13651']
by ['name@domain.com', '0.6882', '-0.36436', '0.62079']
in ['mylot.com', '-0.18148', '0.47096', '0.32916']
emailing ['name@domain.com', '0.39173', '-0.39132', '-0.4266']
Contact ['name@domain.com', '0.14933', '-0.28605', '0.3444']
at ['name@domain.com', '0.44321', '-0.40005', '-0.20065']
• ['name@domain.com', '-0.13288', '-0.31383', '-0.032356']
at ['Amazon.com', '-0.5275', '-0.73685', '0.10

### 2. Compute the mean embedding vector of the keywords in each category

In [38]:
# Build one representative vector per topic: the element-wise mean of the
# embeddings of that topic's keywords.
mean_vectors_dict = {}
for topic, topic_words in lda_topic_words.items():
    # Only keywords that are in the embedding vocabulary (i.e. have been seen by
    # the model before) can contribute to the mean.
    known_vectors = [embeddings_dict[word] for word in topic_words if word in embeddings_dict]
    if not known_vectors:
        # np.mean([]) would quietly produce NaN here (warnings are suppressed in this
        # notebook) and poison every similarity computed for this topic later on.
        raise ValueError(f"None of the keywords of topic '{topic}' has a word embedding.")
    mean_vectors_dict[topic] = np.mean(known_vectors, axis=0)

### 2.1 Load Documents

In [54]:
from ast import literal_eval

# Read the paragraph table from CSV.
# NOTE(review): the relative path is machine-specific; adjust it to the local data folder.
df = pd.read_csv("../../../../../data/clean/classified_435_citypairs_311k_paragraphs.csv")
# Parse every `merged_POS` cell with literal_eval so it becomes a Python object again
# (presumably the lemmatised word lists were written to the CSV as their string
# representation -- TODO confirm). Later cells pass these values to categorize_text.
df['merged_POS'] = df['merged_POS'].apply(literal_eval)

In [70]:
# Working sample: the first 500 rows whose `idxmax` is 'Sport' and whose `max` value exceeds 0.9.
sample = df.loc[df['idxmax'].eq('Sport') & df['max'].gt(0.9)].head(500)

In [73]:
def categorize_text(lemmatized_wordlist, mean_vectors_dict, keywords, embeddings_dict, bottom_threshold=0.1, verbose1=False, verbose2=False, ambiguity_margin=0.05, ambiguity_rank=2):
    """
    --> Function that assigns a list of words to the best-matching category via word embeddings.

        Every word that has an embedding is compared (cosine similarity) with the vector of
        each category. A word "votes" for its most similar category when that similarity is
        above `bottom_threshold` and the word is not ambiguous. The votes (similarity values)
        are summed per category and the category with the highest sum is the prediction.

        Parameters:
        -----------
            lemmatized_wordlist: List -> List of words. Words without an entry in `embeddings_dict` are skipped.
            mean_vectors_dict: Dict -> Maps a category name to its representative vector.
            keywords: List -> Category names to score. A category that is missing from `mean_vectors_dict`
                              falls back to `embeddings_dict[category]`.
            embeddings_dict: Dict -> Maps a word to its embedding vector.
            bottom_threshold: Float (default = 0.1) -> Lowest allowed similarity value between a word and dominant category.
            verbose1: Bool (default = False) -> Shows similarity calculations between a word and each category.
            verbose2: Bool (default = False) -> Shows whether a word was kept or discarded, and the ranked category scores.
            ambiguity_margin: Float (default = 0.05) -> A word is ambiguous when its best similarity does not beat the
                              similarity of the reference category (see `ambiguity_rank`) by more than this margin.
            ambiguity_rank: Int (default = 2) -> 0-based position, in the descending similarity ranking, of the category
                              the best one is compared with (2 = third best). If there are fewer categories the last
                              one is used; with a single category no ambiguity check is made.

        Returns:
        --------
            Dict with
                'category_similarities': list of (category, summed similarity) tuples sorted by category name,
                'prediction': category with the highest summed similarity. If no word voted, all sums are 0
                              and the first entry of `keywords` is returned.

        Raises:
        -------
            ValueError: if `keywords` is empty.
            KeyError: if a category is neither in `mean_vectors_dict` nor in `embeddings_dict`.
    """
    keywords = list(keywords)
    if not keywords:
        raise ValueError("keywords must contain at least one category")

    similar_categories = []

    for word in lemmatized_wordlist:
        try:
            word_vector = embeddings_dict[word]
        except (KeyError, TypeError):
            # Out-of-vocabulary (or unhashable) token: it cannot be scored, skip it.
            continue

        if verbose1 or verbose2:
            print(f"word: \t\t'{word}'")

        closeness = []

        for category in keywords:
            try:
                keyword_vector = mean_vectors_dict[category]
            except KeyError:
                # No mean vector for this category: use the embedding of the category name itself.
                keyword_vector = embeddings_dict[category]

            # `cosine` returns a distance, so 1 - distance is the cosine similarity.
            similarity = 1 - cosine(keyword_vector, word_vector)

            closeness.append((similarity, category))

            if verbose1:
                print('===>', '\t\tcategory:', category, '\n\t\tsimilarity:', similarity, f"\n")

        # Tuples compare by similarity first, then by category name.
        similar_category = max(closeness)

        sortedcat = sorted(closeness, key=lambda item: item[0], reverse=True)

        # Ambiguity check: the best category must clearly beat the reference category.
        # The reference rank is clipped so that fewer than three categories no longer
        # raise an IndexError.
        reference_rank = min(ambiguity_rank, len(sortedcat) - 1)
        if reference_rank < 1:
            allowed = True
        else:
            allowed = (sortedcat[0][0] - ambiguity_margin) > sortedcat[reference_rank][0]

        if similar_category[0] > bottom_threshold and allowed:
            similar_categories.append((word, similar_category))
            if verbose2:
                print(f"choice: \tkept")
                print(f"\n==> \tcategory:, {similar_category[1]}, \n\tsimilarity score: {similar_category[0]}")
        elif verbose2:
            print(f"choice: \tdiscarded")
            reason = 'ambiguity' if not allowed else 'low similarity score'
            print(f"reasoning: \t{reason}")

        if verbose2:
            print("\nscores:\n\t----category----          ----score----")
            for i in sortedcat:
                print(f"\t{i[1]:<10s} \t\t{i[0]}")
            print()

            print('='*100)
            print('')

    # Sum the similarity of every kept word into the category it voted for.
    categories_dict = {key: 0 for key in keywords}
    for _, (similarity, category) in similar_categories:
        categories_dict[category] += similarity

    # Alphabetical order gives callers a stable column order; descending order gives the winner.
    nonsorted_results = sorted(categories_dict.items(), key=lambda item: item[0])
    results = sorted(categories_dict.items(), key=lambda item: item[1], reverse=True)

    if verbose2:
        print('\n')
        pprint(similar_categories)

    prediction_dict = {'category_similarities': nonsorted_results, 'prediction': results[0][0]}

    return prediction_dict

### 3. Classify Paragraphs (by Word Embedding Algorithm)

#### 3.1 Select the right parameters

In [90]:
# Settings handed to categorize_text by the classification cell.
bottom_threshold = 0.25        # lowest similarity a word may have with its best category
verbose1 = verbose2 = False    # both debug print-outs switched off

In [93]:
%time

topics = [key for key in list(sorted(mean_vectors_dict.keys()))]
nested_l = [['index']+topics+['outcome']]

for idx, row in tqdm(sample['merged_POS'].iteritems(), total=len(sample['merged_POS'])):
    output = categorize_text(lemmatized_wordlist=row, mean_vectors_dict=mean_vectors_dict, keywords=topics, embeddings_dict=embeddings_dict, bottom_threshold=bottom_threshold, verbose1=verbose1, verbose2=verbose2)
    
    temp_l = [idx] +[result[1] for result in output['category_similarities']] + [output['prediction']]
    
    if len(nested_l[0]) != len(temp_l):
        raise Exception('Not the same size!')
    nested_l.append(temp_l)


prediction_df = pd.DataFrame(nested_l[1:],columns=nested_l[0]).set_index('index')

CPU times: total: 0 ns
Wall time: 0 ns


  0%|          | 0/500 [00:00<?, ?it/s]

In [99]:
# Preview: one summed-similarity column per topic plus the predicted topic ('outcome').
prediction_df.head(5)

Unnamed: 0_level_0,art,diplomacy,entertainment,sport,transportation,outcome
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
38,0.0,0.270101,0.402315,7.37086,0.0,sport
52,0.0,1.333066,0.0,7.911461,1.976946,sport
61,0.0,0.0,0.0,4.033192,0.0,sport
87,0.0,0.0,0.0,1.073657,0.0,sport
93,0.0,0.0,0.0,2.194695,0.0,sport


### 4. Insert classification into dataset

In [115]:
# Attach the embedding-based scores and prediction to the sampled paragraphs (joined on the index).
updated_df = sample.join(prediction_df)
# `idxmax` labels are capitalised ('Sport') while `outcome` labels are lower-case ('sport'),
# so lower-case the former before comparing the two classifications.
updated_df['same_categorisation'] = (updated_df['idxmax'].str.lower() == updated_df['outcome'])
# Display last: only the final expression of a cell is rendered, and this way the
# preview already includes the new column.
updated_df.head(2)

### 5. Aggregate paragraphs Classification into City Pair Classification

#### 5.1 Select the right parameters

In [101]:
# Filters used when selecting paragraphs for the city-pair aggregation:
bottom_lda_threshold = 0.9  # a paragraph is kept only if its `max` value is above this
minimal_paragraph_length = 5  # a paragraph is kept only if `merged_POS` has more entries than this

In [119]:
# Keep paragraphs that
#   (a) belong to the selected city pairs (the [:] slice currently selects all of them),
#   (b) have a `max` value above bottom_lda_threshold, and
#   (c) have more than minimal_paragraph_length entries in `merged_POS`.
temp_df = updated_df[
    updated_df['city_pair'].isin(updated_df['city_pair'].unique()[:])
    & updated_df['max'].gt(bottom_lda_threshold)
    & updated_df['merged_POS'].str.len().gt(minimal_paragraph_length)
]

In [120]:
# Group the filtered paragraphs by city pair; the aggregation loop iterates over these groups.
grouped_df = temp_df.groupby('city_pair')

In [124]:
# Header row of the per-city-pair summary table; the data rows are appended after it.
nested_list2 = [[
    'city_pair',
    'paragraphs',
    'lemmatised_paragraph_length',
    'same_categorisation_raw',
    'same_categorisation_percentage',
    'lda_dominant_category',
    'embedding_dominant_category',
    # One lda_<topic> / embedding_<topic> column pair per topic, in alphabetical topic order.
    *(
        f'{method}_{topic}'
        for topic in ('art', 'diplomacy', 'entertainment', 'sport', 'transportation')
        for method in ('lda', 'embedding')
    ),
]]

In [122]:
# Inspect the paragraphs of one city pair (the first group). The group is taken directly
# from `grouped_df` so that this cell also works on a fresh kernel, instead of relying on
# a loop variable that only exists after a later cell has been run.
next(iter(grouped_df))[1].head(10)

Unnamed: 0,paragraph_id,city_pair,paragraph,merged_POS,Sport,Art,Diplomacy,Entertainment,Transportation,Education,idxmax,max,art,diplomacy,entertainment,sport,transportation,outcome
38,39,berlin_milan,she received first public attention while stil...,"[attention, year, best, bronze, medal, winner,...",0.956612,0.001135,0.020074,0.019908,0.001004,0.001268,Sport,0.956612,0.0,0.270101,0.402315,7.37086,0.0,sport
52,53,berlin_milan,green's prominence grew in 1930 as he set a ne...,"[prominence, world, record, km, champion, stri...",0.952881,0.000861,0.001007,0.000881,0.043408,0.000962,Sport,0.952881,0.0,1.333066,0.0,7.911461,1.976946,sport
61,62,berlin_milan,"simon biwott (born 3 march 1970 in eldoret, ua...","[march, distance, runner, medal, man, marathon...",0.903493,0.001763,0.002063,0.001805,0.001559,0.089317,Sport,0.903493,0.0,0.0,0.0,4.033192,0.0,sport
87,88,berlin_milan,"in 1920, he tied for 2-4th in canterbury, took...","[canterbury, open, tie, take, tie, tie, tie, t...",0.978071,0.004221,0.004939,0.004321,0.003732,0.004715,Sport,0.978071,0.0,0.0,0.0,1.073657,0.0,sport
93,94,berlin_milan,as the 1969 dutch champions feijenoord partici...,"[champion, aggregate, round, team, leg, defici...",0.984538,0.002976,0.003483,0.003047,0.002632,0.003325,Sport,0.984538,0.0,0.0,0.0,2.194695,0.0,sport
98,99,berlin_milan,following league meetings in providence in lat...,"[league, meeting, providence, franchise, lineu...",0.987079,0.002487,0.00291,0.002546,0.002199,0.002779,Sport,0.987079,0.0,0.345374,0.663337,2.163617,0.0,sport
108,109,berlin_milan,"in 1999–2000, hertha were berlin's first-ever ...","[representative, group, stage, expense, group,...",0.972273,0.005337,0.006245,0.005463,0.004719,0.005962,Sport,0.972273,0.0,0.315745,0.698827,0.881382,0.0,sport
118,119,berlin_milan,"sometime in 1903, poddubny joined the saint pe...","[club, world, championship, world, champion, w...",0.985263,0.002837,0.003319,0.002904,0.002508,0.003169,Sport,0.985263,0.0,0.0,0.396422,3.788687,0.0,sport
122,123,berlin_milan,"following the contract termination, several cl...","[contract, termination, club, interest, follow...",0.962306,0.007255,0.00849,0.007427,0.006416,0.008106,Sport,0.962306,0.0,0.0,0.295362,0.326886,0.0,sport
128,129,berlin_milan,"in 1902, he tied for 16–19th in hannover (13th...","[barman, tourn, match, tie, win, tie, win, sha...",0.981864,0.003491,0.004085,0.003574,0.003087,0.0039,Sport,0.981864,0.0,0.0,0.0,4.212601,0.0,sport


In [126]:
from IPython.display import display

# Aggregate the paragraph-level classifications into one summary row per city pair:
# paragraph count, mean lemmatised length, agreement between the two classifications,
# the dominant category of each, and how many paragraphs each assigned to every topic.
categories = topics

# Keep only the header row so that re-running this cell does not append duplicate rows.
del nested_list2[1:]

for city_pair, sub_df in tqdm(grouped_df):
    paragraph_count = sub_df['paragraph'].count()
    same_categorisation_raw = sub_df['same_categorisation'].sum()
    same_categorisation_percentage = sub_df['same_categorisation'].sum()/sub_df['same_categorisation'].count()
    lemmatised_paragraph_len = sub_df['merged_POS'].str.len().mean()

    # Number of paragraphs per predicted category, for each of the two classifications.
    lda_counts = sub_df['idxmax'].value_counts()
    embedding_counts = sub_df['outcome'].value_counts()

    lda_dominant_category = lda_counts.idxmax()
    embedding_dominant_category = embedding_counts.idxmax()

    lda_prediction = lda_counts.to_dict()
    embedding_prediction = embedding_counts.to_dict()

    # value_counts() only lists categories that occur in this group; give every other
    # topic an explicit count of 0 (`idxmax` labels are capitalised, `outcome` labels are not).
    for category in categories:
        lda_prediction.setdefault(category.capitalize(), 0)
        embedding_prediction.setdefault(category, 0)

    # Same order as the header row in nested_list2[0].
    temp_l2 = [city_pair, paragraph_count, lemmatised_paragraph_len, same_categorisation_raw, same_categorisation_percentage,
               lda_dominant_category, embedding_dominant_category, lda_prediction['Art'], embedding_prediction['art'], lda_prediction['Diplomacy'],
               embedding_prediction['diplomacy'],
               lda_prediction['Entertainment'], embedding_prediction['entertainment'], lda_prediction['Sport'],
               embedding_prediction['sport'], lda_prediction['Transportation'], embedding_prediction['transportation']]
    if len(nested_list2[0]) != len(temp_l2):
        raise Exception('Not the same size!')

    nested_list2.append(temp_l2)

  0%|          | 0/6 [00:00<?, ?it/s]

In [130]:
# Build the city-pair summary table: the first entry of nested_list2 is the header row,
# the remaining entries are the data rows (one per city pair).
final_df = pd.DataFrame(nested_list2[1:],columns=nested_list2[0])
final_df#['embedding_dominant_category']

Unnamed: 0,city_pair,paragraphs,lemmatised_paragraph_length,same_categorisation_raw,same_categorisation_percentage,lda_dominant_category,embedding_dominant_category,lda_art,embedding_art,lda_diplomacy,embedding_diplomacy,lda_entertainment,embedding_entertainment,lda_sport,embedding_sport,lda_transportation,embedding_transportation
0,berlin_milan,34,29.235294,34,1.0,Sport,sport,0,0,0,0,0,0,34,34,0,0
1,london_berlin,188,34.87234,181,0.962766,Sport,sport,0,3,0,3,0,1,188,181,0,0
2,london_madrid,119,42.352941,117,0.983193,Sport,sport,0,0,0,0,0,2,119,117,0,0
3,london_milan,54,35.574074,53,0.981481,Sport,sport,0,1,0,0,0,0,54,53,0,0
4,madrid_berlin,61,39.606557,60,0.983607,Sport,sport,0,0,0,1,0,0,61,60,0,0
5,madrid_milan,34,47.823529,33,0.970588,Sport,sport,0,0,0,0,0,1,34,33,0,0
