In [152]:
from preprocessing_functions import *
from word_embedding_functions import *

import pandas as pd
import numpy as np
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')

### Categorize_text algorithm

In [147]:
def categorize_text(lemmatized_wordlist, mean_vectors_dict, keywords, embeddings_dict, number_of_keywords=1, bottom_threshold=0.1, verbose1=False, verbose2=False, ambiguity_margin=0.05):
    """
    --> Function that scores a list of words against keyword categories and predicts the dominant category.

        Every word that has an embedding is compared with the keywords of each category
        (similarity = 1 - cosine(keyword_vector, word_vector), averaged over the keywords used).
        A word is kept for its most similar category only when that similarity is above
        `bottom_threshold` and the best category beats the third-ranked one by more than
        `ambiguity_margin`. The similarities of the kept words are summed per category.

        Parameters:
        -----------
            lemmatized_wordlist: List -> List of words.
            mean_vectors_dict: Dict -> keyword -> vector; looked up first for every keyword.
            keywords: nested list -> List of lists of keywords that represent categories (the first keyword names the category).
            embeddings_dict: Dict -> word -> vector; used for the words and as fallback for keywords missing in mean_vectors_dict.
            number_of_keywords: Int (default = 1) -> number of keywords to use from a category (setting it to 0 will use them all!)
            bottom_threshold: Float (default = 0.1) -> Lowest allowed similarity value between a word and dominant category.
            verbose1: Bool (default = False) -> Shows similarity calculations between a word and each keyword.
            verbose2: Bool (default = False) -> Shows similarity calculations between a word and each category.
            ambiguity_margin: Float (default = 0.05) -> Minimum lead of the best category over the third-ranked one.

        Returns:
        --------
            Dict with 'category_similarities' (list of (category, summed similarity) sorted by category name)
            and 'prediction' (category with the highest sum; the first category when no word was kept).

        NOTE(review): `cosine` is not defined in this cell; it is presumably scipy.spatial.distance.cosine
        brought in by the star imports -- confirm.
    """
    # A slice end of None keeps every keyword, which is what number_of_keywords=0 asks for.
    keywords_to_use = number_of_keywords if number_of_keywords else None
    similar_categories = []

    for word in lemmatized_wordlist:
        try:
            word_vector = embeddings_dict[word]
        except KeyError:
            # Words without an embedding cannot be scored, so they are skipped.
            continue

        if verbose1 or verbose2:
            print(f"word: \t\t'{word}'")

        closeness = []

        for category in keywords:
            used_keywords = category[:keywords_to_use]
            summed_similarity = 0
            for keyword in used_keywords:
                try:
                    keyword_vector = mean_vectors_dict[keyword]
                except KeyError:
                    # No mean vector for this keyword: fall back to its own embedding.
                    keyword_vector = embeddings_dict[keyword]

                value = 1 - cosine(keyword_vector, word_vector)
                summed_similarity += value

            normalized_similarity = summed_similarity/len(used_keywords)
            closeness.append((normalized_similarity, category[0]))

            if verbose1:
                print('===>', '\t\tcategory:', category[0], '\n\t\tsimilarity:', normalized_similarity, f"\n\t\tkeywords: {used_keywords}\n")

        similar_category = max(closeness)

        sortedcat = sorted(closeness, key=lambda item: item[0], reverse=True)
        if len(sortedcat) > 2:
            # The word is ambiguous when the best category barely beats the third-ranked one.
            allowed = (sortedcat[0][0] - ambiguity_margin) > sortedcat[2][0]
        else:
            # With fewer than three categories there is no third-ranked score to compare against.
            allowed = True

        if similar_category[0] > bottom_threshold and allowed:
            similar_categories.append((word, similar_category))
            if verbose2:
                print(f"choice: \tkept")
                print(f"\n==> \tcategory:, {similar_category[1]}, \n\tsimilarity score: {similar_category[0]}")
        elif verbose2:
            print(f"choice: \tdiscarded")
            reason = 'ambiguity' if not allowed else 'low similarity score'
            print(f"reasoning: \t{reason}")

        if verbose2:
            print("\nscores:\n\t----category----          ----score----")
            for i in sortedcat:
                print(f"\t{i[1]:<10s} \t\t{i[0]}")
            print()

            print('='*100)
            print('')

    # Start every category passed in `keywords` at zero so each one appears in the result.
    categories_dict = {category[0]: 0 for category in keywords}

    for _, (similarity, category_name) in similar_categories:
        categories_dict[category_name] += similarity

    nonsorted_results = list(sorted(categories_dict.items(), key=lambda item: item[0], reverse=False))
    results = list(sorted(categories_dict.items(), key=lambda item: item[1], reverse=True))

    if verbose2:
        print('\n')
        pprint(similar_categories)

    prediction_dict = {'category_similarities': nonsorted_results, 'prediction': results[0][0]}

    return prediction_dict

### Old Keywords

In [148]:
# Hand-picked seed keywords, one list per category.
conflict = ["war", "fight", "death", "hostility", "bomb", "battle", "nazi", "army"]
politics = ["politics", "debate", "election", "government", "democracy"]
fashion = ["fashion", "model", "magazine", "walk", "glamour", "outfit"]
culture = ["culture", "opera", "festival", "collection", "exhibition", "art", "museum"]
sports = ["sport", "medal", "game", "championship", "club", "score", "play"]
education = ["education", "professor", "study", "research", "university"]

### New Keywords based on lda topics

In [150]:
# Rows of a LaTeX table: rank & column 1 & column 2 & column 3, each row closed by a backslash.
s = """1 & Study           & Railway           & Final                    \\
2 & School          & Route             & Win                \\
3 & Professor       & Line              & Team               \\
4 & University      & Operate           & Match               \\
5 & Graduate        & Flight            & Game              \\
6 & Bear            & Station           & Goal          \\
7 & Degree          & Service           & Club           \\
8 & Research        & Airline           & League          \\
9 & College         & Airport           & Champion            \\
10 & Teach           & Train             & Championship       """

# Split into cells, then strip the rank digits, the row terminators and the padding.
s2 = s.split('&')
s3 = [''.join(char for char in cell if not char.isdigit()) for cell in s2]
s4 = [cell.replace('\\\n', '') for cell in s3]
s5 = [cell.strip() for cell in s4]
# The first cell only held the rank of the first row and is empty by now.
del s5[0]

# Cells are stored row by row with three per row; offset 1 selects the middle column.
l = s5[1:30:3]

if len(l) == 10:
    print(l)

['Railway', 'Route', 'Line', 'Operate', 'Flight', 'Station', 'Service', 'Airline', 'Airport', 'Train']


In [82]:
# Ten words per LDA topic, keyed by the topic name.
lda_topic_words = {
    "diplomacy": ["War", "Embassy", "Army", "Diplomatic", "Ambassador", "Treaty", "Protest", "Force", "Mission", "Arrest"],
    "entertainment": ["Opera", "Festival", "Perform", "Orchestra", "Symphony", "Concert", "Music", "Film", "Sing", "Theatre"],
    "art": ["Exhibition", "Art", "Museum", "Gallery", "Exhibit", "Painting", "Collection", "Paint", "Portrait", "Artist"],
    "education": ["Study", "School", "Professor", "University", "Graduate", "Bear", "Degree", "Research", "College", "Teach"],
    "transportation": ["Railway", "Route", "Line", "Operate", "Flight", "Station", "Service", "Airline", "Airport", "Train"],
    "sport": ["Final", "Win", "Team", "Match", "Game", "Goal", "Club", "League", "Champion", "Championship"],
}

# Per-topic names that refer to the very same list objects as the dict values.
diplomacy_words = lda_topic_words["diplomacy"]
entertainment_words = lda_topic_words["entertainment"]
art_words = lda_topic_words["art"]
education_words = lda_topic_words["education"]
transportation_words = lda_topic_words["transportation"]
sport_words = lda_topic_words["sport"]

In [14]:
# # Single keywords
# diplomacy = ['diplomacy']
# entertainment = ['entertainment']
# art = ['art']
# education = ['education']
# transportation = ['transportation']
# sport = ['sport']

# descriptive_keywords = [diplomacy, entertainment, art, education, transportation, sport]

### Get Mean of the words from each category

In [83]:
# Show every topic together with its word list.
for topic_name, topic_words in lda_topic_words.items():
    print(topic_name, topic_words)

diplomacy ['War', 'Embassy', 'Army', 'Diplomatic', 'Ambassador', 'Treaty', 'Protest', 'Force', 'Mission', 'Arrest']
entertainment ['Opera', 'Festival', 'Perform', 'Orchestra', 'Symphony', 'Concert', 'Music', 'Film', 'Sing', 'Theatre']
art ['Exhibition', 'Art', 'Museum', 'Gallery', 'Exhibit', 'Painting', 'Collection', 'Paint', 'Portrait', 'Artist']
education ['Study', 'School', 'Professor', 'University', 'Graduate', 'Bear', 'Degree', 'Research', 'College', 'Teach']
transportation ['Railway', 'Route', 'Line', 'Operate', 'Flight', 'Station', 'Service', 'Airline', 'Airport', 'Train']
sport ['Final', 'Win', 'Team', 'Match', 'Game', 'Goal', 'Club', 'League', 'Champion', 'Championship']


In [71]:
# Only `word in embeddings_dict.keys()` and `embeddings_dict[word]` are used here,
# so Word2vec vectors should be interchangeable with GloVe ones -- TODO confirm.

import numpy as np

mean_vectors_dict = {}
for topic, topic_words in lda_topic_words.items():
    # Words the embedding model has never seen have no vector and are left out of the mean.
    known_vectors = [embeddings_dict[word] for word in topic_words if word in embeddings_dict.keys()]
    mean_vectors_dict[topic] = np.mean(known_vectors, axis=0)

- Maybe I should sum the vectors (word1 + word2 + word3 + ...) of the top 5 most relevant words of a topic to find the cluster center?
- Maybe use the mean embedding of multiple words? (from jveerbeek's DM page)

### Loading GloVe Word Embedding

In [153]:
%%time 

# Load the GloVe vectors into `embeddings_dict`, which other cells index by word.
# NOTE(review): presumably a word -> vector mapping; the loader is defined outside this notebook -- confirm.
# NOTE(review): the deep relative path only resolves from this notebook's location; consider a configurable path.
embeddings_dict = load_glove_word_embeddings(GLOVE_PATH="../../../../../glove.840B.300d.txt")

This will take approximately ~ 4 minutes...


  0%|          | 0/2196017 [00:00<?, ?it/s]

. ['.', '.', '-0.1573', '-0.29517']
at ['name@domain.com', '0.0061218', '0.39595', '-0.22079']
. ['.', '.', '.', '.']
to ['name@domain.com', '0.33865', '0.12698', '-0.16885']
. ['.', '0.035974', '-0.024421', '0.71402']
. ['.', '.', '.', '0.033459']
email ['name@domain.com', '0.33529', '0.32949', '0.2646']
or ['name@domain.com', '0.48374', '0.49669', '-0.25089']
contact ['name@domain.com', '0.016426', '0.13728', '0.18781']
Email ['name@domain.com', '0.37344', '0.024573', '-0.12583']
on ['name@domain.com', '0.037295', '-0.15381', '-0.045189']
At ['Killerseats.com', '-0.13854', '-0.01706', '-0.13651']
by ['name@domain.com', '0.6882', '-0.36436', '0.62079']
in ['mylot.com', '-0.18148', '0.47096', '0.32916']
emailing ['name@domain.com', '0.39173', '-0.39132', '-0.4266']
Contact ['name@domain.com', '0.14933', '-0.28605', '0.3444']
at ['name@domain.com', '0.44321', '-0.40005', '-0.20065']
• ['name@domain.com', '-0.13288', '-0.31383', '-0.032356']
at ['Amazon.com', '-0.5275', '-0.73685', '0.10

In [None]:
# INPUT_DIR = "../../../../../data/clean/city_pair_paragraphs3/"
# BATCHES = [5]
# POS = ["NOUN", "VERB", "ADJ"]
# ONLY_ENGLISH_WORDS = True
# sort_by_paragraphs_count = True
# merged_POS = True

# data_list = import_lemmatised_paragraphs(INPUT_DIR, POS, BATCHES, ONLY_ENGLISH_WORDS=ONLY_ENGLISH_WORDS, merged_POS=merged_POS, sort_by_paragraphs=sort_by_paragraphs_count)

In [None]:
# frames = [citypair['lemmatized_paragraphs'] for citypair in data_list]
# citypairs = [citypair['city_pair'] for citypair in data_list]

# result = pd.concat(frames) #, keys=citypairs)
# result.set_index('paragraph_id', inplace=True)
# result.sort_index(inplace=True)
# result.reset_index(inplace=True)
# result

### Convert type of "merged_POS" from string to list

In [240]:
from ast import literal_eval

# 'merged_POS' is stored in the CSV as text; literal_eval turns every cell back into a Python object.
df = pd.read_csv("classified_435_citypairs_311k_paragraphs.csv").assign(
    merged_POS=lambda frame: frame['merged_POS'].map(literal_eval)
)

In [261]:
# high_confidence_df = df[df['idxmax'] == 'Sport']  # df[(df['max'] > 0.8) & (df['max'] < 0.85) & (df['idxmax'] == 'Sport')]
# high_confidence_sample = high_confidence_df
# high_confidence_sample

Unnamed: 0,paragraph_id,city_pair,paragraph,merged_POS,Sport,Art,Diplomacy,Entertainment,Transportation,Education,idxmax,max
19,20,berlin_milan,"he signed with hc milan in 1991 and in 1992, c...","[game, year, country, flight, capital, year, s...",0.797217,0.003704,0.004335,0.003792,0.186813,0.004139,Sport,0.797217
38,39,berlin_milan,she received first public attention while stil...,"[attention, year, best, bronze, medal, winner,...",0.956612,0.001135,0.020074,0.019908,0.001004,0.001268,Sport,0.956612
52,53,berlin_milan,green's prominence grew in 1930 as he set a ne...,"[prominence, world, record, km, champion, stri...",0.952881,0.000861,0.001007,0.000881,0.043408,0.000962,Sport,0.952881
56,57,berlin_milan,in total he was the referee in 82 internationa...,"[total, referee, match, international, goal, g...",0.722303,0.001365,0.249650,0.001398,0.001207,0.024076,Sport,0.722303
61,62,berlin_milan,"simon biwott (born 3 march 1970 in eldoret, ua...","[march, distance, runner, medal, man, marathon...",0.903493,0.001763,0.002063,0.001805,0.001559,0.089317,Sport,0.903493
...,...,...,...,...,...,...,...,...,...,...,...,...
310810,312669,warsaw_valencia,it would take almost 20 years before elfsborg ...,"[year, cup, period, club, history, layer, cup,...",0.995711,0.000826,0.000966,0.000845,0.000730,0.000922,Sport,0.995711
310814,312673,warsaw_valencia,"on 16 september 2020, valencia returned to pol...","[loan, end, season, season, plethora, injury, ...",0.987588,0.002389,0.002796,0.002446,0.002112,0.002669,Sport,0.987588
310816,312675,warsaw_valencia,"in the 2000–01 season, the team finished third...","[season, team, polish, league, cup, quarterfin...",0.928442,0.001872,0.064023,0.001916,0.001655,0.002091,Sport,0.928442
310817,312676,warsaw_valencia,it would take almost 20 years before elfsborg ...,"[year, cup, period, club, history, layer, cup,...",0.995711,0.000826,0.000966,0.000845,0.000730,0.000922,Sport,0.995711


### Word Embedding Algorithm

In [112]:
# high_confidence_sample.merged_POS.iloc[2]

In [193]:
# Number of counted predictions, the raw counts, and the share predicted as 'sport'.
total_predictions = sum(predictions.values())
print(total_predictions)
print(predictions)
predictions['sport'] / total_predictions

1465
{'sport': 1445, 'entertainment': 5, 'transportation': 1, 'education': 13, 'art': 1}


0.9863481228668942

In [199]:
# categories = ['art', 'diplomacy', 'education', 'entertainment', 'sport', 'transportation']
# for category in categories:
#     df[category] = None

In [201]:
# df['outcome'] = None
# df.head()

Unnamed: 0,paragraph_id,city_pair,paragraph,merged_POS,Sport,Art,Diplomacy,Entertainment,Transportation,Education,idxmax,max,diplomacy,entertainment,education,art,transportation,sport,outcome
0,1,berlin_milan,"after his tenure in academia, he continued to ...","[tenure, academia, month, year, travel, incide...",0.000765,0.001501,0.770121,0.001536,0.001327,0.22475,Diplomacy,0.770121,,,,,,,
1,2,berlin_milan,one of the astronomers selected for the search...,"[astronomer, search, priest, invitation, group...",0.195772,0.498009,0.302968,0.0011,0.00095,0.001201,Art,0.498009,,,,,,,
2,3,berlin_milan,there are plenty of air connections between ye...,"[plenty, air, connection, city, connection, ci...",0.000873,0.001713,0.002005,0.001754,0.991741,0.001914,Transportation,0.991741,,,,,,,
3,4,berlin_milan,"since 2009, 'the brandery', an urban fashion s...","[fashion, year, language, monitor, ranking, wo...",0.383249,0.602904,0.003862,0.003379,0.002918,0.003687,Art,0.602904,,,,,,,
4,5,berlin_milan,when considering the commuter belts or metropo...,"[commuter, belt, area, datum, population, orde...",0.002721,0.005337,0.094386,0.005463,0.88613,0.005962,Transportation,0.88613,,,,,,,


In [None]:
predictions = {}
i = 0
%time
l = [['index', 'art', 'diplomacy', 'education', 'entertainment', 'sport', 'transportation', 'outcome']]
for idx, row in tqdm(high_confidence_sample['merged_POS'].iteritems(), total=len(high_confidence_sample['merged_POS'])):
    output = categorize_text(lemmatized_wordlist=row, mean_vectors_dict=mean_vectors_dict, keywords=[['diplomacy'], ['entertainment'], ['education'], ['art'], ['transportation'], ['sport']], embeddings_dict=embeddings_dict, number_of_keywords=1, bottom_threshold=0.25, verbose1=False, verbose2=False)

    prediction = output['prediction']
    results = output['category_similarities']
    temp_l = [idx] +[result[1] for result in results] + [prediction]
    l.append(temp_l)
    
    
    # print(results)
#     for category, value in results:
#         row[category] = value
#         #print(category, value)
#     #print(results)
# #     # print(results)
# #     if False:
# #         if prediction != 'sport':
# #             if results[1] != 'sport':
# #                 print(results[0], results[1], row.paragraph)
# #     #         print(prediction, '\n', row.paragraph, '\n')

    if (results[0][1] - results[1][1]) > 0.10:
        if prediction not in predictions.keys():
            predictions[prediction] = 1
        else:
            predictions[prediction] +=1
            
df4 = pd.DataFrame(l[1:],columns=l[0]).set_index('index')

CPU times: total: 0 ns
Wall time: 0 ns


  0%|          | 0/32121 [00:00<?, ?it/s]

In [274]:
# Number of paragraphs per predicted category (the 'sport' share would be 31218 / df2['outcome'].count()).
# NOTE(review): `df2` is not created in any visible cell -- confirm where it comes from.
df2.loc[:, 'outcome'].value_counts()

sport             31218
education           432
entertainment       222
transportation      139
diplomacy            82
art                  28
Name: outcome, dtype: int64

In [255]:
# Attach the columns of df2 to df by index (left join, the default of DataFrame.join).
df3 = df.join(df2, how='left')

In [None]:
import pandas as pd

In [8]:
# Paragraphs together with the results of both classification methods.
df = pd.read_csv(filepath_or_buffer="classified_435_citypairs_311k_paragraphs_both_methods.csv")

In [9]:
from ast import literal_eval

# 'merged_POS' comes out of the CSV as text; parse every cell back into a Python object.
# Run once per load: literal_eval only accepts the unparsed strings.
df['merged_POS'] = df['merged_POS'].map(literal_eval)

In [None]:
# value_counts(bin=3)
# value_counts(bins=[0, 0.2, 0.6, 1])
# value_counts(normalize=True)

In [19]:
# Paragraphs whose lemmatised word list is empty.
df.loc[df['merged_POS'].str.len().eq(0)]

Unnamed: 0,paragraph_id,city_pair,paragraph,merged_POS,Sport,Art,Diplomacy,Entertainment,Transportation,Education,idxmax,max,art,diplomacy,education,entertainment,sport,transportation,outcome,same_categorisation
539,543,berlin_milan,"athens (greece), bangkok (thailand), berlin (g...",[],0.013162,0.025815,0.030210,0.026428,0.449186,0.455199,Education,0.455199,0.0,0.0,0.0,0.0,0.0,0.0,diplomacy,False
3835,3853,london_berlin,"35 – 'wall street journal' (us): baghdad, bang...",[],0.013162,0.452173,0.030210,0.452786,0.022827,0.028841,Entertainment,0.452786,0.0,0.0,0.0,0.0,0.0,0.0,diplomacy,False
3836,3854,london_berlin,"24 – 'new york times' (us): baghdad, beijing, ...",[],0.439520,0.452173,0.030210,0.026428,0.022827,0.028841,Art,0.452173,0.0,0.0,0.0,0.0,0.0,0.0,diplomacy,False
3837,3855,london_berlin,"17 – 'washington post' (us): baghdad, beijing,...",[],0.013162,0.025815,0.456568,0.026428,0.022827,0.455199,Diplomacy,0.456568,0.0,0.0,0.0,0.0,0.0,0.0,diplomacy,True
8322,8369,london_madrid,"barcelona is behind london, new york, paris, m...",[],0.013162,0.452173,0.030210,0.026428,0.022827,0.455199,Education,0.455199,0.0,0.0,0.0,0.0,0.0,0.0,diplomacy,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303346,305170,vienna_copenhagen,"united kingdom, malaysia, singapore, helsinki,...",[],0.013162,0.452173,0.456568,0.026428,0.022827,0.028841,Diplomacy,0.456568,0.0,0.0,0.0,0.0,0.0,0.0,diplomacy,True
304480,306310,vienna_frankfurt,"athens (greece), bangkok (thailand), berlin (g...",[],0.013162,0.025815,0.030210,0.026428,0.449186,0.455199,Education,0.455199,0.0,0.0,0.0,0.0,0.0,0.0,diplomacy,False
306397,308234,vienna_prague,"salzburg, vienna, bonn, helsinki, istanbul, at...",[],0.013162,0.452173,0.030210,0.452786,0.022827,0.028841,Entertainment,0.452786,0.0,0.0,0.0,0.0,0.0,0.0,diplomacy,False
306864,308707,vienna_prague,"vienna, czech republic (prague), germany (augs...",[],0.439520,0.025815,0.030210,0.026428,0.022827,0.455199,Education,0.455199,0.0,0.0,0.0,0.0,0.0,0.0,diplomacy,False


### Number of lemmatised words in paragraph

In [31]:
# Word-list lengths counted in bins of 20 words, from 0 up to 200.
df['merged_POS'].str.len().value_counts(bins=list(range(0, 201, 20))).sort_index().head(50)

(-0.001, 20.0]    102021
(20.0, 40.0]      117620
(40.0, 60.0]       55029
(60.0, 80.0]       21332
(80.0, 100.0]       8646
(100.0, 120.0]      3190
(120.0, 140.0]      1573
(140.0, 160.0]       670
(160.0, 180.0]       307
(180.0, 200.0]       169
Name: merged_POS, dtype: int64

### Paragraphs with dominant category over 0.5

In [165]:
# Rows whose 'max' score is above 0.7.
df.loc[df['max'].gt(0.7)]

Unnamed: 0,paragraph_id,city_pair,paragraph,merged_POS,Sport,Art,Diplomacy,Entertainment,Transportation,Education,idxmax,max,art,diplomacy,education,entertainment,sport,transportation,outcome,same_categorisation
0,1,berlin_milan,"after his tenure in academia, he continued to ...","[tenure, academia, month, year, travel, incide...",0.000765,0.001501,0.770121,0.001536,0.001327,0.224750,Diplomacy,0.770121,0.169194,1.928616,2.025882,0.309913,0.591317,1.040785,education,False
2,3,berlin_milan,there are plenty of air connections between ye...,"[plenty, air, connection, city, connection, ci...",0.000873,0.001713,0.002005,0.001754,0.991741,0.001914,Transportation,0.991741,0.000000,0.510214,0.640919,0.000000,0.477014,4.445736,transportation,True
4,5,berlin_milan,when considering the commuter belts or metropo...,"[commuter, belt, area, datum, population, orde...",0.002721,0.005337,0.094386,0.005463,0.886130,0.005962,Transportation,0.886130,0.000000,0.000000,0.371927,0.119219,0.000000,0.950457,transportation,True
5,6,berlin_milan,the eu contains about 40 urban areas with popu...,"[area, population, population, area, megacity,...",0.002721,0.005337,0.094386,0.005463,0.886130,0.005962,Transportation,0.886130,0.000000,0.000000,0.449584,0.000000,0.000000,0.915384,transportation,True
7,8,berlin_milan,"in the spring of 1875, lister along with agnes...","[spring, lister, sister, law, niece, group, we...",0.000601,0.040131,0.858321,0.040159,0.001043,0.059745,Diplomacy,0.858321,0.000000,0.190400,2.591743,1.868725,1.087525,1.019374,education,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310822,312681,warsaw_valencia,"as a concert singer, mikolaj had a broad reper...","[concert, singer, repertoire, work, symphony, ...",0.000985,0.193347,0.002260,0.799541,0.001708,0.002158,Entertainment,0.799541,0.578576,0.109004,2.216915,4.158394,0.000000,0.205921,entertainment,True
310823,312682,warsaw_valencia,"since 2010, edelmann has intensified his conce...","[concert, activity, guest, performance, recita...",0.001889,0.003704,0.004335,0.982658,0.003276,0.004139,Entertainment,0.982658,0.000000,0.099530,0.164835,3.578136,0.000000,0.000000,entertainment,True
310824,312683,warsaw_valencia,"born maria mokrzycka in lviv (now ukraine), sh...","[conservatory, debut, opera, opera, time, bari...",0.000729,0.001430,0.001673,0.993306,0.001264,0.001597,Entertainment,0.993306,0.064806,0.000000,2.403735,6.039672,0.170935,0.479676,entertainment,True
310825,312684,warsaw_valencia,ayala was elected a fellow of the american aca...,"[fellow, art, science, member, society, member...",0.001683,0.003300,0.003862,0.003379,0.002918,0.984857,Education,0.984857,1.120648,0.476482,4.250563,0.000000,0.000000,0.109702,education,True


In [59]:
# Number of rows with 'max' above 0.5 and more than 20 lemmatised words.
df.loc[df['max'].gt(0.5) & df['merged_POS'].str.len().gt(20), 'same_categorisation'].count()

195836

### 'Accuracy' (raw count)

In [162]:
# Per 'idxmax' value: how often both methods agree, for rows with 'max' above 0.5 and a non-empty word list.
df.loc[df['max'].gt(0.5) & df['merged_POS'].str.len().gt(0.5)].groupby('idxmax')['same_categorisation'].value_counts()

idxmax          same_categorisation
Art             True                   30849
                False                  18734
Diplomacy       True                   33401
                False                  19951
Education       True                   46536
                False                   5001
Entertainment   True                   57046
                False                   5230
Sport           True                   30663
                False                    729
Transportation  True                   34930
                False                  10297
Name: same_categorisation, dtype: int64

### 'Accuracy' Normalized

In [168]:
# Same agreement breakdown as proportions, for rows with 'max' above 0.9 and more than 30 words.
df.loc[df['max'].gt(0.9) & df['merged_POS'].str.len().gt(30)].groupby('idxmax')['same_categorisation'].value_counts(normalize=True)

idxmax          same_categorisation
Art             True                   0.814325
                False                  0.185675
Diplomacy       True                   0.846125
                False                  0.153875
Education       True                   0.994770
                False                  0.005230
Entertainment   True                   0.986964
                False                  0.013036
Sport           True                   0.999657
                False                  0.000343
Transportation  True                   0.912828
                False                  0.087172
Name: same_categorisation, dtype: float64

### Prediction of LDA 'Art' topic paragraphs

In [134]:
# 'outcome' counts for rows with idxmax 'Art': all of them, then only those with 'max' above 0.8.
print(df.loc[df['idxmax'].eq('Art') & df['max'].gt(0), 'outcome'].value_counts())
df.loc[df['idxmax'].eq('Art') & df['max'].gt(0.8), 'outcome'].value_counts()

art               31340
education         13171
entertainment      3100
sport              2009
transportation     1742
diplomacy          1359
Name: outcome, dtype: int64


art               22715
education          4923
sport               784
entertainment       723
transportation      651
diplomacy           425
Name: outcome, dtype: int64

In [131]:
# 'outcome' proportions for rows with idxmax 'Art': all of them, then only those with 'max' above 0.8.
print(df.loc[df['idxmax'].eq('Art') & df['max'].gt(0), 'outcome'].value_counts(normalize=True))
df.loc[df['idxmax'].eq('Art') & df['max'].gt(0.8), 'outcome'].value_counts(normalize=True)

art               0.594450
education         0.249825
entertainment     0.058800
sport             0.038106
transportation    0.033042
diplomacy         0.025777
Name: outcome, dtype: float64


art               0.751630
education         0.162900
sport             0.025942
entertainment     0.023924
transportation    0.021541
diplomacy         0.014063
Name: outcome, dtype: float64

### Prediction of LDA 'Diplomacy' topic paragraphs

In [145]:
# 'outcome' counts for rows with idxmax 'Diplomacy': all of them, then only those with 'max' above 0.8.
print(df.loc[df['idxmax'].eq('Diplomacy') & df['max'].gt(0), 'outcome'].value_counts())
df.loc[df['idxmax'].eq('Diplomacy') & df['max'].gt(0.8), 'outcome'].value_counts()

diplomacy         34216
education         15184
transportation     3298
sport              2489
entertainment      1904
art                 737
Name: outcome, dtype: int64


diplomacy         26385
education          5467
transportation     1286
sport               871
entertainment       181
art                  26
Name: outcome, dtype: int64

In [144]:
# 'outcome' proportions for rows with idxmax 'Diplomacy': all of them, then only those with 'max' above 0.8.
print(df.loc[df['idxmax'].eq('Diplomacy') & df['max'].gt(0), 'outcome'].value_counts(normalize=True))
df.loc[df['idxmax'].eq('Diplomacy') & df['max'].gt(0.8), 'outcome'].value_counts(normalize=True)

diplomacy         0.591686
education         0.262572
transportation    0.057031
sport             0.043041
entertainment     0.032925
art               0.012745
Name: outcome, dtype: float64


diplomacy         0.771130
education         0.159779
transportation    0.037585
sport             0.025456
entertainment     0.005290
art               0.000760
Name: outcome, dtype: float64

### Prediction of LDA 'Education' topic paragraphs

In [128]:
# 'outcome' counts for rows with idxmax 'Education': all of them, then only those with 'max' above 0.8.
print(df.loc[df['idxmax'].eq('Education') & df['max'].gt(0), 'outcome'].value_counts())
df.loc[df['idxmax'].eq('Education') & df['max'].gt(0.8), 'outcome'].value_counts()

education         49515
entertainment      2010
art                1820
diplomacy          1280
sport               623
transportation      536
Name: outcome, dtype: int64


education         28938
diplomacy           514
art                 415
entertainment       347
transportation      165
sport               140
Name: outcome, dtype: int64

In [127]:
# 'outcome' proportions for rows with idxmax 'Education': all of them, then only those with 'max' above 0.8.
print(df.loc[df['idxmax'].eq('Education') & df['max'].gt(0), 'outcome'].value_counts(normalize=True))
df.loc[df['idxmax'].eq('Education') & df['max'].gt(0.8), 'outcome'].value_counts(normalize=True)

education         0.887620
entertainment     0.036032
art               0.032626
diplomacy         0.022946
sport             0.011168
transportation    0.009608
Name: outcome, dtype: float64


education         0.948196
diplomacy         0.016842
art               0.013598
entertainment     0.011370
transportation    0.005406
sport             0.004587
Name: outcome, dtype: float64

### Prediction of LDA 'Entertainment' topic paragraphs

In [126]:
# 'outcome' counts for rows with idxmax 'Entertainment': all of them, then only those with 'max' above 0.8.
print(df.loc[df['idxmax'].eq('Entertainment') & df['max'].gt(0), 'outcome'].value_counts())
df.loc[df['idxmax'].eq('Entertainment') & df['max'].gt(0.8), 'outcome'].value_counts()

entertainment     58958
education          3389
sport              2161
transportation      386
diplomacy           248
art                 224
Name: outcome, dtype: int64


entertainment     41350
sport               865
education           805
transportation      175
diplomacy            49
art                  48
Name: outcome, dtype: int64

In [125]:
# 'outcome' proportions for rows with idxmax 'Entertainment': all of them, then only those with 'max' above 0.8.
print(df.loc[df['idxmax'].eq('Entertainment') & df['max'].gt(0), 'outcome'].value_counts(normalize=True))
df.loc[df['idxmax'].eq('Entertainment') & df['max'].gt(0.8), 'outcome'].value_counts(normalize=True)

entertainment     0.901967
education         0.051847
sport             0.033060
transportation    0.005905
diplomacy         0.003794
art               0.003427
Name: outcome, dtype: float64


entertainment     0.955142
sport             0.019981
education         0.018595
transportation    0.004042
diplomacy         0.001132
art               0.001109
Name: outcome, dtype: float64

### Prediction of LDA 'Sport' topic paragraphs

In [121]:
# 'outcome' counts for rows with idxmax 'Sport': all of them, then only those with 'max' above 0.8.
print(df.loc[df['idxmax'].eq('Sport') & df['max'].gt(0), 'outcome'].value_counts())
df.loc[df['idxmax'].eq('Sport') & df['max'].gt(0.8), 'outcome'].value_counts()

sport             31218
education           432
entertainment       222
transportation      139
diplomacy            82
art                  28
Name: outcome, dtype: int64


sport             25355
education           128
entertainment        77
transportation       50
diplomacy            35
art                  17
Name: outcome, dtype: int64

In [124]:
# 'outcome' proportions for rows with idxmax 'Sport': all of them, then only those with 'max' above 0.8.
print(df.loc[df['idxmax'].eq('Sport') & df['max'].gt(0), 'outcome'].value_counts(normalize=True))
df.loc[df['idxmax'].eq('Sport') & df['max'].gt(0.8), 'outcome'].value_counts(normalize=True)

sport             0.971888
education         0.013449
entertainment     0.006911
transportation    0.004327
diplomacy         0.002553
art               0.000872
Name: outcome, dtype: float64


sport             0.988037
education         0.004988
entertainment     0.003001
transportation    0.001948
diplomacy         0.001364
art               0.000662
Name: outcome, dtype: float64

### Prediction of LDA 'Transportation' topic paragraphs

In [122]:
# 'outcome' counts for rows with idxmax 'Transportation': all of them, then only those with 'max' above 0.8.
print(df.loc[df['idxmax'].eq('Transportation') & df['max'].gt(0), 'outcome'].value_counts())
df.loc[df['idxmax'].eq('Transportation') & df['max'].gt(0.8), 'outcome'].value_counts()

transportation    35368
education          6752
diplomacy          2009
sport              1528
entertainment       825
art                 526
Name: outcome, dtype: int64


transportation    29330
education          3351
diplomacy          1017
sport               398
art                 166
entertainment       128
Name: outcome, dtype: int64

In [123]:
# 'outcome' proportions for rows with idxmax 'Transportation': all of them, then only those with 'max' above 0.8.
print(df.loc[df['idxmax'].eq('Transportation') & df['max'].gt(0), 'outcome'].value_counts(normalize=True))
df.loc[df['idxmax'].eq('Transportation') & df['max'].gt(0.8), 'outcome'].value_counts(normalize=True)

transportation    0.752383
education         0.143635
diplomacy         0.042737
sport             0.032505
entertainment     0.017550
art               0.011190
Name: outcome, dtype: float64


transportation    0.852864
education         0.097441
diplomacy         0.029573
sport             0.011573
art               0.004827
entertainment     0.003722
Name: outcome, dtype: float64

In [265]:
# Rows of df3 that have an 'outcome' value.
# NOTE(review): the saved output of this cell is a KeyError for 'outcome' -- check that df3 carries that column.
df3.loc[df3['outcome'].notna()]

KeyError: 'outcome'

In [None]:
# The call now uses the parameter names of categorize_text and passes its required arguments
# (the word list was previously passed as `words=`, which is not a parameter).
# NOTE(review): `lemmatised_words` is not defined in the visible part of the notebook -- bind it to a list of lemmas first.
# NOTE(review): the keyword lists mirror the ones used in the classification loop -- confirm they are the intended ones.
categorize_text(lemmatized_wordlist=lemmatised_words, mean_vectors_dict=mean_vectors_dict, keywords=[['diplomacy'], ['entertainment'], ['education'], ['art'], ['transportation'], ['sport']], embeddings_dict=embeddings_dict, number_of_keywords=1, bottom_threshold=0.1, verbose1=False, verbose2=False)

In [None]:
# categorize_group_of_texts 

# def categorize_group_of_texts(lemmatized_wordlists, keywords, embeddings_dict, number_of_keywords=1, bottom_threshold=0.1, verbose1=False, verbose2=False):
#     categories = {}
#     for index, lemmatized_wordlist in tqdm(enumerate(lemmatized_wordlists), total=len(lemmatized_wordlists)):
#         prediction_dict = categorize_text(lemmatized_wordlist=lemmatized_wordlist, keywords=keywords, embeddings_dict=embeddings_dict, number_of_keywords=number_of_keywords, bottom_threshold=bottom_threshold, verbose1=verbose1, verbose2=verbose2)
#         try:
#             categories[prediction_dict['prediction']] +=1
#         except:
#             categories[prediction_dict['prediction']] = 1
    
#     return categories

In [176]:
# Text of the seventh paragraph among the rows with 'max' below 0.4.
df.loc[df['max'].lt(0.4), 'paragraph'].iloc[6]

'cycling in cardiff is facilitated by its easy gradients and large parks. in 2005, 4.3% of people commuted to work by cycling, compared to 2% in london and 5% in berlin. however, cyclists in the city appear to be influenced by deterrents to cycling and as a result will need a greater level of improved facilities to increase cycling numbers, according to research by cardiff university.'

In [183]:
# Text of the 88th paragraph among the rows with idxmax 'Sport' and 'max' above 0.9.
df.loc[df['max'].gt(0.9) & df['idxmax'].eq('Sport'), 'paragraph'].iloc[87]

"in 2013, carter participated at the madrid invitational, breaking the meeting record at 9.87, also being his season best. at the moscow world championships, carter won the bronze medal in the 100m in 9.95, behind american justin gatlin (9.85) and jamaican teammate usain bolt (9.77). he won a third world championship relay gold, leading off the jamaican relay team, made up of carter, kemar bailey-cole, nickel ashmeade and bolt. the team won in 37.36 seconds. this win was jamaica's fifth consecutive major championship sprint relay gold, winning the olympics in beijing 2008 and london 2012, and the world titles in berlin 2009, daegu 2011 and moscow 2013."