In [None]:
import pandas as pd
from pprint import pprint
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import plotly.express as px
import re
from spacy import displacy
from spacy.symbols import NOUN, DET, ADJ
from matplotlib.pyplot import subplots


### Draw scatter plot

In [None]:
# Load the pickled word table (per its file name: 2-D t-SNE output, perplexity 100).
# NOTE(review): unpickling can execute arbitrary code - only load dumps you produced yourself.
df_processed = pd.read_pickle('./data/dumps/woorden_met_hetofde_tsne_2d_perplexity_100.pickle')
# Restrict the plot to rows whose determiner is exactly 'de' or 'het'.
df_only_de_en_het = df_processed[df_processed.det.isin(['de','het'])]

fig = px.scatter(
    df_only_de_en_het,
    x='2d_tsnse_x',
    y='2d_tsnse_y',
    color='det',
    # Only the word itself is enabled in the hover box; coordinates and determiner are switched off.
    hover_data={'2d_tsnse_x':False, 'woord':True, 'det' : False, '2d_tsnse_y':False},
    width=1500,
    height=1000,
)
fig.update_layout(
    # Centre the title horizontally and anchor it at the top.
    title=dict(text="TSNE projection of the words", xanchor='center', x=0.5, yanchor='top'),
    legend_title_text="De = red; Het = purple",
    xaxis_title="1st component",
    yaxis_title="2nd component",
    font={'family': "Courier New, monospace", 'size': 18, 'color': "#7f7f7f"},
)

# Disabled experiments / optional HTML export:
# fig.update_traces(hovertemplate='{name}') #
#fig.write_html("../samarpan-rai.github.io/_includes/2d_tsne_woorden_distributie.html",config = {'responsive': True})
fig.show()

## Extract some meaningful rules

### It seems that a word ending in -sme is always a het word. Let's verify that.

In [None]:
# Boolean mask: True for every word that ends with 'sme'.
# The vectorised string method replaces the per-row lambda; na=False makes a missing
# word count as "does not match" instead of raising inside the lambda.
mask = df_processed.woord.str.endswith('sme', na=False)
ending_with_sme = df_processed[mask]

In [None]:
# Count how many of the selected words carry each determiner.
ending_with_sme['det'].value_counts()

It seems that one of them does not follow the rule. Is it a genuine exception, or a typo in the data?

In [None]:
# Show the rows of the -sme selection whose determiner is 'de'.
ending_with_sme.query('det == "de"')

In [None]:
# Show the rows of the -sme selection whose determiner is 'het'.
ending_with_sme.query('det == "het"')

### It seems that a word ending in -huis is always a het word. Let's verify that.

In [None]:
# Boolean mask: True for every word that ends with 'huis'.
# The vectorised string method replaces the per-row lambda; na=False makes a missing
# word count as "does not match" instead of raising inside the lambda.
mask = df_processed.woord.str.endswith('huis', na=False)
ending_with_huis = df_processed[mask]

In [None]:
# Count how many of the selected words carry each determiner.
ending_with_huis['det'].value_counts()

In [None]:
# Show the rows of the -huis selection whose determiner is 'het'.
ending_with_huis.query('det == "het"')

## A compound word ending with a het word should always be a het word

In [None]:
# Disabled earlier attempt, kept for reference only.
# NOTE: it tests `woord in w`, i.e. the het word may occur anywhere inside the other
# word, not only at its end.
#
# compoundwords_ending_with_het_word = []
# # Extract all het words and find their corresponding set of compound words
# het_words =df_processed.query('det=="het"')
# print(f"Found {het_words.shape[0]} het words ")
# # For each word in that list, find the corresponding compound words
# het_word_vs_compound_word_result = []
# het_word_list = het_words.woord.tolist()

# for woord in het_word_list:

#     # The word must have at least two characters
#     if len(woord) > 1:
#         # Make sure that the word being compared is not itself.
#         mask = df_processed.woord.apply(lambda w : woord in w if (woord != w) else False )
#         ending_with_het_word = df_processed[mask]

#         result_obj = {
#             'woord' : woord,
#             'nr_compound_words_found' : ending_with_het_word.shape[0],
#             'de_het_distribution' : ending_with_het_word.det.value_counts().to_dict()
#         }

#         het_word_vs_compound_word_result.append(result_obj)
    

In [None]:
# NOTE(review): this list is never filled or read in the visible cells; kept in case another cell uses it.
compoundwords_ending_with_het_word = []
# Extract all het words and, for each of them, find the compound words in the corpus that end with it
het_words = df_processed.query('det=="het"')
print(f"Found {het_words.shape[0]} het words ")
# One summary record per het word: how many compounds exist and how they split over de / het
het_word_vs_compound_word_result = []
het_word_list = het_words.woord.tolist()

for woord in het_word_list:

    # The word must have at least two characters
    if len(woord) < 2:
        continue

    # Every het word except the one under test. The mask is built from het_words itself so it
    # shares the index of the Series it filters (the previous mask came from df_processed and
    # only worked through implicit reindexing).
    all_other_words = het_words.woord[het_words.woord != woord]
    # Append the word to every other het word that does not already end with it.
    # Words that already end with it yield no candidate at all; previously they yielded the
    # base word itself, so the base word could be counted as its own het compound.
    already_ending = all_other_words.str.endswith(woord)
    theoretical_compound_words = all_other_words[~already_ending] + woord
    # Keep only the theoretical compounds that really exist in the corpus
    ending_with_het_word = df_processed[df_processed.woord.isin(theoretical_compound_words)]

    result_obj = {
        'woord' : woord,
        'nr_compound_words_found' : ending_with_het_word.shape[0],
        'de_het_distribution' : ending_with_het_word.det.value_counts().to_dict()
    }

    het_word_vs_compound_word_result.append(result_obj)
    

In [None]:
# Turn the list of per-word records into a frame (one row per record, one column per key).
df_compoundwords_ending_with_het_word = pd.DataFrame(het_word_vs_compound_word_result)

In [None]:
# A word can occur several times in the het list; keep only its first record (modifies the frame in place).
df_compoundwords_ending_with_het_word.drop_duplicates(subset=['woord'], keep='first', inplace=True)

In [None]:
# (rows, columns) remaining after de-duplication.
df_compoundwords_ending_with_het_word.shape

In [None]:
# Unpack each determiner counter into two integer columns; a determiner that is absent counts as 0.
df_compoundwords_ending_with_het_word['nr_het_words'] = df_compoundwords_ending_with_het_word.de_het_distribution.map(
    lambda counts: counts.get('het', 0)
)
df_compoundwords_ending_with_het_word['nr_de_words'] = df_compoundwords_ending_with_het_word.de_het_distribution.map(
    lambda counts: counts.get('de', 0)
)
# Share of each determiner among the compounds found; rows without any compound give 0/0 = NaN.
df_compoundwords_ending_with_het_word['distribution_of_het'] = df_compoundwords_ending_with_het_word['nr_het_words'].div(
    df_compoundwords_ending_with_het_word['nr_het_words'] + df_compoundwords_ending_with_het_word['nr_de_words']
)
df_compoundwords_ending_with_het_word['distribution_of_de'] = df_compoundwords_ending_with_het_word['nr_de_words'].div(
    df_compoundwords_ending_with_het_word['nr_het_words'] + df_compoundwords_ending_with_het_word['nr_de_words']
)

In [None]:
# Keep only the words for which more than one het compound was counted.
# NOTE(review): the variable name suggests the "> 1" threshold is meant to discount a word matching itself - confirm.
df_compoundwords_ending_with_het_word_not_comparing_them_selves = df_compoundwords_ending_with_het_word.query('nr_het_words > 1')

In [None]:
# (rows, columns) left after the filter.
df_compoundwords_ending_with_het_word_not_comparing_them_selves.shape

In [None]:
# Rank the words by their number of het compounds (largest first) and keep the first ten rows.
most_het_words_with_many_het_compound_word = (
    df_compoundwords_ending_with_het_word_not_comparing_them_selves
    .sort_values(by='nr_het_words', ascending=False)
    .iloc[:10]
)
# Preview the first five of those ten.
most_het_words_with_many_het_compound_word.head()

In [None]:
# Horizontal bar chart comparing het and de compound counts for the selected het words.
fig, ax = subplots()
most_het_words_with_many_het_compound_word[['woord','nr_het_words','nr_de_words']].plot(
    kind='barh',
    x='woord',
    title='Most compound het words produced per het word',
    ax=ax,
    figsize=(20,10),
)
# Invert the y axis so the first row ends up at the top of the chart.
ax.invert_yaxis()
# Legend labels follow the order of the two count columns selected above.
ax.legend(["Number of Het words", "Number of De words"]);
# Write the figure into the sibling website checkout (the directory must already exist).
fig.savefig('../samarpan-rai.github.io/assets/img/most_compound_het_words_produced_per_het_word.png')

#### Question: On average, what percentage of the compound words ending with a het word are also het words?

In [None]:
# Mean het share over the filtered words (a fraction, not yet a percentage).
percentage = df_compoundwords_ending_with_het_word_not_comparing_them_selves.distribution_of_het.mean()

In [None]:
# Report the mean share as an (unrounded) percentage.
print(f"On average {(percentage*100)}% of het word's compound word are also het word")

## What about compound words ending with a de word? Are they always de words too?

In [None]:
# Extract all de words and, for each of them, find the compound words in the corpus that end with it
de_words = df_processed.query('det=="de"')
print(f"Found {de_words.shape[0]} de words ")
# One summary record per de word: how many compounds exist and how they split over de / het
de_word_vs_compound_word_result = []
de_word_list = de_words.woord.tolist()

for woord in de_word_list:

    # The word must have at least two characters
    if len(woord) < 2:
        continue

    # Every de word except the one under test. The mask is built from de_words itself so it
    # shares the index of the Series it filters (the previous mask came from df_processed and
    # only worked through implicit reindexing).
    all_other_words = de_words.woord[de_words.woord != woord]
    # Append the word to every other de word that does not already end with it.
    # Words that already end with it yield no candidate at all (previously an empty-string
    # placeholder, which would have matched an empty word in the corpus).
    already_ending = all_other_words.str.endswith(woord)
    theoretical_compound_words = all_other_words[~already_ending] + woord
    # Keep only the theoretical compounds that really exist in the corpus
    ending_with_de_word = df_processed[df_processed.woord.isin(theoretical_compound_words)]

    result_obj = {
        'woord' : woord,
        'nr_compound_words_found' : ending_with_de_word.shape[0],
        'de_het_distribution' : ending_with_de_word.det.value_counts().to_dict()
    }

    de_word_vs_compound_word_result.append(result_obj)
    

In [None]:
# Turn the list of per-word records into a frame (one row per record, one column per key).
df_compoundwords_ending_with_de_word = pd.DataFrame(de_word_vs_compound_word_result)

In [None]:
# A word can occur several times in the de list; keep only its first record (modifies the frame in place).
df_compoundwords_ending_with_de_word.drop_duplicates(subset=['woord'], keep='first', inplace=True)

In [None]:
# (rows, columns) remaining after de-duplication.
df_compoundwords_ending_with_de_word.shape

In [None]:
# Preview the first rows of the per-word summary.
df_compoundwords_ending_with_de_word.head()

In [None]:
# Unpack each determiner counter into two integer columns; a determiner that is absent counts as 0.
df_compoundwords_ending_with_de_word['nr_het_words'] = df_compoundwords_ending_with_de_word.de_het_distribution.map(
    lambda counts: counts.get('het', 0)
)
df_compoundwords_ending_with_de_word['nr_de_words'] = df_compoundwords_ending_with_de_word.de_het_distribution.map(
    lambda counts: counts.get('de', 0)
)
# Share of each determiner among the compounds found; rows without any compound give 0/0 = NaN.
df_compoundwords_ending_with_de_word['distribution_of_het'] = df_compoundwords_ending_with_de_word['nr_het_words'].div(
    df_compoundwords_ending_with_de_word['nr_het_words'] + df_compoundwords_ending_with_de_word['nr_de_words']
)
df_compoundwords_ending_with_de_word['distribution_of_de'] = df_compoundwords_ending_with_de_word['nr_de_words'].div(
    df_compoundwords_ending_with_de_word['nr_het_words'] + df_compoundwords_ending_with_de_word['nr_de_words']
)

In [None]:
# Keep only the words for which more than one de compound was counted.
df_compoundwords_ending_with_de_word_not_comparing_them_selves = df_compoundwords_ending_with_de_word.query('nr_de_words > 1')

In [None]:
# (rows, columns) left after the filter.
df_compoundwords_ending_with_de_word_not_comparing_them_selves.shape

In [None]:
# Rank the words by their number of de compounds (largest first) and keep the first ten rows.
most_de_words_with_many_de_compound_word = (
    df_compoundwords_ending_with_de_word_not_comparing_them_selves
    .sort_values(by='nr_de_words', ascending=False)
    .iloc[:10]
)
# Preview the first five of those ten.
most_de_words_with_many_de_compound_word.head()

In [None]:
# Horizontal bar chart comparing het and de compound counts for the selected de words.
fig, ax = subplots()
most_de_words_with_many_de_compound_word[['woord','nr_het_words','nr_de_words']].plot(
    kind='barh',
    x='woord',
    title='Most compound de words produced per de word',
    ax=ax,
    figsize=(20,10),
)
# Invert the y axis so the first row ends up at the top of the chart.
ax.invert_yaxis()
# Legend labels follow the order of the two count columns selected above.
ax.legend(["Number of Het words", "Number of De words"]);
# Write the figure into the sibling website checkout (the directory must already exist).
fig.savefig('../samarpan-rai.github.io/assets/img/most_compound_de_words_produced_per_de_word.png')

In [None]:
# Select every word that ends with the given suffix.
# NOTE(review): no visible cell displays this selection before ending_with_suffix is reassigned.
suffix='weg'
# Vectorised suffix test; na=False makes a missing word count as "does not match"
# instead of raising inside a per-row lambda.
mask = df_processed.woord.str.endswith(suffix, na=False)
ending_with_suffix = df_processed[mask]

#### Question: On average, what percentage of the compound words ending with a de word are also de words?

In [None]:
# Mean de share over the filtered words (a fraction); overwrites the `percentage` computed for the het section.
percentage = df_compoundwords_ending_with_de_word_not_comparing_them_selves.distribution_of_de.mean()

In [None]:
# Report the mean share as an (unrounded) percentage.
print(f"On average {(percentage*100)}% of de word's compound word are also de word")

### Find words ending with 'gebied' that are de

In [None]:
# Select every word that ends with the given suffix.
suffix='gebied'
# Vectorised suffix test; na=False makes a missing word count as "does not match"
# instead of raising inside a per-row lambda.
mask = df_processed.woord.str.endswith(suffix, na=False)
ending_with_bied = df_processed[mask]

In [None]:
# Display the whole selection.
ending_with_bied

In [None]:
# Show the rows of the selection whose determiner is 'de'.
ending_with_bied.query('det == "de"')

###  Find words ending with 'gen' that are de

In [None]:
# Select every word that ends with the given suffix.
suffix='gen'
# Vectorised suffix test; na=False makes a missing word count as "does not match"
# instead of raising inside a per-row lambda.
mask = df_processed.woord.str.endswith(suffix, na=False)
ending_with_suffix = df_processed[mask]

In [None]:
# Preview the first rows of the selection.
ending_with_suffix.head()

### Find words ending with 'je' that are het

In [None]:
# Select every word that ends with the given suffix.
suffix='je'
# Vectorised suffix test; na=False makes a missing word count as "does not match"
# instead of raising inside a per-row lambda.
mask = df_processed.woord.str.endswith(suffix, na=False)
ending_with_suffix = df_processed[mask]

In [None]:
# Preview the first rows of the selection.
ending_with_suffix.head()

In [None]:
# Count how many of the selected words carry each determiner.
ending_with_suffix.det.value_counts()

In [None]:
# Show the rows of the selection whose determiner is 'de'.
ending_with_suffix.query('det == "de"')