In [174]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import json


## Counts for a Bubble Chart Dictionary

In [96]:
vectorizer = CountVectorizer(decode_error='ignore', ngram_range=(1,1), max_features=1000)
corpus = open('../data/proust.txt')
vectors = vectorizer.fit_transform(corpus).toarray()
counts = vectors.sum(axis= 0)
words = vectorizer.get_feature_names_out()

pd.DataFrame([words,counts]).transpose().sort_values(by=1, ascending=False)

Unnamed: 0,0,1
225,de,7794
452,la,3937
321,et,3898
414,il,3355
714,que,3139
...,...,...
896,trouva,15
730,rapport,15
808,serais,15
485,longue,15


## Counts for a Syntax Sunburst

In [138]:
vectorizer = CountVectorizer(decode_error='ignore', ngram_range=(4,4), max_features=1000)
corpus = open('../data/proust.txt')
vectors = vectorizer.fit_transform(corpus).toarray()
counts = vectors.sum(axis= 0)
words = vectorizer.get_feature_names_out()

dff = pd.DataFrame([words,counts]).transpose().sort_values(by=1, ascending=False).rename(columns={0:'text',1:'count'})

dff[dff['count']>1]

Unnamed: 0,text,count
745,qu est ce que,22
214,de ma grand mère,21
320,est ce que vous,20
569,mme de saint euverte,15
414,je ne sais pas,15
...,...,...
236,de nous qu il,2
603,nuit là dans ma,2
602,nouvelles de son amour,2
237,de parler de son,2


In [139]:
dff[dff['text'].str.split().str[0] == 'fillette']

Unnamed: 0,text,count
356,fillette la voix brève,2


In [189]:
def hierarchical_sunburst_data(df):

    


    tokens = [x.split() for x in list(df['text'])]
    count = [x for x in list(df['count'])]

    nests = [{"name": x, "children":[]} for x in list(set([x[0] for x in tokens]))]
    
    for x in nests:
        x['children'] = [{"name":word[1],"children":[]} for word in tokens if word[0] == x['name']]
        for y in x['children']:
            y['children'] = [{"name":word[2],"children":[]} for word in tokens if (word[0] == x['name']) & (word[1] == y['name'])]
            for z in y['children']:
                z['children'] = [{"name":(' ').join(tokens[idx]),"value":int(count[idx])} for idx in range(len(tokens)) if (tokens[idx][0] == x['name']) & (tokens[idx][1] == y['name']) & (tokens[idx][2] == z['name'])]

    data = {"name": 'French N-Grams', 'children':nests}
    return data

test = hierarchical_sunburst_data(dff)

In [190]:
test

{'name': 'French N-Grams',
 'children': [{'name': 'fillette',
   'children': [{'name': 'la',
     'children': [{'name': 'voix',
       'children': [{'name': 'fillette la voix brève', 'value': 2}]}]}]},
  {'name': 'dire',
   'children': [{'name': 'qu',
     'children': [{'name': 'elle',
       'children': [{'name': 'dire qu elle croyait', 'value': 2}]}]},
    {'name': 'tout',
     'children': [{'name': 'de',
       'children': [{'name': 'dire tout de suite', 'value': 2}]}]}]},
  {'name': 'on',
   'children': [{'name': 'ne',
     'children': [{'name': 'peut',
       'children': [{'name': 'on ne peut pas', 'value': 8}]},
      {'name': 'pourrait',
       'children': [{'name': 'on ne pourrait pas', 'value': 3}]},
      {'name': 'connaît',
       'children': [{'name': 'on ne connaît pas', 'value': 3}]}]},
    {'name': 'ne',
     'children': [{'name': 'peut',
       'children': [{'name': 'on ne peut pas', 'value': 8}]},
      {'name': 'pourrait',
       'children': [{'name': 'on ne pourrait 

In [191]:
dumps = json.dumps(test, indent=1)  

with open("../data/proust_ngrams.json", "w") as outfile:
    outfile.write(dumps)