In [33]:
import pandas as pd
import os
import json
from glob import glob

from yaml import load

try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper

from tqdm import tqdm
from collections import defaultdict
import numpy as np

In [4]:
def load_languages():
    """Read the ``langs`` entry from the YAML configuration file.

    Returns:
        The value stored under the top-level ``"langs"`` key of
        ``../data/config.yaml``.
    """
    with open("../data/config.yaml") as config_file:
        config = load(config_file, Loader=Loader)
    return config["langs"]

In [7]:
# Single CSV (query "eu", language "cs") used for ad-hoc inspection below.
test_path = "../data/tp/eu/cs/cs_Twitter.csv"

In [8]:
# Load the sample file into a DataFrame.
df = pd.read_csv(test_path)

In [10]:
# The year is taken as the first four characters of the "date" string.
df["year"] = df["date"].str.slice(stop=4)

In [17]:

# Non-null value counts per year for every column, as {column: {year: count}}.
df.groupby("year").count().to_dict()

{'resourceId': {'2014': 486,
  '2015': 6734,
  '2016': 6496,
  '2017': 5683,
  '2018': 11682,
  '2019': 7803},
 'date': {'2014': 486,
  '2015': 6734,
  '2016': 6496,
  '2017': 5683,
  '2018': 11682,
  '2019': 7803},
 'countryCode': {'2014': 293,
  '2015': 4410,
  '2016': 4211,
  '2017': 3297,
  '2018': 5390,
  '2019': 3284},
 'preprocessed_text': {'2014': 486,
  '2015': 6734,
  '2016': 6496,
  '2017': 5683,
  '2018': 11682,
  '2019': 7803},
 'month': {'2014': 486,
  '2015': 6734,
  '2016': 6496,
  '2017': 5683,
  '2018': 11682,
  '2019': 7803},
 'LEN': {'2014': 486,
  '2015': 6734,
  '2016': 6496,
  '2017': 5683,
  '2018': 11682,
  '2019': 7803}}

In [24]:
def get_stats():
    """Collect per-year row counts and ``LEN`` values from the original CSVs.

    For each query in ``["eu", "un"]`` and each language returned by
    ``load_languages()``, the file
    ``../data/tp/<query>/<lang>/<lang>_Twitter.csv`` is read, but only when
    the same path without the ``.csv`` suffix exists.

    Returns:
        tuple: ``(data_vol_dict, data_len_dict)``.
        ``data_vol_dict[query][lang]`` maps each year string to the number of
        non-null ``resourceId`` values; ``data_len_dict[query][lang]`` is the
        list of values from the ``LEN`` column.
    """
    langs = load_languages()
    data_folder = "../data/tp"

    data_vol_dict = defaultdict(dict)
    data_len_dict = defaultdict(dict)

    for query in ["eu", "un"]:
        volumes = data_vol_dict[query] = defaultdict(dict)
        lengths = data_len_dict[query] = defaultdict(list)

        for lang in tqdm(langs):
            general_path = f"{data_folder}/{query}/{lang}/{lang}_Twitter"
            # The check is made on the suffix-less path, not on the CSV itself.
            if not os.path.exists(general_path):
                continue

            print(f"processing{general_path}")
            df = pd.read_csv(f"{general_path}.csv")
            df["year"] = df["date"].str[:4]

            counts_by_year = df.groupby("year").agg("count").to_dict()
            volumes[lang] = counts_by_year["resourceId"]
            lengths[lang] += df["LEN"].tolist()
    return data_vol_dict, data_len_dict
    

In [25]:
# Yearly volumes and LEN lists computed from the original per-language CSVs.
data_vol_dict, data_len_dict = get_stats()

  8%|████▉                                                           | 1/13 [00:00<00:01,  7.93it/s]

processing../data/tp/eu/cs/cs_Twitter
processing../data/tp/eu/da/da_Twitter


 15%|█████████▊                                                      | 2/13 [00:00<00:04,  2.65it/s]

processing../data/tp/eu/de/de_Twitter


 23%|██████████████▊                                                 | 3/13 [00:03<00:14,  1.50s/it]

processing../data/tp/eu/es/es_Twitter


 46%|█████████████████████████████▌                                  | 6/13 [00:06<00:08,  1.24s/it]

processing../data/tp/eu/hu/hu_Twitter
processing../data/tp/eu/it/it_Twitter


 69%|████████████████████████████████████████████▎                   | 9/13 [00:08<00:03,  1.12it/s]

processing../data/tp/eu/nl/nl_Twitter


 77%|████████████████████████████████████████████████▍              | 10/13 [00:09<00:03,  1.00s/it]

processing../data/tp/eu/pl/pl_Twitter


 85%|█████████████████████████████████████████████████████▎         | 11/13 [00:10<00:01,  1.16it/s]

processing../data/tp/eu/sk/sk_Twitter
processing../data/tp/eu/sv/sv_Twitter


100%|███████████████████████████████████████████████████████████████| 13/13 [00:10<00:00,  1.20it/s]
  0%|                                                                        | 0/13 [00:00<?, ?it/s]

processing../data/tp/un/cs/cs_Twitter
processing../data/tp/un/da/da_Twitter
processing../data/tp/un/de/de_Twitter


 23%|██████████████▊                                                 | 3/13 [00:00<00:01,  5.28it/s]

processing../data/tp/un/es/es_Twitter


 46%|█████████████████████████████▌                                  | 6/13 [00:05<00:06,  1.05it/s]

processing../data/tp/un/hu/hu_Twitter
processing../data/tp/un/it/it_Twitter


 69%|████████████████████████████████████████████▎                   | 9/13 [00:05<00:02,  1.65it/s]

processing../data/tp/un/pl/pl_Twitter
processing../data/tp/un/sk/sk_Twitter
processing../data/tp/un/sv/sv_Twitter


100%|███████████████████████████████████████████████████████████████| 13/13 [00:05<00:00,  2.20it/s]


In [28]:
# Outer keys (languages) become columns, inner keys (years) become the index.
df_eu = pd.DataFrame.from_dict(data_vol_dict["eu"])

In [30]:
# Save the "eu" yearly volumes of the original data.
df_eu.to_csv("../data/stats/eu_twitter_stats_original.csv")

In [31]:
# Same table for the "un" query, written next to the "eu" one.
df_un = pd.DataFrame.from_dict(data_vol_dict["un"])
df_un.to_csv("../data/stats/un_twitter_stats_original.csv")

In [35]:
# Reduce every language's list in data_len_dict to its min / median / max,
# keeping the same query -> language nesting.
lang_len_dict = defaultdict(dict)
for query, query_dict in data_len_dict.items():
    length_summary = defaultdict(dict)
    lang_len_dict[query] = length_summary
    for lang, lens in query_dict.items():
        shortest, middle, longest = np.min(lens), np.median(lens), np.max(lens)
        length_summary[lang] = {"min": shortest, "median": middle, "max": longest}

In [38]:
# orient="index": one row per language, columns "min" / "median" / "max".
df_len_eu = pd.DataFrame.from_dict(lang_len_dict["eu"],orient="index")

In [40]:
# Same length summary table for the "un" query.
df_len_un = pd.DataFrame.from_dict(lang_len_dict["un"],orient="index")

In [41]:
# Save both length summary tables.
df_len_eu.to_csv("../data/stats/eu_twitter_text_len.csv")
df_len_un.to_csv("../data/stats/un_twitter_text_len.csv")

In [51]:
def get_stats_training_data():
    """Count rows per year in the training CSVs of every query and language.

    For each query in ``["eu", "un"]`` and each language returned by
    ``load_languages()``, the file
    ``../data/tp/<query>/<lang>/<lang>_Twitter/<lang>_Twitter.csv`` is read
    when the ``<lang>_Twitter`` path exists.

    Returns:
        tuple: ``(data_vol_dict, data_len_dict)``.
        ``data_vol_dict[query][lang]`` maps each year string to the number of
        non-null ``resourceId`` values. ``data_len_dict[query]`` is an empty
        ``defaultdict(list)`` for each query; no lengths are collected here,
        and the second element is kept so callers can still unpack two values.
    """
    langs = load_languages()

    data_vol_dict = defaultdict(dict)
    # Created locally so the function neither requires nor overwrites a
    # notebook-level ``data_len_dict`` left over from another cell.
    data_len_dict = defaultdict(dict)

    data_folder = "../data/tp"
    for query in ["eu", "un"]:
        data_vol_dict[query] = defaultdict(dict)
        data_len_dict[query] = defaultdict(list)

        for lang in tqdm(langs):
            general_path = f"{data_folder}/{query}/{lang}/{lang}_Twitter"
            if os.path.exists(general_path):
                print(f"processing{general_path}")
                orig_file = f"{general_path}/{lang}_Twitter.csv"
                df = pd.read_csv(orig_file)
                df["year"] = df["date"].str[:4]
                data_vol_dict[query][lang] = df.groupby("year").agg("count").to_dict()["resourceId"]
    return data_vol_dict, data_len_dict
    

In [52]:
# Yearly volumes of the training CSVs; this rebinds both notebook-level names.
data_vol_dict, data_len_dict = get_stats_training_data()

  0%|                                                                        | 0/13 [00:00<?, ?it/s]

processing../data/tp/eu/cs/cs_Twitter
processing../data/tp/eu/da/da_Twitter


 15%|█████████▊                                                      | 2/13 [00:00<00:01,  7.68it/s]

processing../data/tp/eu/de/de_Twitter


 23%|██████████████▊                                                 | 3/13 [00:01<00:04,  2.33it/s]

processing../data/tp/eu/es/es_Twitter


 46%|█████████████████████████████▌                                  | 6/13 [00:01<00:01,  3.77it/s]

processing../data/tp/eu/hu/hu_Twitter
processing../data/tp/eu/it/it_Twitter


 69%|████████████████████████████████████████████▎                   | 9/13 [00:01<00:00,  5.08it/s]

processing../data/tp/eu/nl/nl_Twitter


 77%|████████████████████████████████████████████████▍              | 10/13 [00:02<00:00,  4.31it/s]

processing../data/tp/eu/pl/pl_Twitter
processing../data/tp/eu/sk/sk_Twitter
processing../data/tp/eu/sv/sv_Twitter


100%|███████████████████████████████████████████████████████████████| 13/13 [00:02<00:00,  4.91it/s]
 23%|██████████████▊                                                 | 3/13 [00:00<00:00, 21.91it/s]

processing../data/tp/un/cs/cs_Twitter
processing../data/tp/un/da/da_Twitter
processing../data/tp/un/de/de_Twitter
processing../data/tp/un/es/es_Twitter


100%|███████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 15.03it/s]

processing../data/tp/un/hu/hu_Twitter
processing../data/tp/un/it/it_Twitter
processing../data/tp/un/pl/pl_Twitter
processing../data/tp/un/sk/sk_Twitter
processing../data/tp/un/sv/sv_Twitter





In [54]:
# Languages as columns, years as index; saved as the "eu" training statistics.
df_eu = pd.DataFrame.from_dict(data_vol_dict["eu"])
df_eu.to_csv("../data/stats/eu_twitter_stats_training.csv")

In [55]:
# Same table for the "un" query's training data.
df_un = pd.DataFrame.from_dict(data_vol_dict["un"])
df_un.to_csv("../data/stats/un_twitter_stats_training.csv")

In [44]:
def get_stats_results():
    """Collect per-year row counts and ``LEN`` values from the ``*_etm.csv`` outputs.

    Files are discovered with the glob pattern
    ``../output/tp/<query>/<lang>/<lang>_Twitter/*/<lang>_etm.csv`` for each
    query in ``["eu", "un"]`` and each language returned by
    ``load_languages()``.

    Returns:
        tuple: ``(data_vol_dict, data_len_dict)``.
        ``data_vol_dict[query][lang]`` maps each year string to the number of
        non-null ``resourceId`` values; ``data_len_dict[query][lang]`` is the
        list of values from the ``LEN`` column.
    """
    langs = load_languages()
    data_folder = "../output/tp"

    data_vol_dict = defaultdict(dict)
    data_len_dict = defaultdict(dict)

    for query in ["eu", "un"]:
        volumes = data_vol_dict[query] = defaultdict(dict)
        lengths = data_len_dict[query] = defaultdict(list)

        for lang in tqdm(langs):
            general_path = f"{data_folder}/{query}/{lang}/{lang}_Twitter"
            for file in glob(f"{general_path}/*/{lang}_etm.csv"):
                df = pd.read_csv(file)
                df["year"] = df["date"].str[:4]
                counts_by_year = df.groupby("year").agg("count").to_dict()
                # Each matching file replaces the language's yearly counts,
                # whereas the LEN values of all matching files are concatenated.
                volumes[lang] = counts_by_year["resourceId"]
                lengths[lang].extend(df["LEN"].tolist())
    return data_vol_dict, data_len_dict

In [45]:
# Yearly volumes and LEN lists computed from the *_etm.csv result files.
data_vol_dict, data_len_dict = get_stats_results()

100%|███████████████████████████████████████████████████████████████| 13/13 [00:17<00:00,  1.35s/it]
100%|███████████████████████████████████████████████████████████████| 13/13 [00:09<00:00,  1.36it/s]


In [46]:
# Languages as columns, years as index; saved as the "eu" result statistics.
df_eu = pd.DataFrame.from_dict(data_vol_dict["eu"])
df_eu.to_csv("../data/stats/eu_twitter_stats_results.csv")

In [47]:
# Same table for the "un" query's results.
df_un = pd.DataFrame.from_dict(data_vol_dict["un"])
df_un.to_csv("../data/stats/un_twitter_stats_results.csv")

In [48]:
# Display the "eu" yearly result volumes (years as rows, languages as columns).
df_eu

Unnamed: 0,cs,da,de,es,hu,it,nl,pl,sk,sv
2014,485.0,6953,30527.0,55136.0,281.0,22745.0,10218.0,789.0,7.0,12459.0
2015,6734.0,47912,189031.0,335694.0,2195.0,77929.0,88132.0,12156.0,364.0,53170.0
2016,6492.0,43053,210302.0,496226.0,1932.0,71828.0,92739.0,12719.0,31.0,34770.0
2017,5672.0,25663,151555.0,181542.0,1150.0,89210.0,91533.0,31027.0,,25614.0
2018,11676.0,30293,311733.0,187452.0,1722.0,210559.0,145488.0,36832.0,,46928.0
2019,7802.0,25659,197235.0,123183.0,932.0,143482.0,143736.0,17026.0,,39769.0
2020,,21166,,,,,,,,
