In [1]:
import regex
import numpy as np
import pandas as pd

from collections import Counter

### References


In [2]:
with open('en_processed.txt', 'r') as f:
    en = pd.read_json(f, orient='table')
    
with open('ru_processed.txt', 'r') as f:
    ru = pd.read_json(f, orient='table')

In [3]:
def print_sample(en_df, ru_df, num=3):
    display(en_df.head(num))
    display(ru_df.head(num))

In [4]:
print_sample(en, ru)

Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets,puncts,avg_puncts,repeat_letters,stopwords,amt_words,amt_chars
0,1003024039621820417,"[fabulous, leadership]","[🇺🇲, 🇺🇲, 🇺🇲, 😊, 😊]",5,1,1,1.0,0,0,2,18
1,1005180474,"[america, drain, expect, biden, good]",[],0,1,1,1.0,0,4,5,27
2,1006460630692450304,"[glad, enjoy, beautiful, south, west]",[😍],1,1,0,0.0,0,1,5,27


Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets,puncts,avg_puncts,repeat_letters,stopwords,amt_words,amt_chars
0,1005065863060475904,"[выносить, приговор]",[],0,1,1,1.0,0,3,2,16
1,1012429847531065349,"[добрый, вечер, радий, италия, куртка, коротки...",[],0,2,2,1.0,0,10,15,94
2,1013485198787440640,"[это, понимать, хороший, новость]",[],0,1,1,1.0,0,2,4,25


In [5]:
en_corpus = Counter()
ru_corpus = Counter()

en.loc[:, 'text'].map(en_corpus.update)
ru.loc[:, 'text'].map(ru_corpus.update)

0      None
1      None
2      None
3      None
4      None
       ... 
355    None
356    None
357    None
358    None
359    None
Name: text, Length: 360, dtype: object

In [6]:
print(f'Size corpus EN: {len(en_corpus)}, RU: {len(ru_corpus)}')

Size corpus EN: 2689, RU: 2738


In [7]:
print(f'Top 10 EN:\n {en_corpus.most_common(40)}\n RU:\n {ru_corpus.most_common(40)}')

Top 10 EN:
 [('good', 86), ('get', 61), ('day', 54), ('people', 53), ('im', 48), ('know', 46), ('say', 45), ('think', 42), ('want', 41), ('go', 41), ('need', 39), ('russia', 37), ('thats', 34), ('trump', 31), ('even', 31), ('morning', 30), ('look', 30), ('make', 30), ('right', 29), ('lie', 25), ('take', 25), ('love', 25), ('great', 24), ('country', 24), ('well', 24), ('also', 24), ('see', 24), ('state', 23), ('back', 23), ('let', 23), ('thank', 22), ('work', 22), ('putin', 22), ('china', 22), ('time', 22), ('biden', 21), ('care', 21), ('come', 20), ('call', 20), ('way', 20)]
 RU:
 [('это', 128), ('свой', 43), ('собчак', 39), ('говорить', 30), ('который', 30), ('человек', 29), ('жить', 25), ('весь', 25), ('очень', 24), ('мочь', 24), ('светов', 23), ('россия', 23), ('просто', 23), ('украина', 22), ('путин', 21), ('год', 18), ('хотеть', 15), ('почему', 15), ('знать', 15), ('наш', 14), ('аудитория', 14), ('хороший', 13), ('нужно', 13), ('соловьёв', 13), ('видео', 13), ('деньга', 13), ('бол

### NOTE:
* Fact that full sentence analysis for POS tagging was necessary for English but not Russian

* Top 10 words reveal wide distribution amongst POS for EN, while mostly exclusive for RU

* EN regularly has >10 definitions per word, RU <= 10

### Statistics

In [8]:
print_sample(en, ru)

Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets,puncts,avg_puncts,repeat_letters,stopwords,amt_words,amt_chars
0,1003024039621820417,"[fabulous, leadership]","[🇺🇲, 🇺🇲, 🇺🇲, 😊, 😊]",5,1,1,1.0,0,0,2,18
1,1005180474,"[america, drain, expect, biden, good]",[],0,1,1,1.0,0,4,5,27
2,1006460630692450304,"[glad, enjoy, beautiful, south, west]",[😍],1,1,0,0.0,0,1,5,27


Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets,puncts,avg_puncts,repeat_letters,stopwords,amt_words,amt_chars
0,1005065863060475904,"[выносить, приговор]",[],0,1,1,1.0,0,3,2,16
1,1012429847531065349,"[добрый, вечер, радий, италия, куртка, коротки...",[],0,2,2,1.0,0,10,15,94
2,1013485198787440640,"[это, понимать, хороший, новость]",[],0,1,1,1.0,0,2,4,25


In [9]:
# Description EN vs RU of the various columns
pd.concat([en.loc[:, 'emoji_len'].describe(), 
           ru.loc[:, 'emoji_len'].describe(),
           en.loc[:, 'amt_tweets'].describe(), 
           ru.loc[:, 'amt_tweets'].describe(),
           en.loc[:, 'puncts'].describe(), 
           ru.loc[:, 'puncts'].describe(),
           en.loc[:, 'avg_puncts'].describe(), 
           ru.loc[:, 'avg_puncts'].describe(),
           en.loc[:, 'repeat_letters'].describe(), 
           ru.loc[:, 'repeat_letters'].describe(),
           en.loc[:, 'stopwords'].describe(), 
           ru.loc[:, 'stopwords'].describe(),
           en.loc[:, 'amt_words'].describe(), 
           ru.loc[:, 'amt_words'].describe(),
           en.loc[:, 'amt_chars'].describe(),
           ru.loc[:, 'amt_chars'].describe()],
          axis=1, 
          keys=['EN_emoji_len', 'RU_emoji_len',
                'EN_amt_tweets', 'RU_amt_tweets',
                'EN_puncts', 'RU_puncts', 
                'EN_avg_puncts', 'RU_avg_puncts', 
                'EN_repeat_letters', 'RU_repeat_letters', 
                'EN_stopwords', 'RU_stopwords', 
                'EN_amt_words', 'RU_amt_words', 
                'EN_amt_chars', 'RU_amt_chars'])

Unnamed: 0,EN_emoji_len,RU_emoji_len,EN_amt_tweets,RU_amt_tweets,EN_puncts,RU_puncts,EN_avg_puncts,RU_avg_puncts,EN_repeat_letters,RU_repeat_letters,EN_stopwords,RU_stopwords,EN_amt_words,RU_amt_words,EN_amt_chars,RU_amt_chars
count,563.0,360.0,563.0,360.0,563.0,360.0,563.0,360.0,563.0,360.0,563.0,360.0,563.0,360.0,563.0,360.0
mean,0.509769,0.247222,1.383659,1.538889,2.666075,3.119444,0.959178,1.102358,0.030195,0.047222,3.506217,10.169444,13.348135,14.186111,74.648313,95.791667
std,1.881118,0.933734,1.296814,1.383806,4.466806,4.698089,0.766526,0.891209,0.20003,0.237191,6.186992,16.031538,21.07476,19.660385,119.803371,134.488315
min,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3.0,8.0
25%,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,3.0,4.0,4.0,22.0,29.0
50%,0.0,0.0,1.0,1.0,2.0,2.0,1.0,1.0,0.0,0.0,2.0,6.0,8.0,7.0,41.0,51.5
75%,0.0,0.0,1.0,1.0,3.0,4.0,1.0,1.0,0.0,0.0,4.0,12.0,15.0,17.0,85.0,110.5
max,23.0,9.0,18.0,13.0,76.0,43.0,7.0,7.0,2.0,2.0,100.0,166.0,351.0,198.0,2057.0,1406.0


### Emoji Analysis

In [10]:
en_emoji = en.copy()
ru_emoji = ru.copy()

In [11]:
en.loc[:, 'emoji_len'].describe()/ru.loc[:, 'emoji_len'].describe()

count    1.563889
mean     2.061987
std      2.014620
min           NaN
25%           NaN
50%           NaN
75%           NaN
max      2.555556
Name: emoji_len, dtype: float64

In [13]:
pd.concat([
    en.loc[:, 'puncts'].describe()/ru.loc[:, 'puncts'].describe(),
    en.loc[:, 'avg_puncts'].describe()/ru.loc[:, 'avg_puncts'].describe()],
    axis=1,
    keys=['EN:RU_puncts', 'EN:RU_avg_puncts'])

Unnamed: 0,EN:RU_puncts,EN:RU_avg_puncts
count,1.563889,1.563889
mean,0.854663,0.870115
std,0.950771,0.860097
min,,
25%,1.0,1.0
50%,1.0,1.0
75%,0.75,1.0
max,1.767442,1.0
