### References
Bird, Steven, Edward Loper and Ewan Klein (2009), Natural Language Processing with Python. O’Reilly Media Inc.

https://www.kaggle.com/alxmamaev/how-to-easy-preprocess-russian-text

https://python-school.ru/nlp-text-preprocessing/

https://pymorphy2.readthedocs.io/en/latest/user/guide.html

https://stackoverflow.com/a/49242754/13557629 (finding emojis)

https://www.machinelearningplus.com/nlp/lemmatization-examples-python/#wordnetlemmatizer

In [1]:
import json
import regex
import yaml
import logging
from logging import config
import numpy as np 
import pandas as pd
from datetime import datetime as dt
from time import sleep
from pathlib import Path

import spacy
spacy.require_gpu()
from torch.utils import dlpack

In [2]:
import importlib
import importlib.util as imp
import sys

spec_conn = imp.spec_from_file_location(
    'twitter_connection',
    '../twitter-connection/__init__.py')
sc = imp.module_from_spec(spec_conn)
sys.modules[spec_conn.name] = sc
spec_conn.loader.exec_module(sc)

spec_data = imp.spec_from_file_location(
    'twitter_data',
    '../twitter-connection/twitter_data/__init__.py')
sd = imp.module_from_spec(spec_data)
sys.modules[spec_data.name] = sd
spec_data.loader.exec_module(sd)

from twitter_connection.util import utils
from twitter_connection import processing
from twitter_data import twitter_data

In [3]:
try:
    with open(utils.get_project_root()/'log_config.yml', 'r') as f:
        lc = yaml.safe_load(f)
        config.dictConfig(lc)
        
        logger = logging.getLogger('processing')
except Exception as e:
    print(e.args)

"""
Reload module
"""
importlib.reload(twitter_data)

In [4]:
gen_conf = utils.get_config()
conf = utils.get_config('p')

In [5]:
es_conj_path = utils.get_project_root()/gen_conf['file_paths']['verb_conjug']
cleaned_folder = '12062021'
cleaned_path = utils.get_save_path(data_from='tweets', where='c', lang='es', is_test=False)/cleaned_folder
processed_folder = '12062021'
processed_path = utils.get_save_path(data_from='tweets', where='p', lang='es', is_test=False)/processed_folder

In [6]:
es_conjugs = pd.read_excel(es_conj_path)
display(es_conjugs.head(2))

es_verbs = set(es_conjugs['verb'].to_numpy())

Unnamed: 0,verb_type,verb,indicativo,imperativo,subjuntivo,gerundio,gerundio_compuesto,infinitivo,infinitivo_compuesto,participio_pasado
0,Stative,ver,veía visto verías vi vimos verían ves v...,vean ve vea veamos ved,veáis visto vieras vieren viesen veas vi...,viendo,visto,ver,visto,visto
1,Stative,jurar,jurarán juramos jurarías jurabas juraría ...,jurad jura juren jure juremos,jurare jurareis jurase jurara juraren jur...,jurando,jurado,jurar,jurado,jurado


In [8]:
tweets = pd.read_csv(es_cleaned_path/'tweets.csv', sep='~', lineterminator='\n')
display(tweets.head(3))
display(tweets.info())

Unnamed: 0,created_at,text_orig,author_id,lang,tweet_id,tweet_place_id,referenced_tweets,mentions,text_norm,retweet_reply_like_quote
0,2021-11-08 03:15:45+00:00,Esta derrota de Quindio confirma que el Superd...,141323312.0,es,1.457547e+18,0116b409205a5237,,,Esta derrota de Quindio confirma que el Superd...,"(0, 0, 4, 0)"
1,2021-11-08 03:15:11+00:00,Muajaja ese broder confirmó lo q les dije... L...,49454158.0,es,1.457547e+18,011455904ec2ab81,,,Muajaja ese broder confirmo lo q les dije... L...,"(0, 0, 0, 0)"
2,2021-11-08 03:08:56+00:00,@gabrielintica @rebecajc Mis 22 años siendo c-...,26833188.0,es,1.457546e+18,6eb95eddb81a6b4b,['1457523752795844611'],"[{'start': 0, 'end': 14, 'username': 'gabrieli...",Mis 22 anos siendo c-nora lo confirman.,"(0, 0, 0, 0)"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 408362 entries, 0 to 408361
Data columns (total 10 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   created_at                408362 non-null  object 
 1   text_orig                 408362 non-null  object 
 2   author_id                 408362 non-null  float64
 3   lang                      408362 non-null  object 
 4   tweet_id                  408362 non-null  float64
 5   tweet_place_id            408358 non-null  object 
 6   referenced_tweets         253215 non-null  object 
 7   mentions                  259972 non-null  object 
 8   text_norm                 408362 non-null  object 
 9   retweet_reply_like_quote  408362 non-null  object 
dtypes: float64(2), object(8)
memory usage: 31.2+ MB


None

### Running through spacy pipeline

In [9]:
# Disable 'ner' (Named Entity Recognizer)
nlp_es = spacy.load(conf['spacy']['es'], disable=['ner'])
# nlp_pt = spacy.load(conf['spacy']['pt'])

KeyboardInterrupt: 

In [None]:
def get_verbs(tokenized):
    verbs = ', '.join(set(t.lemma_ for t in tokenized if (t.pos_=='VERB') and (t.lemma_ in es_verbs)))
    return verbs if len(verbs)>0 else None

In [None]:
def get_dep(tokenized):
    return ' '.join([f'{t.text}({t.dep_ if t.pos_!="PUNCT" else ""})' for t in tokenized])

In [None]:
def get_details(tokenized):
    return ' '.join([f'{t.text}[{t.lemma_}|{t.pos_}|{t.is_stop}]' for t in tokenized if t.pos_!='PUNCT'])

In [None]:
def have_verbs(df):
    have = df['text_norm'].apply(get_verbs).notna()
    return df.loc[have, :].reset_index(drop=True)

In [None]:
def save_batch(tokenized: list, name):
    batch = have_verbs(pd.concat(tokenized, ignore_index=True))
    
    verbs = batch['text_norm'].apply(get_verbs).rename('verbs')
    dep = batch['text_norm'].apply(get_dep).rename('dependencies')
    details = batch['text_norm'].apply(get_details).rename('lemma_pos_stopword')
    
    batch = pd.concat([verbs, batch.loc[:, 'tweet_id'], dep, details], axis=1)
    
    utils.save_csv(es_save_path, batch, name+'.csv')

In [None]:
spacy_processed = []
processed = 0
saved = 0

In [None]:
utils.make_dir(es_save_path)

In [None]:
err = 0
batch_size = 500
batches = int(np.ceil(tweets.shape[0]/batch_size))

logger.info(f'Running {tweets.shape[0]-processed} tweets through spaCy pipeline')
logger.debug(f'Batch size: {batch_size}, batches: {batches}')

for i, d in enumerate(np.array_split(tweets.loc[:, ['tweet_id', 'text_norm']], batches)):
    # Tweets already processed
    if i*batch_size < processed:
        continue
    
    try:
        spacy_processed.append(
            pd.concat([d['tweet_id'], d['text_norm'].apply(nlp_es)], 
                      axis=1))
        
        processed+=batch_size
        logger.debug(f'Processed: {processed}')
        
        if (processed%10000)<batch_size:
            logger.debug(f'Saving batch of {sum([p.shape[0] for p in spacy_processed])}')
            # Save progress and free up memory
            save_batch(spacy_processed, name=f'tweets-processed-{saved}')
            saved+=1
            
            spacy_processed.clear()

    except Exception as e:
        err+=1
        print(f'{i} is broken: {e.args}')
        
        if err>2:
            break
        pass

In [None]:
save_batch(spacy_processed, name=f'tweets-processed-{saved}')
saved+=1
            
spacy_processed.clear()

### Merging Processed Batches

In [18]:
importlib.reload(utils)

<module 'twitter_connection.util.utils' from '/home/rimov/Documents/Code/NLP/lin-que-dropping/processing/../twitter-connection/util/utils.py'>

In [19]:
processed_tweets_path = Path(es_save_path).rglob('*processed*.csv')

In [25]:
processed_tweets = pd.concat([utils.get_csv(p) for p in processed_tweets_path]).reset_index(drop=True)

ValueError: No objects to concatenate

In [26]:
processed_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298492 entries, 0 to 298491
Data columns (total 4 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   verbs               298492 non-null  string 
 1   tweet_id            298492 non-null  float64
 2   dependencies        298492 non-null  string 
 3   lemma_pos_stopword  298492 non-null  string 
dtypes: float64(1), string(3)
memory usage: 9.1 MB


In [42]:
merged = pd.merge(processed_tweets, tweets, how='left', on='tweet_id')
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 298492 entries, 0 to 298491
Data columns (total 13 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   verbs                     298492 non-null  string 
 1   tweet_id                  298492 non-null  float64
 2   dependencies              298492 non-null  string 
 3   lemma_pos_stopword        298492 non-null  string 
 4   created_at                298492 non-null  object 
 5   text_orig                 298492 non-null  object 
 6   author_id                 298492 non-null  float64
 7   lang                      298492 non-null  object 
 8   tweet_place_id            298491 non-null  object 
 9   referenced_tweets         180174 non-null  object 
 10  mentions                  185057 non-null  object 
 11  text_norm                 298492 non-null  object 
 12  retweet_reply_like_quote  298492 non-null  object 
dtypes: float64(2), object(8), string(3)
memory u

In [43]:
# Rename any misnamed columns
merged.rename(columns={'author_id': 'user_id'}, inplace=True)

In [44]:
merged = merged.loc[:, conf['col_order']]
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 298492 entries, 0 to 298491
Data columns (total 12 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   tweet_id                  298492 non-null  float64
 1   verbs                     298492 non-null  string 
 2   text_orig                 298492 non-null  object 
 3   text_norm                 298492 non-null  object 
 4   dependencies              298492 non-null  string 
 5   lemma_pos_stopword        298492 non-null  string 
 6   retweet_reply_like_quote  298492 non-null  object 
 7   created_at                298492 non-null  object 
 8   user_id                   298492 non-null  float64
 9   tweet_place_id            298491 non-null  object 
 10  mentions                  185057 non-null  object 
 11  referenced_tweets         180174 non-null  object 
dtypes: float64(2), object(7), string(3)
memory usage: 29.6+ MB


In [45]:
utils.save_csv(es_save_path, merged, 'tweets-processed-combined')

### Breaking Up by Verb

In [12]:
importlib.reload(utils)

<module 'twitter_connection.util.utils' from '/home/rimov/Documents/Code/NLP/lin-que-dropping/processing/../twitter-connection/util/utils.py'>

merged = utils.get_csv(es_save_path/'tweets-processed-combined.csv')
merged.info()

In [47]:
merged['verbs'] = merged['verbs'].str.split(', ')
merged.head(3)

Unnamed: 0,tweet_id,verbs,text_orig,text_norm,dependencies,lemma_pos_stopword,retweet_reply_like_quote,created_at,user_id,tweet_place_id,mentions,referenced_tweets
0,1.451193e+18,"[sentir, pedir]",".@CitroenEspana Cactus con 5,5 años. Me empiez...",".Cactus con 5,5 anos. Me empieza a salir oxido...",".Cactus(ROOT) con(case) 5,5(nummod) anos(nmod)...",.Cactus(.Cactus|PROPN|False) con(con|ADP|True)...,"(1, 1, 1, 0)",2021-10-21 14:26:05+00:00,397960900.0,731c9d11275a5436,"[{'start': 1, 'end': 15, 'username': 'CitroenE...",
1,1.451193e+18,[sentir],"Me toy bebiendo un té, y siento como que toy s...","Me toy bebiendo un te, y siento como que toy s...",Me(iobj) toy(ROOT) bebiendo(xcomp) un(det) te(...,Me(yo|PRON|True) toy(tar|VERB|False) bebiendo(...,"(0, 1, 0, 0)",2021-10-21 14:24:35+00:00,1.238228e+18,01fcc4a23f17e1ed,,
2,1.451192e+18,[sentir],El problema más grave que tiene hoy el Maestro...,El problema mas grave que tiene hoy el Maestro...,El(det) problema(nsubj) mas(advmod) grave(amod...,El(el|DET|True) problema(problema|NOUN|False) ...,"(0, 0, 0, 0)",2021-10-21 14:22:15+00:00,234931200.0,01d487de3c4e0807,,


In [52]:
for verb in es_verbs:
    vtype = es_conjugs.loc[es_conjugs['verb']==verb, 'verb_type'].iloc[0].lower()
    has_verb = merged['verbs'].apply(lambda verbs: True if verb in set(verbs) else False)

    df = merged[has_verb].copy()
    
    path = utils.get_save_path('p', lang='es')/processed_folder/vtype
    
    utils.make_dir(path)
    utils.save_excel(path, df, f'twitter-es-{verb}-26-07-2021')

### Extracting emojis
I believe emojis play a role in language ambiguity, so is necessary to save them in preparation for the lemmatization, which will either cause errors or remove them.

In [49]:
# !pip install emoji --upgrade

import emoji

In [50]:
def extract_emoji(tweets):
    ts = []
    emojis = []
    
    for tweet in tweets:
        words = ''
        
        for word in tweet.split():
            # Read as unicode chars
            trans = regex.findall(r'\X', word)
            w = ''
        
            for c in trans:
                if (c in emoji.UNICODE_EMOJI_ENGLISH) or (c=='❤️'):
                    emojis.append(c)
                    continue
                
                w+=c
            
            if len(w) != 0:
                words = words + ' ' + w
        
        ts.append(words)
    
    return {'text':ts, 'emojis':emojis}

In [51]:
# Converts a series with above 'text', 'emojis' dicts into a df
split_en = pd.DataFrame(
    list(en_text.apply(extract_emoji)))
split_ru = pd.DataFrame(
    list(ru_text.apply(extract_emoji)))

display(split_en.head())
display(split_ru.head())

Unnamed: 0,text,emojis
0,[ Fabulous Leadership!],"[🇺🇲, 🇺🇲, 🇺🇲, 😊, 😊]"
1,[ America is draining itself. I had expected B...,[]
2,[ Glad you are enjoying the beautiful South West],[😍]
3,[ And the long anticipated badly needed spanki...,[]
4,[ I stand with you. Thank you for all you do.],[]


Unnamed: 0,text,emojis
0,[ Не выносите себе сами приговор!)],[]
1,[ добрый вечер Радий! (в Италии тоже кто в кур...,[]
2,[ Вот это я понимаю - хорошие новости!],[]
3,"[ Настоящий глава, не то что некоторые!☝️]",[✊]
4,[ Ходить к мерзким людям на мерзкие (теле)пере...,[]


In [52]:
en_text = en_text.to_frame().reset_index().drop(columns='text').join(split_en)
ru_text = ru_text.to_frame().reset_index().drop(columns='text').join(split_ru)
display(en_text.head(3))
display(ru_text.head(3))

Unnamed: 0,author_id,text,emojis
0,1003024039621820417,[ Fabulous Leadership!],"[🇺🇲, 🇺🇲, 🇺🇲, 😊, 😊]"
1,1005180474,[ America is draining itself. I had expected B...,[]
2,1006460630692450304,[ Glad you are enjoying the beautiful South West],[😍]


Unnamed: 0,author_id,text,emojis
0,1005065863060475904,[ Не выносите себе сами приговор!)],[]
1,1012429847531065349,[ добрый вечер Радий! (в Италии тоже кто в кур...,[]
2,1013485198787440640,[ Вот это я понимаю - хорошие новости!],[]


In [53]:
en_text = pd.concat(
    [en_text,
     en_text.loc[:, 'emojis'].apply(len).rename('emoji_len')], 
    axis=1, join='inner')
ru_text = pd.concat(
    [ru_text,
     ru_text.loc[:, 'emojis'].apply(len).rename('emoji_len')], 
    axis=1, join='inner')

In [54]:
display(en_text.sort_values('emoji_len', ascending=False))
display(ru_text.sort_values('emoji_len', ascending=False))

Unnamed: 0,author_id,text,emojis,emoji_len
439,501659753,"[ Thank you, you too, Aww bless you thank you...","[🥰, 😘, 🥰, 😘, 🥰, 😘, 😘, 🥰, 😘, 🥰, 🥰, 😘, ❤️, 🥰, 😘,...",23
89,1241523214028685313,[ Well we know project veritas uncovered cnn a...,"[😲, 🤯, 🤯, 🤯, 🤯, 🤯, 🤯, 🤯, 🤯, 🤯, 🤯, 🤯, 🤯, 🤯, 🤯, ...",17
302,2231639652,[ The Scrabble Champion Of The House Then !! H...,"[😉, 😉, 💋, ❤️, 💋, 🤗, 🤗, 🤗, ❤️, 💋, 🤗, 🤗, 🤗, ❤️, ❤️]",15
6,1017486629861289984,[ Happy Thursday! Jammin' job Champ Including ...,"[🎊, 💪, 👍, 👊, 🏆, ❗, 🎊, 💕, ❤, 😘, 😊]",11
343,2840137376,"[ Recover your hacked, disabled or locked acco...","[📡, 📌, 📌, 📌, 📌, 📌, 📌, 📌, 📌, 📌, 👌]",11
...,...,...,...,...
57,1188279481879932928,[ You only have to do a little bit of research...,[],0
245,1378070190949330948,"[ don't know what the point of this is, but al...",[],0
58,1188879217552814082,[ Maybe u ought to get ur hands on a copy of t...,[],0
59,119212393,[ If you're an American scared of Russia or Ch...,[],0


Unnamed: 0,author_id,text,emojis,emoji_len
131,1325355166057566209,"[ Ну, трава тож разная бывает!, Да ладно! Не ...","[😉, 😁, ☝, 😉, 😁, 😉, 😁, ☝, 🤔]",9
53,1183942687885533184,"[ В Питере хоть дождь спасает положение, Согл...","[😉, 🤗, 😉, 😉, 😉, 😉, 🤝, 👋🏽]",8
78,1241741410954600448,[ Ты почитай комменты выше сашок. Поголовно ка...,"[🤣, 🤣, 🤣, 😂, 😂, 😂]",6
336,900730534002925569,"[ Не, я лучше одну возьму, но что бы и пригото...","[🤣, 🤣, 🤣, 🤣, 🤣]",5
317,809880868567261184,"[ Нет.Поверь ветерану движения., Это жестокий...","[😂, 🤔, 😂, 💐, 😘]",5
...,...,...,...,...
127,1317141671264571395,"[ Нет. На эти, с примерно таким же сервисом - ...",[],0
126,1315635580533911552,[ Шпионил за процветающими рэспубликами и выве...,[],0
125,1312337640,[ Захотела решить выйти...))],[],0
124,1308082843621511169,[ в питере полиция задержала-было уже?],[],0


### Extracting Superfluous Characters
Research suggests superfluous characters (repeated letters, non-standard punctuation) convey important cues in texts. Will keep track of specific punctuation (.!?). Furthermore, noticed that some mentions (@xxxxx) have gotten through and need to clean.

# TODO: DESCRIBE IS SKEWED BY 0s, ADJUST CALCS IF USING STATS

In [55]:
en_df = en_text.copy()
ru_df = ru_text.copy()

In [56]:
# All text to lowercase
en_df.loc[:, 'text'] = en_df.loc[:, 'text'].apply(lambda x: [w.lower() for w in x])
ru_df.loc[:, 'text'] = ru_df.loc[:, 'text'].apply(lambda x: [w.lower() for w in x])

In [57]:
# Want to keep track of amount of tweets per user
en_df = en_df.join(en_df.loc[:, 'text'].map(len).rename('amt_tweets'), how='inner')
ru_df = ru_df.join(ru_df.loc[:, 'text'].map(len).rename('amt_tweets'), how='inner')

display(en_df.sort_values('amt_tweets', ascending=False).head())
display(ru_df.sort_values('amt_tweets', ascending=False).head())

Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets
82,1232360804004945921,[ same! i need to read through the proposed in...,[],0,18
193,1349182046569181185,[ actually bro spanish flu is the h1n1 virus. ...,[],0,11
439,501659753,"[ thank you, you too, aww bless you thank you...","[🥰, 😘, 🥰, 😘, 🥰, 😘, 😘, 🥰, 😘, 🥰, 🥰, 😘, ❤️, 🥰, 😘,...",23,10
371,3140037589,[ but it’s still pretty rare isn’t it ? i’ve n...,[],0,8
240,1377312703992238088,[ looking totally stunning wend wow .lovely wh...,[],0,8


Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets
344,950066462588391425,[ читатели наших блогов уже давно прекрасно зн...,[],0,13
220,2638582010,"[ гордишься чмошеством? бывает...)), офицеры ...",[],0,9
81,1243254820385034243,"[ и что дальше? госрегулирование включат?, те...",[],0,9
83,1245394688447844353,"[ божэ, какое говнище..., а вы что за хуй? от...",[],0,9
251,3577303277,[ ❗️наташе тышкевич назначили запрет определен...,"[🤷‍♀️, 🤷‍♀️, 🤷‍♀️]",3,8


In [58]:
count_punct_pat = r'([.!?]+)'
remove_punct_pat = r'[^\s\w]+'
rep_lett_pat = r'(\w)\1{2,}'

In [59]:
def count_punct(tweets):
    punct = 0
    punct_groups = 0
    
    for tweet in tweets:
        for g in regex.findall(count_punct_pat, tweet):
            if len(g)==0:
                continue
                
            punct_groups += 1
            punct += len(g)
    
    return punct, float(punct)/punct_groups if punct_groups != 0 else 0

In [60]:
def count_rep_char(tweets):
    char = ''
    
    for tweet in tweets:
        for c in regex.findall(rep_lett_pat, tweet):
            char += c
    
    return len(char)

In [61]:
def extract_punct(tweets):
    
    for i, tweet in enumerate(tweets):
        for g in regex.findall(remove_punct_pat, tweet):
            tweet = tweet.replace(g, '')
            
            if len(tweet)==0:
                del tweets[i]
            else:
                tweets[i] = tweet
                
    return tweets

In [62]:
display(en_df.loc[:, 'text'].head(5))

0                              [ fabulous leadership!]
1    [ america is draining itself. i had expected b...
2    [ glad you are enjoying the beautiful south west]
3    [ and the long anticipated badly needed spanki...
4       [ i stand with you. thank you for all you do.]
Name: text, dtype: object

In [63]:
# Adding columns for punctutations, average punctuations, repeat letters
en_df = pd.concat(
    [en_df, 
     pd.DataFrame(en_df.loc[:, 'text'].map(count_punct).to_list(), columns=['puncts', 'avg_puncts']),
     en_df.loc[:, 'text'].map(count_rep_char).rename('repeat_letters')], 
    axis=1, join='inner')
ru_df = pd.concat(
    [ru_df, 
     pd.DataFrame(ru_df.loc[:, 'text'].map(count_punct).to_list(), columns=['puncts', 'avg_puncts']),
     ru_df.loc[:, 'text'].map(count_rep_char).rename('repeat_letters')], 
    axis=1, join='inner')

display(en_df.head())
display(ru_df.head())

Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets,puncts,avg_puncts,repeat_letters
0,1003024039621820417,[ fabulous leadership!],"[🇺🇲, 🇺🇲, 🇺🇲, 😊, 😊]",5,1,1,1.0,0
1,1005180474,[ america is draining itself. i had expected b...,[],0,1,1,1.0,0
2,1006460630692450304,[ glad you are enjoying the beautiful south west],[😍],1,1,0,0.0,0
3,100680355,[ and the long anticipated badly needed spanki...,[],0,1,3,3.0,0
4,1011442510869114880,[ i stand with you. thank you for all you do.],[],0,1,2,1.0,0


Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets,puncts,avg_puncts,repeat_letters
0,1005065863060475904,[ не выносите себе сами приговор!)],[],0,1,1,1.0,0
1,1012429847531065349,[ добрый вечер радий! (в италии тоже кто в кур...,[],0,2,2,1.0,0
2,1013485198787440640,[ вот это я понимаю - хорошие новости!],[],0,1,1,1.0,0
3,1014106378355519488,"[ настоящий глава, не то что некоторые!☝️]",[✊],1,1,1,1.0,0
4,1020634776103944192,[ ходить к мерзким людям на мерзкие (теле)пере...,[],0,3,4,1.0,0


In [64]:
# Removing punctuation from text to normalize a bit
en_df.loc[:, 'text'] = en_df.loc[:, 'text'].map(extract_punct)
ru_df.loc[:, 'text'] = ru_df.loc[:, 'text'].map(extract_punct)

display(en_df.head(3))
display(ru_df.head(3))

Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets,puncts,avg_puncts,repeat_letters
0,1003024039621820417,[ fabulous leadership],"[🇺🇲, 🇺🇲, 🇺🇲, 😊, 😊]",5,1,1,1.0,0
1,1005180474,[ america is draining itself i had expected bi...,[],0,1,1,1.0,0
2,1006460630692450304,[ glad you are enjoying the beautiful south west],[😍],1,1,0,0.0,0


Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets,puncts,avg_puncts,repeat_letters
0,1005065863060475904,[ не выносите себе сами приговор],[],0,1,1,1.0,0
1,1012429847531065349,[ добрый вечер радий в италии тоже кто в куртк...,[],0,2,2,1.0,0
2,1013485198787440640,[ вот это я понимаю хорошие новости],[],0,1,1,1.0,0


In [65]:
display(en_df.loc[:, 'text'].head(5))

0                               [ fabulous leadership]
1    [ america is draining itself i had expected bi...
2    [ glad you are enjoying the beautiful south west]
3    [ and the long anticipated badly needed spanki...
4         [ i stand with you thank you for all you do]
Name: text, dtype: object

### Lemmatization

In [66]:
# !pip install pymorphy2
# !pip install -U pymorphy2-dicts-ru

# nltk.download('wordnet')

In [67]:
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
import pymorphy2

In [68]:
en_lem = WordNetLemmatizer()
ru_lem = pymorphy2.MorphAnalyzer()

#### Tokenizing and POS tagging
Part of speech tagging for EN tweets, as the lemmatizer doesn't do so automatically unlike pymorphy for RU

In [69]:
en_lemmad = en_df.copy()
ru_lemmad = ru_df.copy()

In [70]:
def en_set_pos_tag(tweets):
    ts = []
    
    for tweet in tweets:
        ts.extend(
            nltk.pos_tag(
                nltk.word_tokenize(tweet)))
    
    return ts

In [71]:
def ru_tokenize(tweets):
    tokenized = []
    
    for tweet in tweets:
        tokenized.extend(nltk.tokenize.word_tokenize(tweet, language='russian'))
        
    return tokenized

In [72]:
ru_lemmad.loc[:, 'text'] = ru_lemmad.loc[:, 'text'].map(ru_tokenize)
en_lemmad.loc[:, 'text'] = en_lemmad.loc[:, 'text'].map(en_set_pos_tag)

display(en_lemmad.head(5))
display(ru_lemmad.head(5))

Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets,puncts,avg_puncts,repeat_letters
0,1003024039621820417,"[(fabulous, JJ), (leadership, NN)]","[🇺🇲, 🇺🇲, 🇺🇲, 😊, 😊]",5,1,1,1.0,0
1,1005180474,"[(america, NN), (is, VBZ), (draining, VBG), (i...",[],0,1,1,1.0,0
2,1006460630692450304,"[(glad, NN), (you, PRP), (are, VBP), (enjoying...",[😍],1,1,0,0.0,0
3,100680355,"[(and, CC), (the, DT), (long, JJ), (anticipate...",[],0,1,3,3.0,0
4,1011442510869114880,"[(i, JJ), (stand, VBP), (with, IN), (you, PRP)...",[],0,1,2,1.0,0


Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets,puncts,avg_puncts,repeat_letters
0,1005065863060475904,"[не, выносите, себе, сами, приговор]",[],0,1,1,1.0,0
1,1012429847531065349,"[добрый, вечер, радий, в, италии, тоже, кто, в...",[],0,2,2,1.0,0
2,1013485198787440640,"[вот, это, я, понимаю, хорошие, новости]",[],0,1,1,1.0,0
3,1014106378355519488,"[настоящий, глава, не, то, что, некоторые️]",[✊],1,1,1,1.0,0
4,1020634776103944192,"[ходить, к, мерзким, людям, на, мерзкие, телеп...",[],0,3,4,1.0,0


#### Lemmatization

In [73]:
# Map of the appropriate POS tags to pass into the lemmatizer
pos_tag = {'J': wordnet.ADJ, 'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV}

In [74]:
def lemmatize(tweet, lang):
    if lang=='ru':
        return [ru_lem.parse(w)[0].normal_form for w in tweet]
    
    lemm = []
    
    for w in tweet:
        word = w[0]
        pos = w[1][0]
        
        try:
            lemm.append(en_lem.lemmatize(word, pos_tag.get(pos)))
        except KeyError:
            continue
        
    return lemm

In [75]:
en_lemmad.loc[:, 'text'] = en_lemmad.loc[:, 'text'].apply(lemmatize, lang='en')
ru_lemmad.loc[:, 'text'] = ru_lemmad.loc[:, 'text'].apply(lemmatize, lang='ru')

In [76]:
display(en_lemmad.head(5))
display(ru_lemmad.head(5))

Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets,puncts,avg_puncts,repeat_letters
0,1003024039621820417,"[fabulous, leadership]","[🇺🇲, 🇺🇲, 🇺🇲, 😊, 😊]",5,1,1,1.0,0
1,1005180474,"[america, be, drain, i, have, expect, biden, d...",[],0,1,1,1.0,0
2,1006460630692450304,"[glad, be, enjoy, beautiful, south, west]",[😍],1,1,0,0.0,0
3,100680355,"[long, anticipated, badly, need, spanking, begin]",[],0,1,3,3.0,0
4,1011442510869114880,"[i, stand, thank, do]",[],0,1,2,1.0,0


Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets,puncts,avg_puncts,repeat_letters
0,1005065863060475904,"[не, выносить, себя, сам, приговор]",[],0,1,1,1.0,0
1,1012429847531065349,"[добрый, вечер, радий, в, италия, тоже, кто, в...",[],0,2,2,1.0,0
2,1013485198787440640,"[вот, это, я, понимать, хороший, новость]",[],0,1,1,1.0,0
3,1014106378355519488,"[настоящий, глава, не, то, что, некоторые️]",[✊],1,1,1,1.0,0
4,1020634776103944192,"[ходить, к, мерзкий, человек, на, мерзкий, тел...",[],0,3,4,1.0,0


#### Removing stopwords
Stop words were needed for sentence analysis in POS tagging

In [77]:
from nltk.corpus import stopwords

en_stopwords = set(stopwords.words('english'))
ru_stopwords = set(stopwords.words('russian'))

In [78]:
len(en_stopwords)

179

In [79]:
len(ru_stopwords)

151

In [80]:
# Add apostrophe-less variations of stopwords for english
words = set()

for word in en_stopwords:
    words.add(word.replace('\'', ''))

en_stopwords.update(words)

In [81]:
# Adding 'ё' word-variations, which are missing from the stopwords here
words = {'всё', 'ещё', 'её'}

ru_stopwords.update(words)

In [82]:
# Counting and removing stopwords
def extract_stop(tweet, lang):
    count = 0
    cleaned = []
    
    cleaned = [word for word in tweet if word not in en_stopwords]\
        if lang=='en'\
        else [word for word in tweet if word not in ru_stopwords]
    
    count = len(tweet) - len(cleaned) 
    return (cleaned, count)

In [83]:
en_cleaned = pd.DataFrame(en_lemmad.loc[:, 'text'].apply(extract_stop, lang='en').to_list(), columns=['text', 'stopwords'])
ru_cleaned = pd.DataFrame(ru_lemmad.loc[:, 'text'].apply(extract_stop, lang='ru').to_list(), columns=['text', 'stopwords'])

display(en_cleaned.head(3))
display(ru_cleaned.head(3))

Unnamed: 0,text,stopwords
0,"[fabulous, leadership]",0
1,"[america, drain, expect, biden, good]",4
2,"[glad, enjoy, beautiful, south, west]",1


Unnamed: 0,text,stopwords
0,"[выносить, приговор]",3
1,"[добрый, вечер, радий, италия, куртка, коротки...",10
2,"[это, понимать, хороший, новость]",2


In [84]:
en_lemmad.loc[:, 'text'] = en_cleaned.loc[:, 'text']
ru_lemmad.loc[:, 'text'] = ru_cleaned.loc[:, 'text']

en_lemmad = pd.concat(
    [en_lemmad,
     en_cleaned.loc[:, 'stopwords']], 
    axis=1, join='inner')
ru_lemmad = pd.concat(
    [ru_lemmad,
     ru_cleaned.loc[:, 'stopwords']], 
    axis=1, join='inner')

display(en_lemmad.head(3))
display(ru_lemmad.head(3))

Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets,puncts,avg_puncts,repeat_letters,stopwords
0,1003024039621820417,"[fabulous, leadership]","[🇺🇲, 🇺🇲, 🇺🇲, 😊, 😊]",5,1,1,1.0,0,0
1,1005180474,"[america, drain, expect, biden, good]",[],0,1,1,1.0,0,4
2,1006460630692450304,"[glad, enjoy, beautiful, south, west]",[😍],1,1,0,0.0,0,1


Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets,puncts,avg_puncts,repeat_letters,stopwords
0,1005065863060475904,"[выносить, приговор]",[],0,1,1,1.0,0,3
1,1012429847531065349,"[добрый, вечер, радий, италия, куртка, коротки...",[],0,2,2,1.0,0,10
2,1013485198787440640,"[это, понимать, хороший, новость]",[],0,1,1,1.0,0,2


### Final Analysis

In [85]:
def get_text_info(text):
    words = len(text)
    chars = 0
    
    for word in text:
        chars += len(word)
    
    return words, chars

In [86]:
en_lengths = pd.DataFrame(en_lemmad.loc[:, 'text'].map(get_text_info).to_list(), columns=['amt_words', 'amt_chars'])
ru_lengths = pd.DataFrame(ru_lemmad.loc[:, 'text'].map(get_text_info).to_list(), columns=['amt_words', 'amt_chars'])

display(en_lengths.sort_values('amt_words', ascending=False).head())
display(ru_lengths.sort_values('amt_words', ascending=False).head())

Unnamed: 0,amt_words,amt_chars
82,351,2057
371,137,730
86,105,634
74,95,465
27,94,439


Unnamed: 0,amt_words,amt_chars
344,198,1406
41,156,961
81,122,819
251,89,623
139,87,571


In [43]:
en_final = en_lemmad.join(en_lengths, how='left')
ru_final = ru_lemmad.join(ru_lengths, how='left')

display(en_final.sort_values('amt_tweets', ascending=False).head())
display(ru_final.sort_values('amt_tweets', ascending=False).head())

Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets,puncts,avg_puncts,repeat_letters,stopwords,amt_words,amt_chars
82,1232360804004945921,"[need, read, propose, instruction, try, figure...",[],0,18,76,1.0,0,100,351,2057
193,1349182046569181185,"[actually, bro, spanish, flu, h1n1, virus, shi...",[],0,11,17,1.133333,0,23,83,374
439,501659753,"[thank, aww, bless, thank, even, thank, even, ...","[🥰, 😘, 🥰, 😘, 🥰, 😘, 😘, 🥰, 😘, 🥰, 🥰, 😘, ❤️, 🥰, 😘,...",23,10,1,1.0,0,14,37,163
371,3140037589,"[still, pretty, rare, ive, never, even, heard,...",[],0,8,27,1.0,0,35,137,730
240,1377312703992238088,"[look, totally, stun, wend, wow, lovely, wheat...",[],0,8,7,1.0,0,13,94,480


Unnamed: 0,author_id,text,emojis,emoji_len,amt_tweets,puncts,avg_puncts,repeat_letters,stopwords,amt_words,amt_chars
344,950066462588391425,"[читатель, наш, блог, давно, прекрасно, знать,...",[],0,13,19,1.0,0,163,198,1406
220,2638582010,"[гордиться, чмошество, бывать, офицер, казарма...",[],0,9,42,2.8,1,33,68,480
81,1243254820385034243,"[далёкий, госрегулирование, включить, тест, ко...",[],0,9,31,1.192308,0,89,122,819
83,1245394688447844353,"[божэ, говнищий, хуй, откуда, взяться, писать,...",[],0,9,16,1.333333,0,26,43,251
251,3577303277,"[️наташа, тышкевич, назначить, запрет, определ...","[🤷‍♀️, 🤷‍♀️, 🤷‍♀️]",3,8,6,1.0,0,28,89,623


In [44]:
# Saving
with open('en_processed.txt', 'w') as d:
    d.writelines(en_final.to_json(orient='table', force_ascii=False))
    
with open('ru_processed.txt', 'w') as d:
    d.writelines(ru_final.to_json(orient='table', force_ascii=False))