# Vectoring Tweets

In [145]:
import pandas as pd
import numpy as np
import io
from googletrans import Translator, LANGUAGES
from time import sleep
import string

## Embedding dict

In [146]:
def load_vec(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

**Select Languages**

In [147]:
languages = ["bulgarian","english"]

In [148]:
df

Unnamed: 0,mep_id,name,country,group,nat_group,twitter,content
300,189525,Asim ADEMOV,Bulgaria,Group of the European People's Party (Christia...,Citizens for European Development of Bulgaria,AdemovAsim,135 години единна и силна България. \nЧестит п...
301,189525,Asim ADEMOV,Bulgaria,Group of the European People's Party (Christia...,Citizens for European Development of Bulgaria,AdemovAsim,📢 Остана 1⃣ седмица! Предимствата да станете с...
302,189525,Asim ADEMOV,Bulgaria,Group of the European People's Party (Christia...,Citizens for European Development of Bulgaria,AdemovAsim,-@AdemovAsim ГЕРБ/ЕНП: Познавайки добре обстан...
303,189525,Asim ADEMOV,Bulgaria,Group of the European People's Party (Christia...,Citizens for European Development of Bulgaria,AdemovAsim,Въпреки че се появиха неканени на срещата на Г...
304,189525,Asim ADEMOV,Bulgaria,Group of the European People's Party (Christia...,Citizens for European Development of Bulgaria,AdemovAsim,Д-р Даниела Дариткова е прекрасен Човек и изкл...
...,...,...,...,...,...,...,...
116554,124852,Sergei STANISHEV,Bulgaria,Group of the Progressive Alliance of Socialist...,Bulgarian Socialist Party,SergeiStanishev,We unveil our #WesternBalkans strategy which i...
116555,124852,Sergei STANISHEV,Bulgaria,Group of the Progressive Alliance of Socialist...,Bulgarian Socialist Party,SergeiStanishev,Journalist: Will the European Parliament stick...
116556,124852,Sergei STANISHEV,Bulgaria,Group of the Progressive Alliance of Socialist...,Bulgarian Socialist Party,SergeiStanishev,"""We are advocating for #TransnationalLists and..."
116557,124852,Sergei STANISHEV,Bulgaria,Group of the Progressive Alliance of Socialist...,Bulgarian Socialist Party,SergeiStanishev,#Croatia PM @AndrejPlenkovic in #EPlenary toda...


**Select path to languages embedding txt**

In [149]:
nmax = 100000  # maximum number of word embeddings to load
emb_dict = {}
for lang in languages:
    path = f"../../raw_data/vectors_{lang}.txt" #Select here
    embeddings, id2word, word2id = load_vec(path, nmax)
    emb_dict[lang] = [embeddings, id2word, word2id]

In [150]:
def multilang_word_vector(word, emb_dict, lang=None):
    translator = Translator()
    if lang == None: lang = LANGUAGES[translator.detect(word).lang]
    lang_val = LANGUAGES.values()
    if lang in lang_val and lang in emb_dict.keys():
        if word in emb_dict.get(lang)[2].keys():
            return emb_dict[lang][0][emb_dict[lang][2][word]]
    return False

In [151]:
def vect_tweet(tweet):
    print(f"{nw_df.content[nw_df.content == tweet].index[0]/len(nw_df.content)*100:.2f}%")
    translator = Translator()
    if translator.detect(tweet).lang in LANGUAGES.keys():
        lang = LANGUAGES[translator.detect(tweet).lang]
        words = tweet.split(" ")
        res = []
        for i in words:
            res.append(multilang_word_vector(i, emb_dict, lang))
        return res
    return None

**Select path to df**

In [152]:
df_full = pd.read_pickle("../../delphes/data/extended_tweet_df")

In [153]:
df_full["country"].unique()

array(['Poland', 'Bulgaria', 'Italy', 'Spain', 'Finland', 'Sweden',
       'Latvia', 'Germany', 'Greece', 'Luxembourg', 'Estonia', 'Belgium',
       'Romania', 'France', 'Denmark', 'Lithuania', 'Netherlands',
       'Slovakia', 'Hungary', 'Slovenia', 'Croatia', 'Portugal', 'Malta',
       'Cyprus', 'Ireland', 'Czechia', 'Austria'], dtype=object)

**Select countries including the first value**

In [154]:
df = df_full[df_full["country"] == "Bulgaria"]
#for i in ["Denmark"]:
    #df = pd.concat([df, df_full[df_full["country"] == i]])

In [155]:
df.country.unique()

array(['Bulgaria'], dtype=object)

## Data clean

In [156]:
def rmurl_df(df, column_name):
    '''
    This function removes all the URLs, the #hashtag and the @user of a column made of strings.
    Be careful to apply it BEFORE all the other preprocessing steps (if not it wont'
    be recognized as a URL)
    '''
    df = df.copy()
    df[column_name] = df[column_name].str.replace('http\S+|www.\S+|@\S+|#\S+', '', case=False)
    return df

In [157]:
def lower_df(df, column_name):
    '''
    This function lowercases a column made of strings and return the dataframe.
    '''
    df = df.copy()
    df[column_name] = df[column_name].str.lower()
    return df

In [158]:
def rmnumbers_df(df, column_name):
    '''
    This function removes all the digits of a column made of strings.
    '''
    df = df.copy()
    def remove_numbers(text):
        return ''.join(word for word in text if not word.isdigit())
    df[column_name] = df[column_name].apply(remove_numbers)
    return df

In [159]:
def rmpunct_df(df, column_name):
    '''
    This function removes all the punctuations, all the "rt" and remove multiple spaces
    of a column made of strings.
    '''
    punct = string.punctuation
    df = df.copy()
    def replace_punct(text):
        for punctu in punct:
            text = text.replace(punctu, ' ')
            text = text.replace(' rt ','')
            text = " ".join(text.split())
        return text
    df[column_name] = df[column_name].apply(replace_punct)
    return df

In [160]:
def rmemojis_df(df):
    '''
    This function removes all the emojis of a column made of strings.
    Be careful to translate in latin alphabet before applying this function :
    it also removes cyrillic alphabet.
    '''
    df = df.copy()
    df = df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
    return df

In [161]:
nw_df = rmurl_df(df, "content")
nw_df = lower_df(nw_df, "content")
nw_df = rmnumbers_df(nw_df, "content")
nw_df = rmpunct_df(nw_df, "content")
nw_df = rmemojis_df(nw_df)

## Vector

In [162]:
emb_dict.keys()

dict_keys(['bulgarian', 'english'])

In [163]:
nw_df.content[2:3]

302                                          
Name: content, dtype: object

In [None]:
vecs = nw_df.content[:].map(vect_tweet)

11.38%
11.42%
11.46%
11.49%
11.53%
11.57%
11.61%
11.65%
11.68%
11.72%
11.76%
11.80%
11.42%
11.87%
11.91%
11.95%
11.99%
12.03%
12.06%
12.10%
12.14%
11.42%
12.22%
12.25%
11.87%
11.68%
12.37%
11.87%
12.44%
12.22%
12.52%
11.42%
12.59%
12.63%
12.67%
12.71%
12.75%
12.78%
12.37%
12.86%
12.90%
12.94%
12.97%
11.38%
13.05%
11.46%
12.59%
13.16%
13.20%
13.24%
13.28%
12.06%
13.35%
13.39%
11.53%
13.47%
13.51%
13.54%
13.58%
13.62%
13.66%
13.69%
11.72%
11.68%
13.81%
13.85%
11.80%
13.92%
13.96%
12.22%
14.04%
11.76%
14.11%
14.15%
14.19%
13.58%
14.26%
14.30%
11.42%
14.38%
14.42%
11.87%
12.06%
14.53%
11.42%
14.61%
14.61%
12.22%
14.15%
12.22%
11.57%
13.24%
11.46%
12.10%
13.69%
14.98%
15.02%
12.22%
11.76%
13.96%
13.51%
13.24%
15.25%
15.29%
12.22%
13.24%
12.22%
12.06%
13.51%
14.19%
12.22%
13.39%
12.06%
13.92%
11.53%
11.76%
14.53%
12.06%
11.53%
12.59%
14.15%
11.87%
13.35%
14.61%
16.08%
12.10%
12.37%
16.20%
16.24%
11.87%
12.59%
11.87%
11.87%
12.37%
13.51%
11.46%
12.06%
11.95%
16.62%
16.65%
16.69%
11.57%
13.96%

18.78%
2580.99%
14.53%
2581.07%
2581.11%
1341.16%
2581.18%
2581.22%
2581.26%
2581.30%
2581.34%
2581.37%
2581.41%
2581.45%
14.26%
2581.53%
2581.56%
2581.60%
2581.64%
2581.68%
2581.71%
2581.75%
2581.79%
2581.83%
2581.87%
2581.90%
18.78%
14.26%
2582.02%
1341.16%
18.78%
15.29%
11.38%
15.29%
1341.16%
2582.28%
2582.32%
20.11%
2582.40%
2582.44%
13.05%
13.69%
2582.55%
2582.59%
2582.63%
2582.66%
14.19%
11.80%
2582.78%
2582.81%
2582.85%
2582.89%
2582.93%
20.30%
13.24%
18.78%
13.05%
2583.12%
2583.16%
14.53%
2583.23%
13.05%
2583.31%
2583.35%
2583.38%
2583.42%
2583.46%
2583.50%
2583.54%
14.53%
20.11%
2583.65%
13.39%
2583.73%
14.19%
1341.16%
2663.51%
11.57%
2663.58%
2663.62%
2663.66%
2663.69%
2663.73%
2663.77%
2663.81%
2663.85%
19.23%
2663.92%
2663.96%
1339.61%
2664.04%
2664.07%
2664.11%
2664.15%
2664.19%
2664.23%
2664.26%
17.64%
2664.34%
2664.38%
2664.42%
2664.45%
2664.49%
2664.53%
2664.57%
2664.61%
2664.64%
2664.68%
13.24%
2664.76%
2664.80%
2664.83%
2664.87%
2664.91%
2664.95%
2664.98%
2665.02%
266

3475.83%
3475.87%
3475.91%
18.78%
3475.99%
3476.02%
3476.06%
3476.10%
3476.14%
2574.28%
3476.21%
3476.25%
12.22%
3476.33%
3476.37%
2576.37%
3476.44%
2575.00%
3476.52%
3476.56%
3476.59%
3476.63%
3476.67%
12.22%
3476.75%
3476.78%
3476.82%
3476.86%
3476.90%
3476.93%
3476.97%
3477.01%
3477.05%
3477.09%
3477.12%
3477.16%
3477.20%
3477.24%
3477.28%
3477.31%
3477.35%
3477.39%
3477.43%
3477.47%
3477.50%
3797.95%
11.72%
3798.03%
3798.07%
12.59%
12.59%
3798.18%
3798.22%
1341.58%
11.95%
3798.33%
3798.37%
12.59%
3798.44%
3798.48%
13.35%
11.46%
11.76%
3798.63%
1341.16%
17.64%
12.59%
12.59%
14.61%
15.29%
11.53%
11.72%
11.42%
1340.59%
3799.05%
3799.09%
13.24%
12.37%
14.61%
2575.00%
3799.28%
3799.32%
13.92%
3799.39%
11.42%
11.76%
3799.51%
3472.91%
3799.58%
3799.62%
3799.66%
3799.70%
3799.73%
3799.77%
3799.81%
3799.85%
11.76%
13.69%
3799.96%
2577.62%
13.58%
13.58%
13.35%
13.24%
3800.19%
12.37%
11.95%
11.72%
3800.34%
3800.38%
3800.42%
12.37%
3800.49%
11.46%
13.69%
3800.61%
3800.64%
3800.68%
3800.72%
11.

In [122]:
vecs.to_pickle("czech")

In [44]:
#vecs.to_csv("dutch_danish.csv")

In [21]:
translator = Translator()
LANGUAGES[translator.detect('moramo znati kakve proizvode kupujemo').lang]

'croatian'

In [47]:
data_de_ouf = pd.read_csv("dutch_danish.csv")

In [51]:
data_de_ouf.columns

Index(['Unnamed: 0', 'content'], dtype='object')

In [53]:
data_de_ouf.set_index("Unnamed: 0",inplace=True)

In [54]:
data_de_ouf[""]

Unnamed: 0_level_0,content
Unnamed: 0,Unnamed: 1_level_1
5676,"[array([-6.17971e-02, -7.04935e-02, -2.45169e-..."
5677,"[array([ 2.50098e-02, -5.89277e-03, 6.28855e-..."
5678,"[array([-0.0324474 , -0.0462027 , -0.00872643,..."
5679,"[array([-4.48872e-02, 2.14520e-02, -3.36201e-..."
5680,"[array([ 4.40220e-02, -1.75716e-02, 1.35894e-..."
...,...
53735,"[array([-1.06169e-01, -2.88038e-02, -3.92888e-..."
53736,"[array([-8.47695e-03, 2.10797e-02, -3.06245e-..."
53737,"[array([-2.46050e-02, -4.23989e-03, -6.23185e-..."
53738,"[array([ 1.85329e-02, -1.02918e-01, -1.19836e-..."


In [60]:
df_attempt = pd.read_pickle("dutch_danish")

In [81]:
pickle_df = pd.DataFrame(df_attempt)
pickle_df = pickle_df.reset_index()

In [82]:
pickle_df

Unnamed: 0,index,content
0,5676,"[[-0.0617971, -0.0704935, -0.0245169, 0.067761..."
1,5677,"[[0.0250098, -0.00589277, 0.0628855, 0.0705047..."
2,5678,"[[-0.0324474, -0.0462027, -0.00872643, 0.09936..."
3,5679,"[[-0.0448872, 0.021452, -0.0336201, 0.112377, ..."
4,5680,"[[0.044022, -0.0175716, 0.0135894, -0.00417867..."
...,...,...
11225,132290,"[[-0.0347214, -0.0174933, -0.0307816, 0.038113..."
11226,132291,"[[-0.126007, -0.0443534, -0.009703, 0.0663761,..."
11227,132292,"[[-0.0224834, -0.0404983, -0.0590012, 0.023879..."
11228,132293,"[[0.0510281, 0.00867496, -0.00653633, -0.00882..."


In [72]:
pickle_df[pickle_df["content"]=="None"]

Unnamed: 0,content


In [76]:
old_data = pd.read_pickle("../../delphes/data/extended_tweet_df")
old_data = old_data.reset_index()

In [84]:
merger = old_data.merge(pickle_df,on="index")

In [85]:
merger.to_pickle("merger")

In [86]:
merger

Unnamed: 0,index,mep_id,name,country,group,nat_group,twitter,content_x,content_y
0,5076,28161,Margrete AUKEN,Denmark,Group of the Greens/European Free Alliance,Socialistisk Folkeparti,MargreteAuken,Hvornår agerer #Bagmandspolitiet på anmeldelse...,"[False, [0.101227, -0.00355242, -0.0613298, 0...."
1,5077,28161,Margrete AUKEN,Denmark,Group of the Greens/European Free Alliance,Socialistisk Folkeparti,MargreteAuken,“Særligt” ansvar https://t.co/DoXm59xFsc,
2,5078,28161,Margrete AUKEN,Denmark,Group of the Greens/European Free Alliance,Socialistisk Folkeparti,MargreteAuken,Here is the text of our appeal to STOP NORD ST...,"[[-0.0540806, 0.0168135, 0.0596281, 0.0681775,..."
3,5079,28161,Margrete AUKEN,Denmark,Group of the Greens/European Free Alliance,Socialistisk Folkeparti,MargreteAuken,"Amazonas brænder, og skov- og klimakrisen er a...","[[0.00612917, 0.0156791, -0.0486738, 0.0480299..."
4,5080,28161,Margrete AUKEN,Denmark,Group of the Greens/European Free Alliance,Socialistisk Folkeparti,MargreteAuken,Covid-19 vaccine makers lobby the EU for immun...,"[False, [-0.0484475, 0.011017, -0.0737066, 0.0..."
...,...,...,...,...,...,...,...,...,...
11225,133995,5392,Lara WOLTERS,Netherlands,Group of the Progressive Alliance of Socialist...,Partij van de Arbeid,larawoltersEU,A more socially and environmentally sustainabl...,"[[0.0495577, 0.129444, -0.0491547, 0.00429166,..."
11226,133996,5392,Lara WOLTERS,Netherlands,Group of the Progressive Alliance of Socialist...,Partij van de Arbeid,larawoltersEU,https://t.co/CwKFTJMz8Y,[False]
11227,133997,5392,Lara WOLTERS,Netherlands,Group of the Progressive Alliance of Socialist...,Partij van de Arbeid,larawoltersEU,Goed gesprek met @EUombudsman Emily O’Reilly o...,"[[-0.0972181, -0.0736294, 0.00087396, 0.063221..."
11228,133998,5392,Lara WOLTERS,Netherlands,Group of the Progressive Alliance of Socialist...,Partij van de Arbeid,larawoltersEU,@pieterzwaan Ja hoor! Graag naar mijn EP email.,"[[-0.0196253, -0.00134424, -0.035015, -0.01173..."
