# Vectorizing Tweets

In [19]:
import pandas as pd
import numpy as np
import io
from googletrans import Translator, LANGUAGES
from time import sleep
import string

## Embedding dict

In [20]:
def load_vec(emb_path, nmax=50000):
    '''
    Read at most `nmax` word vectors from a text embedding file.

    The first line of the file is skipped (presumably a fastText-style
    header - confirm against the files used). Every following line is split
    on its first space into a word and a space-separated list of numbers.

    Returns a tuple (embeddings, id2word, word2id): `embeddings` is the
    row-wise stack of the parsed vectors, `word2id` maps each word to its row
    index and `id2word` is the inverse mapping. A word appearing twice
    triggers an AssertionError.
    '''
    word2id = {}
    rows = []
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        next(handle)  # discard the first line
        for raw_line in handle:
            token, numbers = raw_line.rstrip().split(' ', 1)
            parsed = np.fromstring(numbers, sep=' ')
            assert token not in word2id, 'word found twice'
            rows.append(parsed)
            # The next free row index is the current vocabulary size.
            word2id[token] = len(word2id)
            if len(word2id) == nmax:
                break
    embeddings = np.vstack(rows)
    id2word = {row: token for token, row in word2id.items()}
    return embeddings, id2word, word2id

**Select Languages**

In [21]:
# Languages to load embeddings for. Each name is interpolated into the
# embedding file path and used as a key of emb_dict; multilang_word_vector
# also requires it to be one of the values of googletrans' LANGUAGES.
languages = ["english", "spanish"]

**Select the path to each language's embedding .txt file**

In [22]:
nmax = 100000  # maximum number of word embeddings to load (overrides load_vec's default of 50000)
# emb_dict maps each language name to [embeddings, id2word, word2id],
# exactly as returned by load_vec for that language's file.
emb_dict = {}
for lang in languages:
    path = f"../../raw_data/vectors_{lang}.txt" # edit this template to point at the embedding files
    embeddings, id2word, word2id = load_vec(path, nmax)
    emb_dict[lang] = [embeddings, id2word, word2id]

In [23]:
def multilang_word_vector(word, emb_dict, lang=None):
    '''
    Look up the embedding vector of `word` in the embeddings of `lang`.

    Parameters
    ----------
    word : str
        The word to look up.
    emb_dict : dict
        Maps a language name to a sequence whose item 0 is the embedding
        matrix and whose item 2 is the word -> row-index mapping.
    lang : str, optional
        Language name (one of the values of googletrans' LANGUAGES). When
        omitted, the language is detected from `word` with googletrans.

    Returns
    -------
    The row of the embedding matrix for `word`, or False when the language is
    not recognised, has no entry in `emb_dict`, or does not contain `word`.
    '''
    if lang is None:
        # Build a Translator only when detection is actually needed; callers
        # that already know the language never touch googletrans.
        detected = Translator().detect(word).lang
        # A code that LANGUAGES does not know yields False rather than a KeyError.
        lang = LANGUAGES.get(detected)
        if lang is None:
            return False
    if lang not in LANGUAGES.values() or lang not in emb_dict:
        return False
    entry = emb_dict[lang]
    word2id = entry[2]
    if word not in word2id:
        return False
    return entry[0][word2id[word]]

In [24]:
def vect_tweet(tweet):
    '''
    Turn a tweet into a list of per-word embedding lookups.

    The tweet's language is detected once with googletrans. The tweet is then
    split on single spaces and every piece is looked up with
    multilang_word_vector in the module-level emb_dict.

    Returns a list with one entry per piece (whatever multilang_word_vector
    returns: the word's vector, or False when none is found), or None when the
    detected language code is not a key of LANGUAGES.
    '''
    # Detect once and reuse the result for both the membership check and the
    # name lookup, instead of querying the translator twice per tweet.
    detected = Translator().detect(tweet).lang
    if detected not in LANGUAGES:
        return None
    lang = LANGUAGES[detected]
    return [multilang_word_vector(word, emb_dict, lang) for word in tweet.split(" ")]

**Select path to df**

In [25]:
# Load the full tweet DataFrame from a pickle file.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
df_full = pd.read_pickle("../../delphes/data/extended_tweet_df")

**Select countries including the first value**

In [26]:
# Countries whose tweets are kept; add more names to the list as needed.
countries = ["Spain"]
# Collect one slice per country and concatenate once, rather than growing a
# DataFrame inside the loop (which re-copies all previous rows on every pass
# and starts from an empty frame).
country_frames = [df_full[df_full["country"] == country] for country in countries]
df = pd.concat(country_frames) if country_frames else pd.DataFrame()

In [27]:
# Preview the filtered frame (the rendering is truncated to the first and last rows).
df

Unnamed: 0,mep_id,name,country,group,nat_group,twitter,content
836,125045,Clara AGUILERA,Spain,Group of the Progressive Alliance of Socialist...,Partido Socialista Obrero Español,ClaraAguilera7,"Clara Aguilera: ""El criterio científico debe p..."
837,125045,Clara AGUILERA,Spain,Group of the Progressive Alliance of Socialist...,Partido Socialista Obrero Español,ClaraAguilera7,🇪🇺🎊🎊🎊as @EUfoodforum we are super proud that o...
838,125045,Clara AGUILERA,Spain,Group of the Progressive Alliance of Socialist...,Partido Socialista Obrero Español,ClaraAguilera7,"Clara Aguilera: ""El criterio científico debe p..."
839,125045,Clara AGUILERA,Spain,Group of the Progressive Alliance of Socialist...,Partido Socialista Obrero Español,ClaraAguilera7,"🗣️ @ClaraAguilera7: ""El criterio científico de..."
840,125045,Clara AGUILERA,Spain,Group of the Progressive Alliance of Socialist...,Partido Socialista Obrero Español,ClaraAguilera7,Última hora: Bélgica sitúa a toda España en ro...
...,...,...,...,...,...,...,...
136395,197621,Juan Ignacio ZOIDO ÁLVAREZ,Spain,Group of the European People's Party (Christia...,Partido Popular,zoidoji,Un partido como Podemos que forma parte del Go...
136396,197621,Juan Ignacio ZOIDO ÁLVAREZ,Spain,Group of the European People's Party (Christia...,Partido Popular,zoidoji,Y ahora @IreneMontero insiste. Algunos parecen...
136397,197621,Juan Ignacio ZOIDO ÁLVAREZ,Spain,Group of the European People's Party (Christia...,Partido Popular,zoidoji,Hace 29 años ETA atentó contra la casa cuartel...
136398,197621,Juan Ignacio ZOIDO ÁLVAREZ,Spain,Group of the European People's Party (Christia...,Partido Popular,zoidoji,"""La excusa que se ha puesto y la decisión que ..."


## Data cleaning

In [28]:
def rmurl_df(df, column_name):
    '''
    Remove URLs, #hashtags and @mentions from a column made of strings.

    Any run of non-space characters that starts with "http", "www.", "@" or
    "#" (matched case-insensitively) is deleted; surrounding whitespace is kept.
    Be careful to apply it BEFORE all the other preprocessing steps: once
    punctuation has been stripped these tokens can no longer be recognised.

    Returns a modified copy; the input DataFrame is left untouched.
    '''
    # Raw string so the \S escapes reach the regex engine unchanged, and an
    # escaped dot so that "www." only matches a literal period (otherwise
    # text such as "awww that" is treated as a URL).
    pattern = r'http\S+|www\.\S+|@\S+|#\S+'
    df = df.copy()
    # regex=True is passed explicitly: pandas >= 2.0 defaults to a literal
    # match, which would silently leave every string unchanged.
    df[column_name] = df[column_name].str.replace(pattern, '', case=False, regex=True)
    return df

In [29]:
def lower_df(df, column_name):
    '''
    Return a copy of the dataframe in which the given string column is lowercased.
    The input dataframe is left untouched.
    '''
    lowered = df.copy()
    lowered[column_name] = lowered[column_name].str.lower()
    return lowered

In [30]:
def rmnumbers_df(df, column_name):
    '''
    Return a copy of the dataframe in which every digit character has been
    removed from the given string column. Other characters, including the
    spaces around a removed number, are kept.
    '''
    stripped = df.copy()
    stripped[column_name] = stripped[column_name].apply(
        lambda text: ''.join(filter(lambda char: not char.isdigit(), text))
    )
    return stripped

In [31]:
def rmpunct_df(df, column_name):
    '''
    Remove punctuation and "rt" markers from a column made of strings and
    collapse whitespace.

    Every character of string.punctuation is replaced by a space, the text is
    split on whitespace, tokens equal to "rt" (lowercase, so lowercase the
    column first) are dropped wherever they occur, and the remaining tokens
    are joined with single spaces.

    Returns a modified copy; the input DataFrame is left untouched.
    '''
    # One translation table replaces all punctuation in a single pass.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    df = df.copy()
    def replace_punct(text):
        tokens = text.translate(punct_to_space).split()
        # Dropping whole tokens keeps the neighbouring words separated and also
        # catches an "rt" at the very start or end of the text.
        return " ".join(token for token in tokens if token != 'rt')
    df[column_name] = df[column_name].apply(replace_punct)
    return df

In [32]:
def rmemojis_df(df, column_name=None):
    '''
    Remove emojis by dropping every non-ASCII character.

    Each processed column is cast to str, encoded to ASCII with errors
    ignored, and decoded back. This removes emojis but ALSO accented letters
    and any non-latin script (e.g. cyrillic), so transliterate to plain latin
    characters first if those characters matter.

    Parameters
    ----------
    df : DataFrame
    column_name : label, optional
        Column to clean. When omitted, every column of the frame is cast to
        str and cleaned.

    Returns a modified copy; the input DataFrame is left untouched.
    '''
    def to_ascii(series):
        return series.astype(str).str.encode('ascii', 'ignore').str.decode('ascii')

    df = df.copy()
    if column_name is None:
        return df.astype(str).apply(to_ascii)
    # Restrict the clean-up to one column so the other columns keep their
    # dtypes and their non-ASCII characters.
    df[column_name] = to_ascii(df[column_name])
    return df

In [33]:
# Run the cleaning steps in order on the "content" column. URLs, mentions and
# hashtags go first, while their punctuation is still intact.
# NOTE(review): the last step is applied to the whole frame, so every column
# is cast to str and loses its non-ASCII characters.
nw_df = (
    df.pipe(rmurl_df, "content")
    .pipe(lower_df, "content")
    .pipe(rmnumbers_df, "content")
    .pipe(rmpunct_df, "content")
    .pipe(rmemojis_df)
)

## Vector

In [34]:
# Inspect the cleaned frame. Accented characters are missing from every column
# because rmemojis_df drops all non-ASCII characters, not only emojis.
nw_df

Unnamed: 0,mep_id,name,country,group,nat_group,twitter,content
836,125045,Clara AGUILERA,Spain,Group of the Progressive Alliance of Socialist...,Partido Socialista Obrero Espaol,ClaraAguilera7,clara aguilera el criterio cientfico debe prev...
837,125045,Clara AGUILERA,Spain,Group of the Progressive Alliance of Socialist...,Partido Socialista Obrero Espaol,ClaraAguilera7,as we are super proud that our member mep is n...
838,125045,Clara AGUILERA,Spain,Group of the Progressive Alliance of Socialist...,Partido Socialista Obrero Espaol,ClaraAguilera7,clara aguilera el criterio cientfico debe prev...
839,125045,Clara AGUILERA,Spain,Group of the Progressive Alliance of Socialist...,Partido Socialista Obrero Espaol,ClaraAguilera7,el criterio cientfico debe prevalecer ante la...
840,125045,Clara AGUILERA,Spain,Group of the Progressive Alliance of Socialist...,Partido Socialista Obrero Espaol,ClaraAguilera7,ltima hora blgica sita a toda espaa en rojo sa...
...,...,...,...,...,...,...,...
136395,197621,Juan Ignacio ZOIDO LVAREZ,Spain,Group of the European People's Party (Christia...,Partido Popular,zoidoji,un partido como podemos que forma parte del go...
136396,197621,Juan Ignacio ZOIDO LVAREZ,Spain,Group of the European People's Party (Christia...,Partido Popular,zoidoji,y ahora insiste algunos parecen empeados en us...
136397,197621,Juan Ignacio ZOIDO LVAREZ,Spain,Group of the European People's Party (Christia...,Partido Popular,zoidoji,hace aos eta atent contra la casa cuartel de d...
136398,197621,Juan Ignacio ZOIDO LVAREZ,Spain,Group of the European People's Party (Christia...,Partido Popular,zoidoji,la excusa que se ha puesto y la decisin que se...


In [None]:
# Vectorise every cleaned tweet. vect_tweet runs googletrans language detection
# for each tweet, so this cell is presumably slow on the full frame.
vecs = nw_df.content.map(vect_tweet)

In [26]:
# Wrap the Series in a DataFrame and move the original row index into an
# "index" column. NOTE(review): this rebinds `vecs`, so re-running this cell
# without re-running the previous one operates on the already-reset frame.
vecs = pd.DataFrame(vecs).reset_index()

In [27]:
# NOTE(review): the execution counts restart at 26 here and the saved output
# starts at index 20738, unlike the cleaned frame shown earlier (which starts
# at 836) - this output looks like it comes from an earlier run; re-run to confirm.
vecs

Unnamed: 0,index,content
0,20738,"[False, False, [-0.0564866, 0.0520961, -0.0289..."
1,20739,"[False, [0.0680034, -0.0906367, -0.0319666, 0...."
2,20740,"[False, False, False, False, False, False, Fal..."
3,20741,"[[-0.0393396, 0.0268131, -0.142318, 0.0613933,..."
4,20742,"[[-0.0393396, 0.0268131, -0.142318, 0.0613933,..."
...,...,...
7795,130490,"[[0.0109718, -0.0593663, -0.0322032, 0.0416514..."
7796,130491,"[[0.0186351, -0.0140546, -0.0420493, -0.067852..."
7797,130492,"[[0.0489321, -0.0598671, -0.0264063, -0.014537..."
7798,130493,"[[-0.133483, 0.0550787, -0.0270976, 0.00313864..."


In [28]:
# Persist the vectors as a pickle named "merger2" (relative path, so it lands
# in the current working directory).
vecs.to_pickle('merger2')