# Vectoring Tweets

In [1]:
import pandas as pd
import numpy as np
import io
from googletrans import Translator, LANGUAGES
from time import sleep
import string

## Embedding dict

In [2]:
def load_vec(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

**Select Languages**

In [3]:
languages = ["french", "german"]

**Select path to languages embedding txt**

In [4]:
nmax = 100000  # maximum number of word embeddings to load
emb_dict = {}
for lang in languages:
    path = f"../../raw_data/vectors_{lang}.txt" #Select here
    embeddings, id2word, word2id = load_vec(path, nmax)
    emb_dict[lang] = [embeddings, id2word, word2id]

In [5]:
emb_dict.keys()

dict_keys(['french', 'german'])

In [6]:
def multilang_word_vector(word, emb_dict, lang=None):
    translator = Translator()
    if lang == None: lang = LANGUAGES[translator.detect(word).lang]
    lang_val = LANGUAGES.values()
    if lang in lang_val and lang in emb_dict.keys():
        if word in emb_dict.get(lang)[2].keys():
            return emb_dict[lang][0][emb_dict[lang][2][word]]
    return False  

In [47]:
def vect_tweet(tweet):
    print(f"{nw_df.content[nw_df.content == tweet].index[0]/len(nw_df.content)*100:.2f}%")
    translator = Translator()
    if translator.detect(tweet).lang[:2] in LANGUAGES.keys():
        lang = LANGUAGES[translator.detect(tweet).lang[:2]]
        words = tweet.split(" ")
        res = []
        for i in words:
            res.append(multilang_word_vector(i, emb_dict, lang))
        return res
    return None

In [19]:
len(nw_df.content)

16791

**Select path to df**

In [8]:
df_full = pd.read_pickle("../../delphes/data/extended_tweet_df")

**Select countries including the first value**

In [9]:
df_full["country"].unique()

array(['Poland', 'Bulgaria', 'Italy', 'Spain', 'Finland', 'Sweden',
       'Latvia', 'Germany', 'Greece', 'Luxembourg', 'Estonia', 'Belgium',
       'Romania', 'France', 'Denmark', 'Lithuania', 'Netherlands',
       'Slovakia', 'Hungary', 'Slovenia', 'Croatia', 'Portugal', 'Malta',
       'Cyprus', 'Ireland', 'Czechia', 'Austria'], dtype=object)

In [10]:
df = df_full[df_full["country"] == "France"]
# for i in ["Germany"]:
#     df = pd.concat([df, df_full[df_full["country"] == i]])

## Data clean

In [11]:
def rmurl_df(df, column_name):
    '''
    This function removes all the URLs, the #hashtag and the @user of a column made of strings.
    Be careful to apply it BEFORE all the other preprocessing steps (if not it wont'
    be recognized as a URL)
    '''
    df = df.copy()
    df[column_name] = df[column_name].str.replace('http\S+|www.\S+|@\S+|#\S+', '', case=False)
    return df

In [12]:
def lower_df(df, column_name):
    '''
    This function lowercases a column made of strings and return the dataframe.
    '''
    df = df.copy()
    df[column_name] = df[column_name].str.lower()
    return df

In [13]:
def rmnumbers_df(df, column_name):
    '''
    This function removes all the digits of a column made of strings.
    '''
    df = df.copy()
    def remove_numbers(text):
        return ''.join(word for word in text if not word.isdigit())
    df[column_name] = df[column_name].apply(remove_numbers)
    return df

In [14]:
def rmpunct_df(df, column_name):
    '''
    This function removes all the punctuations, all the "rt" and remove multiple spaces
    of a column made of strings.
    '''
    punct = string.punctuation
    df = df.copy()
    def replace_punct(text):
        for punctu in punct:
            text = text.replace(punctu, ' ')
            text = text.replace(' rt ','')
            text = " ".join(text.split())
        return text
    df[column_name] = df[column_name].apply(replace_punct)
    return df

In [15]:
def rmemojis_df(df):
    '''
    This function removes all the emojis of a column made of strings.
    Be careful to translate in latin alphabet before applying this function :
    it also removes cyrillic alphabet.
    '''
    df = df.copy()
    df = df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
    return df

In [32]:
nw_df = rmurl_df(df, "content")
nw_df = lower_df(nw_df, "content")
nw_df = rmnumbers_df(nw_df, "content")
nw_df = rmpunct_df(nw_df, "content")
nw_df = rmemojis_df(nw_df).reset_index()

## Vector

In [None]:
vecs = nw_df.content.map(vect_tweet)

0.00%
0.01%
0.01%
0.02%
0.02%
0.03%
0.04%
0.04%
0.05%
0.05%
0.06%
0.07%
0.07%
0.08%
0.08%
0.09%
0.10%
0.10%
0.11%
0.11%
0.12%
0.13%
0.13%
0.14%
0.14%
0.15%
0.15%
0.16%
0.17%
0.17%
0.18%
0.18%
0.19%
0.20%
0.20%
0.21%
0.21%
0.22%
0.23%
0.23%
0.24%
0.24%
0.25%
0.26%
0.26%
0.27%
0.27%
0.28%
0.29%
0.29%
0.30%
0.30%
0.31%
0.32%
0.32%
0.33%
0.33%
0.34%
0.35%
0.35%
0.36%
0.36%
0.37%
0.38%
0.38%
0.39%
0.39%
0.40%
0.40%
0.41%
0.42%
0.42%
0.43%
0.43%
0.44%
0.45%
0.45%
0.46%
0.46%
0.47%
0.48%
0.48%
0.49%
0.49%
0.50%
0.51%
0.51%
0.52%
0.52%
0.53%
0.54%
0.54%
0.55%
0.55%
0.56%
0.57%
0.57%
0.58%
0.58%
0.59%
0.60%
0.60%
0.61%
0.61%
0.62%
0.63%
0.63%
0.64%
0.64%
0.65%
0.66%
0.66%
0.67%
0.67%
0.68%
0.68%
0.69%
0.70%
0.70%
0.71%
0.71%
0.72%
0.73%
0.73%
0.74%
0.74%
0.75%
0.76%
0.76%
0.77%
0.77%
0.78%
0.79%
0.79%
0.80%
0.80%
0.81%
0.82%
0.82%
0.83%
0.83%
0.84%
0.85%
0.85%
0.86%
0.86%
0.87%
0.88%
0.88%
0.89%
0.89%
0.90%
0.91%
0.91%
0.90%
0.92%
0.93%
0.94%
0.94%
0.95%
0.95%
0.96%
0.96%
0.97%
0.98%
0.98%
0.99

8.14%
8.14%
8.15%
8.15%
8.16%
8.17%
8.17%
8.18%
8.18%
8.19%
8.19%
8.20%
8.21%
8.21%
8.22%
8.22%
8.23%
8.24%
8.24%
8.25%
8.25%
8.26%
8.27%
8.27%
8.28%
8.28%
8.29%
8.30%
8.30%
8.31%
8.31%
8.32%
8.33%
8.33%
8.34%
8.34%
8.35%
8.36%
8.36%
8.37%
8.37%
8.38%
8.39%
8.39%
8.40%
8.40%
8.41%
8.42%
8.42%
8.43%
8.43%
8.44%
8.45%
8.45%
8.46%
8.46%
8.47%
8.47%
8.48%
8.49%
8.49%
8.50%
8.50%
8.51%
8.52%
8.52%
8.53%
8.53%
8.54%
8.55%
8.55%
8.56%
8.56%
8.57%
8.58%
8.58%
8.59%
8.59%
8.60%
8.61%
8.61%
8.62%
8.62%
8.63%
8.64%
8.64%
8.65%
8.65%
8.66%
8.67%
8.67%
8.68%
8.68%
8.69%
8.70%
8.70%
8.71%
8.71%
8.72%
8.72%
8.73%
8.74%
8.74%
8.75%
8.75%
8.76%
8.77%
8.77%
8.78%
8.58%
8.79%
8.80%
8.80%
8.81%
8.81%
8.82%
8.83%
8.83%
8.84%
8.84%
8.85%
8.86%
8.86%
8.87%
8.87%
8.88%
8.89%
8.89%
8.90%
8.90%
8.91%
8.92%
8.92%
8.93%
8.93%
8.94%
8.95%
8.95%
8.96%
8.96%
8.97%
1.81%
8.98%
8.99%
8.99%
9.00%
9.00%
9.01%
9.02%
9.02%
9.02%
9.03%
9.04%
9.05%
9.05%
9.06%
9.06%
9.05%
9.08%
9.08%
9.09%
9.09%
9.10%
9.11%
9.11%
9.11%
9.11

15.40%
15.40%
15.41%
15.41%
15.42%
15.42%
15.43%
15.44%
15.44%
15.45%
15.45%
15.46%
15.47%
15.47%
15.48%
15.48%
15.49%
15.50%
15.50%
15.51%
15.51%
15.52%
15.53%
15.53%
1.81%
15.54%
15.55%
1.81%
15.56%
15.57%
15.57%
15.58%
15.59%
15.59%
15.60%
15.60%
14.52%
15.62%
15.62%
15.63%
15.63%
15.64%
15.65%
15.65%
15.66%
15.66%
15.67%
15.68%
15.68%
15.69%
15.69%
15.70%
15.70%
15.44%
15.72%
15.72%
15.73%
15.73%
15.74%
15.75%
15.75%
15.76%
15.76%
14.52%
15.78%
15.78%
15.79%
15.79%
15.80%
15.81%
15.81%
15.82%
15.82%
15.83%
15.84%
15.84%
15.85%
15.85%
15.86%
15.87%
15.87%
15.88%
15.88%
15.89%
15.90%
15.90%
15.91%
15.91%
15.92%
15.93%
15.93%
15.94%
15.94%
15.95%
15.95%
15.96%
15.97%
15.97%
15.98%
15.98%
15.99%
16.00%
16.00%
16.01%
16.01%
16.02%
16.03%
16.03%
16.04%
16.04%
16.05%
16.06%
16.06%
16.07%
16.07%
16.08%
16.09%
16.09%
16.10%
16.10%
16.11%
16.12%
16.12%
16.13%
16.13%
16.14%
16.15%
9.17%
16.16%
16.16%
16.17%
16.18%
16.18%
16.19%
16.19%
16.20%
16.21%
16.21%
16.22%
16.22%
16.23%
16.23%
16.24%
16

22.38%
22.39%
22.39%
22.40%
22.40%
22.41%
22.42%
22.42%
22.43%
22.43%
22.44%
22.45%
22.45%
22.46%
22.46%
22.47%
22.48%
22.48%
22.49%
22.49%
22.50%
22.51%
22.51%
22.52%
22.52%
22.53%
22.54%
22.54%
22.55%
22.55%
22.56%
22.57%
22.57%
22.58%
22.58%
22.59%
22.60%
22.60%
22.61%
22.61%
22.62%
22.63%
22.63%
22.64%
22.64%
22.65%
22.65%
22.66%
22.67%
22.67%
22.68%
22.68%
22.69%
22.70%
22.70%
22.71%
22.71%
22.72%
22.73%
22.73%
22.74%
22.74%
22.75%
22.76%
22.76%
22.77%
22.77%
22.78%
22.79%
22.79%
22.80%
22.80%
22.81%
22.82%
22.82%
22.83%
22.83%
22.84%
22.85%
22.85%
22.86%
22.86%
22.87%
22.88%
22.88%
22.89%
22.89%
22.90%
22.91%
22.91%
22.92%
22.92%
22.93%
22.93%
22.94%
22.95%
22.95%
22.96%
22.96%
22.97%
22.98%
22.98%
22.99%
22.99%
23.00%
23.01%
23.01%
23.02%
23.02%
23.03%
23.04%
23.04%
23.05%
23.05%
23.06%
23.07%
23.07%
23.08%
23.08%
23.09%
23.10%
23.10%
23.11%
23.11%
23.12%
23.13%
23.13%
23.14%
23.14%
23.15%
23.16%
23.16%
23.17%
23.17%
23.18%
23.19%
23.19%
23.20%
23.20%
23.21%
23.21%
23.22%
23.23%

29.37%
29.37%
29.38%
29.38%
29.39%
29.40%
29.40%
29.41%
29.41%
29.42%
29.43%
29.43%
1.81%
29.44%
29.45%
29.46%
29.46%
29.47%
29.47%
29.48%
29.49%
29.49%
29.50%
29.50%
29.51%
29.52%
29.52%
29.53%
29.53%
29.54%
29.55%
29.55%
29.56%
29.56%
29.57%
29.58%
29.58%
29.59%
29.59%
29.60%
29.61%
29.61%
29.62%
29.62%
29.63%
29.63%
29.64%
29.65%
29.65%
1.81%
29.66%
29.67%
29.68%
29.68%
29.69%
29.69%
29.70%
29.71%
29.71%
1.81%
29.72%
29.73%
29.74%
29.74%
29.75%
29.75%
29.76%
29.77%
29.77%
29.78%
29.78%
29.79%
29.80%
29.80%
29.81%
29.81%
29.82%
29.83%
29.83%
29.84%
29.84%
29.85%
29.86%
29.86%
29.87%
1.81%
29.88%
29.89%
29.89%
29.90%
29.90%
29.91%
29.91%
29.92%
29.93%
29.93%
29.94%
29.94%
29.95%
29.96%
29.96%
1.81%
29.97%
1.81%
29.99%
29.99%
30.00%
30.00%
30.01%
30.02%
30.02%
30.03%
30.03%
30.04%
30.05%
30.05%
30.06%
30.06%
30.07%
30.08%
30.08%
30.09%
30.09%
30.10%
30.11%
30.11%
30.12%
30.12%
30.13%
30.14%
8.81%
30.15%
30.15%
30.16%
30.16%
30.17%
30.18%
30.18%
30.19%
30.19%
30.20%
30.21%
30.21%
30.22%

36.36%
36.36%
36.37%
36.38%
36.38%
36.35%
36.39%
36.40%
14.12%
36.41%
36.42%
36.42%
36.43%
11.79%
36.44%
36.45%
36.45%
36.46%
36.47%
36.47%
36.48%
36.48%
36.49%
36.50%
36.50%
36.51%
36.51%
36.52%
36.53%
36.53%
36.54%
36.54%
36.55%
36.56%
36.56%
36.57%
36.57%
36.58%
36.59%
36.59%
36.60%
36.60%
36.61%
36.61%
36.62%
36.63%
36.63%
36.64%
36.64%
36.65%
36.66%
36.66%
36.67%
36.67%
36.68%
36.69%
36.69%
36.70%
36.70%
36.71%
36.72%
36.72%
36.73%


In [None]:
vecs

In [36]:
3/len(nw_df.content)*100

0.017866714311238163

In [None]:
translator = Translator()
translator.detect("biljana borzan neu stati dok ne osiguram da graani dobiju najbolje za svoj novac").lang[:2]

In [None]:
translator.detect("hrvatska eurozastupnica te potpredsjednica").lang in LANGUAGES.keys()

In [None]:
LANGUAGES.keys()

In [None]:
LANGUAGES.values()