# Vectoring Tweets

In [1]:
! pwd

/Users/simonpastor/code/simonjpastor/delphes/notebooks/simonjpastor_notebooks


In [2]:
import pandas as pd
import numpy as np
import io
from googletrans import Translator, LANGUAGES
from time import sleep
import string

## Embedding dict

In [3]:
def load_vec(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

**Select Languages**

In [4]:
languages = ["french","english"]

In [5]:
df

NameError: name 'df' is not defined

**Select path to languages embedding txt**

In [7]:
nmax = 100000  # maximum number of word embeddings to load
emb_dict = {}
for lang in languages:
    path = f"../../raw_data/vectors_{lang}.txt" #Select here
    embeddings, id2word, word2id = load_vec(path, nmax)
    emb_dict[lang] = [embeddings, id2word, word2id]

In [8]:
def multilang_word_vector(word, emb_dict, lang=None):
    translator = Translator()
    if lang == None: lang = LANGUAGES[translator.detect(word).lang]
    lang_val = LANGUAGES.values()
    if lang in lang_val and lang in emb_dict.keys():
        if word in emb_dict.get(lang)[2].keys():
            return emb_dict[lang][0][emb_dict[lang][2][word]]
    return False

In [9]:
def vect_tweet(tweet):
    print(f"{nw_df.content[nw_df.content == tweet].index[0]/len(nw_df.content)*100:.2f}%")
    translator = Translator()
    if translator.detect(tweet).lang in LANGUAGES.keys():
        lang = LANGUAGES[translator.detect(tweet).lang]
        words = tweet.split(" ")
        res = []
        for i in words:
            res.append(multilang_word_vector(i, emb_dict, lang))
        return res
    return None

**Select path to df**

In [10]:
df_full = pd.read_pickle("../../delphes/data/extended_tweet_df")

In [11]:
df_full["country"].unique()

array(['Poland', 'Bulgaria', 'Italy', 'Spain', 'Finland', 'Sweden',
       'Latvia', 'Germany', 'Greece', 'Luxembourg', 'Estonia', 'Belgium',
       'Romania', 'France', 'Denmark', 'Lithuania', 'Netherlands',
       'Slovakia', 'Hungary', 'Slovenia', 'Croatia', 'Portugal', 'Malta',
       'Cyprus', 'Ireland', 'Czechia', 'Austria'], dtype=object)

**Select countries including the first value**

In [12]:
df = df_full[df_full["country"] == "France"]
#for i in ["Denmark"]:
    #df = pd.concat([df, df_full[df_full["country"] == i]])

In [13]:
df.country.unique()

array(['France'], dtype=object)

## Data clean

In [14]:
def rmurl_df(df, column_name):
    '''
    This function removes all the URLs, the #hashtag and the @user of a column made of strings.
    Be careful to apply it BEFORE all the other preprocessing steps (if not it wont'
    be recognized as a URL)
    '''
    df = df.copy()
    df[column_name] = df[column_name].str.replace('http\S+|www.\S+|@\S+|#\S+', '', case=False)
    return df

In [15]:
def lower_df(df, column_name):
    '''
    This function lowercases a column made of strings and return the dataframe.
    '''
    df = df.copy()
    df[column_name] = df[column_name].str.lower()
    return df

In [16]:
def rmnumbers_df(df, column_name):
    '''
    This function removes all the digits of a column made of strings.
    '''
    df = df.copy()
    def remove_numbers(text):
        return ''.join(word for word in text if not word.isdigit())
    df[column_name] = df[column_name].apply(remove_numbers)
    return df

In [17]:
def rmpunct_df(df, column_name):
    '''
    This function removes all the punctuations, all the "rt" and remove multiple spaces
    of a column made of strings.
    '''
    punct = string.punctuation
    df = df.copy()
    def replace_punct(text):
        for punctu in punct:
            text = text.replace(punctu, ' ')
            text = text.replace(' rt ','')
            text = " ".join(text.split())
        return text
    df[column_name] = df[column_name].apply(replace_punct)
    return df

In [18]:
def rmemojis_df(df):
    '''
    This function removes all the emojis of a column made of strings.
    Be careful to translate in latin alphabet before applying this function :
    it also removes cyrillic alphabet.
    '''
    df = df.copy()
    df = df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
    return df

In [19]:
nw_df = rmurl_df(df, "content")
nw_df = lower_df(nw_df, "content")
nw_df = rmnumbers_df(nw_df, "content")
nw_df = rmpunct_df(nw_df, "content")
nw_df = rmemojis_df(nw_df)

## Vector

In [20]:
emb_dict.keys()

dict_keys(['french', 'english'])

In [21]:
nw_df.content[2:3]

4778    c est vrai qu il y avait plthore de masques au...
Name: content, dtype: object

In [23]:
vecs = nw_df.content[:].map(vect_tweet)

28.44%
28.45%
28.46%
28.46%
28.47%
28.47%
28.48%
28.49%
28.49%
28.50%
28.50%
28.51%
28.52%
28.52%
28.53%
28.53%
28.54%
28.55%
28.55%
28.56%
28.56%
28.57%
28.57%
28.58%
28.59%
28.59%
28.60%
28.60%
28.61%
28.62%
28.62%
28.63%
28.63%
28.64%
28.65%
28.65%
28.66%
28.66%
28.67%
28.68%
28.68%
28.69%
28.69%
28.70%
28.71%
28.71%
28.72%
28.72%
28.73%
28.74%
28.74%
28.75%
28.75%
28.76%
28.77%
28.77%
28.78%
28.78%
28.79%
28.80%
28.80%
28.81%
28.81%
28.82%
28.82%
28.83%
28.84%
28.84%
28.85%
28.85%
28.86%
28.87%
28.87%
28.88%
28.88%
28.89%
28.90%
28.90%
28.91%
28.91%
28.92%
28.93%
28.93%
28.94%
28.94%
28.95%
28.96%
28.96%
28.97%
28.97%
28.98%
28.99%
28.99%
29.00%
29.00%
29.01%
29.02%
29.02%
29.03%
29.03%
29.04%
29.05%
29.05%
29.06%
29.06%
29.07%
29.08%
29.08%
29.09%
29.09%
29.10%
29.10%
29.11%
29.12%
29.12%
29.13%
29.13%
29.14%
29.15%
29.15%
29.16%
29.16%
29.17%
29.18%
29.18%
29.19%
29.19%
29.20%
29.21%
29.21%
29.22%
29.22%
29.23%
29.24%
29.24%
29.25%
29.25%
29.26%
29.27%
29.27%
29.28%
29.28%
29.29%

52.50%
52.50%
52.51%
52.52%
52.52%
52.53%
52.49%
52.54%
52.55%
52.55%
52.56%
52.56%
52.57%
52.58%
52.58%
52.59%
52.59%
52.60%
52.61%
52.61%
52.62%
52.62%
52.63%
52.64%
52.64%
52.65%
38.36%
52.66%
52.67%
52.67%
37.40%
37.40%
52.69%
52.69%
52.70%
52.71%
52.71%
52.72%
52.72%
52.73%
52.74%
52.74%
52.75%
52.75%
52.76%
52.77%
52.77%
52.78%
52.78%
52.79%
52.80%
52.80%
52.81%
52.81%
52.82%
52.83%
52.83%
52.84%
52.84%
52.85%
52.86%
52.86%
52.87%
52.87%
52.88%
52.89%
52.89%
52.90%
52.90%
52.91%
52.92%
52.92%
52.93%
52.93%
52.94%
52.95%
52.95%
52.96%
52.96%
52.97%
52.97%
52.98%
52.99%
52.99%
53.00%
53.00%
53.01%
53.02%
53.02%
53.03%
53.03%
53.04%
53.05%
53.05%
53.06%
53.06%
53.07%
53.08%
53.08%
53.09%
53.09%
53.10%
53.11%
53.11%
53.12%
53.12%
53.13%
53.14%
53.14%
53.15%
53.15%
53.16%
53.17%
53.17%
53.18%
53.18%
53.19%
53.20%
53.20%
53.21%
53.21%
53.22%
53.22%
53.23%
53.24%
53.24%
53.25%
53.25%
53.26%
53.27%
53.27%
53.28%
53.28%
53.29%
53.30%
53.30%
53.31%
53.31%
53.32%
53.33%
53.33%
53.34%
53.34%

89.14%
89.15%
37.40%
89.16%
89.17%
89.17%
89.18%
87.82%
89.19%
89.20%
89.20%
89.21%
89.21%
89.22%
89.23%
89.23%
89.24%
89.24%
89.25%
89.26%
89.26%
89.24%
89.27%
89.28%
89.27%
37.40%
89.30%
89.30%
89.31%
89.32%
89.32%
89.33%
37.40%
37.40%
89.35%
29.34%
89.36%
89.36%
89.37%
89.38%
89.38%
89.39%
89.39%
89.40%
37.40%
89.41%
89.42%
89.42%
89.43%
89.43%
89.44%
89.45%
89.45%
89.46%
89.46%
89.47%
89.48%
89.48%
96.64%
96.64%
96.65%
96.65%
96.66%
96.66%
96.67%
96.68%
96.68%
37.40%
96.69%
96.70%
37.40%
96.71%
96.72%
96.72%
96.73%
96.74%
96.74%
96.75%
96.75%
96.76%
96.77%
96.77%
96.78%
37.40%
37.40%
37.40%
96.80%
96.81%
96.81%
96.82%
96.83%
96.83%
96.84%
96.84%
96.85%
96.86%
96.86%
96.87%
96.87%
96.88%
96.89%
96.89%
96.90%
96.90%
96.91%
96.92%
96.92%
96.93%
96.93%
96.94%
96.94%
96.95%
96.96%
96.96%
96.97%
96.97%
96.98%
96.99%
96.99%
97.00%
97.00%
97.01%
97.02%
97.02%
97.03%
97.03%
97.04%
97.05%
97.05%
97.06%
97.06%
97.07%
97.08%
97.08%
97.09%
97.09%
97.10%
97.11%
97.11%
97.12%
97.12%
97.13%
97.14%

120.53%
120.54%
120.55%
120.55%
120.56%
120.56%
120.57%
120.58%
120.58%
120.59%
120.59%
120.60%
120.61%
120.61%
120.62%
120.62%
120.63%
120.64%
120.64%
120.65%
120.65%
120.66%
120.67%
120.67%
120.68%
120.68%
120.69%
120.70%
106.04%
120.71%
120.71%
120.72%
120.73%
120.73%
120.74%
120.74%
120.75%
120.76%
120.76%
120.77%
120.77%
120.78%
96.98%
120.79%
120.80%
120.80%
120.81%
37.40%
120.82%
120.83%
120.83%
120.84%
120.84%
120.85%
37.40%
120.86%
120.87%
120.87%
120.88%
120.89%
120.89%
120.90%
120.90%
120.91%
120.92%
120.92%
120.93%
120.93%
120.94%
120.95%
120.95%
120.96%
120.96%
120.97%
120.98%
120.98%
120.99%
120.99%
121.00%
121.01%
121.01%
121.02%
121.02%
121.03%
37.40%
121.04%
121.05%
121.05%
121.06%
121.06%
121.07%
121.08%
121.08%
121.09%
121.09%
121.10%
121.11%
121.11%
121.12%
121.12%
121.13%
121.14%
121.14%
121.15%
121.15%
121.16%
106.40%
121.17%
121.18%
121.18%
121.19%
121.20%
121.20%
121.21%
121.21%
121.22%
121.23%
121.23%
121.24%
121.24%
121.25%
121.26%
121.26%
121.27%
121.27%
121.

141.52%
141.53%
141.53%
141.54%
141.55%
141.55%
141.56%
141.56%
141.57%
141.58%
141.58%
141.59%
141.59%
141.60%
141.61%
141.61%
141.62%
141.62%
141.63%
141.64%
141.64%
141.65%
141.65%
141.66%
141.67%
141.67%
141.68%
141.68%
141.69%
141.69%
141.70%
141.69%
141.71%
141.72%
141.72%
141.73%
141.74%
141.74%
141.75%
141.75%
141.76%
141.77%
141.77%
141.78%
141.78%
141.79%
141.80%
141.80%
141.81%
141.81%
141.82%
141.83%
141.83%
141.84%
141.84%
141.85%
141.86%
141.86%
141.87%
145.45%
145.45%
145.46%
145.46%
145.47%
145.48%
145.48%
96.86%
145.49%
145.50%
145.51%
145.51%
145.52%
145.52%
145.53%
145.54%
145.54%
145.55%
145.55%
37.40%
145.57%
145.57%
145.58%
145.58%
145.59%
145.60%
145.60%
145.61%
145.61%
145.62%
145.63%
145.63%
145.64%
145.64%
145.65%
145.66%
145.66%
145.67%
145.67%
145.68%
145.69%
145.69%
145.70%
145.70%
145.71%
145.71%
145.72%
145.73%
145.73%
145.74%
145.74%
145.75%
145.76%
145.76%
145.77%
145.77%
145.78%
145.79%
145.79%
145.80%
145.80%
145.81%
145.82%
145.82%
145.83%
145.83%
14

154.79%
154.80%
154.80%
154.81%
154.82%
154.82%
154.83%
154.83%
154.84%
154.84%
154.85%
154.86%
154.86%
154.87%
154.87%
154.88%
78.11%
154.89%
154.90%
154.90%
154.91%
154.92%
154.92%
154.93%
154.93%
154.94%
154.95%
154.95%
154.96%
154.96%
154.97%
154.98%
154.98%
154.99%
154.99%
155.00%
155.01%
155.01%
155.02%
29.18%
155.03%
155.04%
155.04%
155.05%
155.05%
155.06%
155.07%
37.40%
155.08%
155.08%
155.09%
155.09%
155.10%
155.11%
78.15%
155.12%
155.12%
155.13%
155.14%
155.14%
155.15%
155.15%
155.16%
155.17%
37.40%
155.18%
155.18%
155.19%
155.20%
155.20%
155.21%
155.21%
155.22%
155.23%
155.23%
37.40%
155.24%
155.25%
155.26%
155.26%
155.27%
155.27%
155.28%
155.29%
155.29%
155.30%
155.30%
155.31%
155.32%
155.32%
155.33%
155.33%
155.34%
155.35%
155.35%
155.36%
155.36%
155.37%
155.37%
155.38%
155.39%
78.21%
155.40%
155.40%
155.41%
155.42%
155.42%
155.43%
155.43%
155.44%
155.45%
155.45%
155.46%
155.46%
78.26%
155.48%
155.48%
155.49%
155.49%
155.50%
155.51%
155.51%
155.52%
155.52%
155.53%
155.54%


185.92%
185.93%
185.90%
185.94%
185.94%
185.96%
185.96%
185.97%
185.97%
185.98%
185.99%
185.99%
186.00%
186.00%
78.96%
186.02%
186.02%
186.02%
186.00%
186.04%
186.05%
186.05%
186.06%
186.06%
186.07%
186.08%
186.06%
186.09%
182.48%
186.10%
186.08%
186.11%
186.12%
186.12%
186.13%
186.14%
186.13%
186.15%
186.15%
186.16%
186.17%
186.17%
186.18%
186.18%
186.15%
186.19%
186.20%
186.21%
186.21%
186.22%
186.22%
186.23%
186.24%
186.24%
186.25%
186.25%
186.26%
186.27%
186.27%
186.28%
186.28%
186.29%
186.30%
186.30%
186.31%
186.31%
186.32%
186.33%
186.33%
186.30%
186.34%
186.35%
186.36%
186.36%
186.37%
186.37%
186.38%
186.39%
186.39%
186.40%
186.40%
186.41%
186.40%
186.42%
186.43%
186.43%
186.44%
186.45%
186.45%
186.46%
186.46%
186.47%
186.47%
186.47%
186.49%
186.49%
186.50%
186.50%
186.51%
186.52%
186.52%
186.53%
186.53%
191.90%
191.91%
191.91%
191.92%
191.92%
191.93%
191.94%
191.94%
191.95%
191.95%
191.96%
191.97%
191.97%
191.98%
191.98%
191.99%
192.00%
192.00%
192.01%
192.01%
192.02%
192.03%
1

283.96%
283.97%
283.97%
283.98%
283.99%
283.99%
284.00%
284.00%
77.72%
284.02%
284.02%
284.03%
284.03%
284.04%
284.05%
284.05%
284.06%
284.06%
284.07%
155.14%
284.08%
284.09%
284.09%
284.10%
284.10%
284.11%
284.12%
284.12%
284.13%
284.13%
284.14%
284.15%
284.15%
284.16%
284.16%
284.17%
284.18%
284.18%
284.19%
191.94%
284.20%
284.21%
284.21%
284.22%
284.22%
37.40%
284.24%
284.24%
284.25%
284.25%
284.26%
37.40%
284.27%
284.28%
284.28%
284.29%
284.30%
284.30%
284.31%
284.31%
284.32%
284.32%
284.33%
284.34%
284.34%
284.35%
284.35%
284.36%
284.37%
284.37%
284.38%
284.38%
284.39%
284.40%
284.40%
284.41%
284.41%
284.42%
284.43%
284.43%
284.44%
284.44%
121.88%
284.46%
284.46%
284.47%
78.71%
284.48%
284.49%
284.49%
284.50%
284.50%
284.51%
284.52%
284.52%
284.53%
284.53%
284.54%
284.55%
284.55%
284.56%
284.56%
284.57%
284.58%
284.58%
284.59%
284.59%
284.60%
37.40%
284.61%
284.62%
284.62%
284.63%
284.63%
284.64%
284.65%
284.65%
284.66%
284.66%
284.67%
284.68%
284.68%
284.69%
284.69%
37.40%
284.71

ConnectTimeout: _ssl.c:1059: The handshake operation timed out

In [None]:
vecs.to_pickle("bulgarian")

In [None]:
#vecs.to_csv("dutch_danish.csv")

In [None]:
translator = Translator()
LANGUAGES[translator.detect('moramo znati kakve proizvode kupujemo').lang]

In [None]:
data_de_ouf = pd.read_csv("dutch_danish.csv")

In [None]:
data_de_ouf.columns

In [None]:
data_de_ouf.set_index("Unnamed: 0",inplace=True)

In [None]:
data_de_ouf[""]

In [None]:
df_attempt = pd.read_pickle("dutch_danish")

In [None]:
pickle_df = pd.DataFrame(df_attempt)
pickle_df = pickle_df.reset_index()

In [None]:
pickle_df

In [None]:
pickle_df[pickle_df["content"]=="None"]

In [None]:
old_data = pd.read_pickle("../../delphes/data/extended_tweet_df")
old_data = old_data.reset_index()

In [None]:
merger = old_data.merge(pickle_df,on="index")

In [None]:
merger.to_pickle("merger")

In [None]:
merger