# Translation with Google Translate

## Imports

In [None]:
import pandas as pd
import numpy as np
from googletrans import Translator, LANGUAGES
from time import sleep

In [None]:
df = pd.read_csv("../../delphes/data/final2_clean.csv", index_col=0)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
sample = df.content.map(lambda x: x.split(","))

## Create the translator

In [None]:
translator = Translator()

In [None]:
def trad(X):
    """Detect the language of every element of ``X``.

    Each element is passed to the module-level ``translator``; after each
    detection the running count of processed elements is printed as a simple
    progress indicator.

    Returns the list of detected ``.lang`` values, in input order.
    """
    detected = []
    for position, item in enumerate(X, start=1):
        detected.append(translator.detect(item).lang)
        print(position)
    return detected

In [None]:
trad_content = sample[:2].apply(trad)

## Test Translator

In [None]:
translator.translate("представя приоритетите на германското председателство", dest='en').text

In [None]:
# Keep the result under its own name: binding it to `trad` would shadow the
# trad() function defined above and break the later cell that calls it.
detection = translator.detect("Асим Адемов: ЕС дава 50 млн. евро за еднократни помощи за земеделците с българската мярка Регламент COVID-19")
print(detection.lang)
print(detection.confidence)

In [None]:
set(trad(sample[1]))

In [None]:
len(sample[1])

In [None]:
test = translator.translate(sample[1], dest='en')

In [None]:
for t in test:
    print(t.text)
    print("\n###########\n")

# Muse

### Example

In [1]:
import io
import numpy as np

In [2]:
def load_vec(emb_path, nmax=50000):
    """Read word vectors from a text file.

    The first line of the file is skipped. Every following line is split at
    its first space into a word and a space-separated list of numbers.
    Reading stops once ``nmax`` words have been collected.

    Returns a tuple ``(embeddings, id2word, word2id)``: the vectors stacked
    row-wise, and the two mappings between row index and word.

    Raises AssertionError if the same word appears twice.
    """
    word2id = {}
    rows = []
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        next(handle)  # skip the first line
        for raw_line in handle:
            token, values = raw_line.rstrip().split(' ', 1)
            row = np.fromstring(values, sep=' ')
            assert token not in word2id, 'word found twice'
            rows.append(row)
            # ids are handed out in reading order: 0, 1, 2, ...
            word2id[token] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = dict(zip(word2id.values(), word2id.keys()))
    return np.vstack(rows), id2word, word2id

In [None]:
src_path = '../../raw_data/vectors_english.txt'
tgt_path = '../../raw_data/vectors_spanish.txt'
nmax = 50000  # maximum number of word embeddings to load

src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

In [3]:
def get_nn(word, src_emb, src_id2word, tgt_emb, tgt_id2word, K=5):
    """Print the ``K`` target words closest to ``word`` by cosine similarity.

    ``word`` is looked up in the source space through ``src_id2word`` (a
    KeyError is raised when it is absent); its vector is compared with every
    row of ``tgt_emb``. One line per neighbor is printed, best first, as
    ``score - word``. Nothing is returned.
    """
    print("Nearest neighbors of \"%s\":" % word)

    # invert the id -> word mapping to locate the row of the query word
    src_word2id = dict((token, row) for row, token in src_id2word.items())
    query = src_emb[src_word2id[word]]

    # cosine similarity = dot product of the L2-normalised vectors
    unit_targets = tgt_emb / np.linalg.norm(tgt_emb, 2, 1)[:, None]
    unit_query = query / np.linalg.norm(query)
    scores = unit_targets.dot(unit_query)

    ascending = scores.argsort()
    top = ascending[-K:][::-1]  # last K of the ascending order, best first
    for idx in top:
        print('%.4f - %s' % (scores[idx], tgt_id2word[idx]))

In [None]:
# printing nearest neighbors in the source space
src_word = 'cat'
get_nn(src_word, src_embeddings, src_id2word, src_embeddings, src_id2word, K=5)

In [None]:
# printing nearest neighbors in the target space
src_word = 'cat'
get_nn(src_word, src_embeddings, src_id2word, tgt_embeddings, tgt_id2word, K=5)

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2, whiten=True)  # TSNE(n_components=2, n_iter=3000, verbose=2)
pca.fit(np.vstack([src_embeddings, tgt_embeddings]))
print('Variance explained: %.2f' % pca.explained_variance_ratio_.sum())

In [None]:
import matplotlib.pyplot as plt


def plot_similar_word(src_words, src_word2id, src_emb, tgt_words, tgt_word2id, tgt_emb, pca):
    """Scatter-plot source and target words in the 2-D space given by ``pca``.

    The vector of each word is looked up through its ``*_word2id`` mapping,
    projected with ``pca.transform`` and drawn with its label: source words
    in blue, target words in red. The figure is shown; nothing is returned.
    """
    # collect the vectors, source words first, then target words
    src_vectors = [src_emb[src_word2id[word]] for word in src_words]
    tgt_vectors = [tgt_emb[tgt_word2id[word]] for word in tgt_words]
    labels = list(src_words) + list(tgt_words)
    colors = ['blue'] * len(src_words) + ['red'] * len(tgt_words)

    # project onto the first two coordinates returned by the fitted transformer
    projected = pca.transform(src_vectors + tgt_vectors)
    xs = projected[:, 0]
    ys = projected[:, 1]

    plt.figure(figsize=(10, 8), dpi=80)
    plt.scatter(xs, ys, marker='x')

    # write each word right on top of its point
    for label, color, x, y in zip(labels, colors, xs, ys):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points', fontsize=19,
                     color=color, weight='bold')

    # leave a small margin around the outermost points
    margin = 0.2
    plt.xlim(xs.min() - margin, xs.max() + margin)
    plt.ylim(ys.min() - margin, ys.max() + margin)
    plt.title('Visualization of the multilingual word embedding space')

    plt.show()

In [None]:
# hand-picked source words and the target-language words plotted next to them
src_words = ['university', 'love', 'history', 'tennis', 'research', 'conference']
tgt_words = ['universidad', 'amor', 'historia', u'tenis',  u'investigación', 'conferencia']

# fail early, naming the offending word, if one of them was not loaded
for sw in src_words:
    assert sw in src_word2id, '"%s" not in source dictionary' % sw
for tw in tgt_words:
    # report the target word itself (the message previously interpolated `sw`)
    assert tw in tgt_word2id, '"%s" not in target dictionary' % tw

plot_similar_word(src_words, src_word2id, src_embeddings, tgt_words, tgt_word2id, tgt_embeddings, pca)


### Full Language application

In [4]:
import pandas as pd
import numpy as np
from googletrans import Translator, LANGUAGES
from time import sleep
import string


In [None]:
languages_full = ["bulgarian", "catalan", "croatian", "czech", "danish", "dutch", "english", "estonian", 
              "finnish", "french", "german", "greek", "hungarian", "italian", "macedonian", "norwegian", 
              "polish", "portuguese", "romanian", "russian", "slovak", "slovenian", "spanish", "swedish", 
              "ukrainian"]

In [5]:
languages = ["english", "french", "german", "italian", "polish", "portuguese", "spanish"]

In [6]:
nmax = 100000  # maximum number of word embeddings to load
# Load the vectors of every language in `languages` once, keyed by language name.
emb_dict = {}
for lang in languages:
    path = f"../../raw_data/vectors_{lang}.txt"
    embeddings, id2word, word2id = load_vec(path, nmax)
    # stored as [embeddings, id2word, word2id]; lookups below use positions 0 and 2
    emb_dict[lang] = [embeddings, id2word, word2id]

In [7]:
def multilang_word_vector(word, emb_dict, lang=None):
    """Return the embedding of ``word`` in language ``lang``, or ``False``.

    Parameters
    ----------
    word : the token to look up.
    emb_dict : mapping ``language name -> [embeddings, id2word, word2id]``.
    lang : language name (a value of ``LANGUAGES``). When ``None`` the
        language of ``word`` is detected with a ``Translator``.

    Returns
    -------
    The row of ``embeddings`` for ``word``, or ``False`` when the language is
    unknown, has no entry in ``emb_dict``, or does not contain ``word``.
    """
    if lang is None:
        # Build a Translator only when a detection is actually needed.
        detected_code = Translator().detect(word).lang
        lang = LANGUAGES.get(detected_code)
        if lang is None:
            # detected code is not a known language
            return False

    # A valid language may still have no embeddings loaded: treat it as
    # "not found" instead of failing on `None[2]`.
    entry = emb_dict.get(lang)
    if entry is None or lang not in LANGUAGES.values():
        return False

    row = entry[2].get(word)
    if row is None:
        return False
    return entry[0][row]

In [42]:
def vect_tweet(tweet):
    """Vectorise a tweet word by word.

    The language of the whole tweet is detected once. The tweet is then split
    on single spaces and every token is looked up with
    ``multilang_word_vector`` in the module-level ``emb_dict``.

    Returns a list with one entry per token (whatever
    ``multilang_word_vector`` returns for it), or ``None`` when the detected
    language code is not in ``LANGUAGES``.
    """
    # Detect once and reuse the result: every detect() call is a separate
    # request, and two calls are not guaranteed to agree.
    detected_code = Translator().detect(tweet).lang
    lang = LANGUAGES.get(detected_code)
    if lang is None:
        return None
    return [multilang_word_vector(token, emb_dict, lang) for token in tweet.split(" ")]

In [None]:
LANGUAGES[translator.detect("asdgvatesdrjyvxdz").lang]

In [None]:
emb_dict["spanish"][2].keys()

In [13]:
df_full = pd.read_pickle("../../delphes/data/extended_tweet_df")

In [14]:
df_full.country.unique()

array(['Poland', 'Bulgaria', 'Italy', 'Spain', 'Finland', 'Sweden',
       'Latvia', 'Germany', 'Greece', 'Luxembourg', 'Estonia', 'Belgium',
       'Romania', 'France', 'Denmark', 'Lithuania', 'Netherlands',
       'Slovakia', 'Hungary', 'Slovenia', 'Croatia', 'Portugal', 'Malta',
       'Cyprus', 'Ireland', 'Czechia', 'Austria'], dtype=object)

In [59]:
# Keep only the tweets of the selected countries, grouped in the listed order
# (all Ireland rows first, then Poland, then Spain).
selected_countries = ["Ireland", "Poland", "Spain"]
df = pd.concat([df_full[df_full["country"] == name] for name in selected_countries])

In [60]:
df.country.unique()

array(['Ireland', 'Poland', 'Spain'], dtype=object)

In [None]:
vect_tweet(df["content"][0])

# Clean to vec

### data clean

In [None]:
df = pd.read_pickle("../../delphes/data/extended_tweet_df")

In [21]:
def rmurl_df(df, column_name):
    '''
    Return a copy of ``df`` in which URLs, @mentions and #hashtags have been
    removed from the string column ``column_name``.

    Matching is case-insensitive. Each match is replaced by an empty string,
    so the whitespace that surrounded it is left in place.
    Apply it BEFORE the other preprocessing steps: once punctuation has been
    stripped, URLs, mentions and hashtags can no longer be recognized.
    '''
    df = df.copy()
    # Raw string so that "\S" reaches the regex engine as written;
    # "www\." matches a literal dot rather than any character.
    pattern = r'http\S+|www\.\S+|@\S+|#\S+'
    # regex=True is explicit: since pandas 2.0 the default is a literal match,
    # which would silently leave every URL in place.
    df[column_name] = df[column_name].str.replace(pattern, '', case=False, regex=True)
    return df

In [22]:
def lower_df(df, column_name):
    '''
    Return a copy of ``df`` whose string column ``column_name`` is lowercased.
    The input dataframe is left untouched.
    '''
    lowered = df[column_name].str.lower()
    result = df.copy()
    result[column_name] = lowered
    return result

In [23]:
def rmnumbers_df(df, column_name):
    '''
    Return a copy of ``df`` where every digit character has been dropped from
    the string column ``column_name``. All other characters, including the
    spaces around the removed digits, are kept.
    '''
    def drop_digits(text):
        # the text is walked character by character
        kept = [char for char in text if not char.isdigit()]
        return ''.join(kept)

    result = df.copy()
    result[column_name] = result[column_name].apply(drop_digits)
    return result

In [24]:
def rmpunct_df(df, column_name):
    '''
    Return a copy of ``df`` where, in the string column ``column_name``:
      * every character of ``string.punctuation`` is turned into a space,
      * every standalone "rt" token is dropped (also at the start or end of
        the text), without gluing its neighbours together,
      * runs of whitespace are collapsed to a single space and the text is
        stripped.
    Only the lowercase token "rt" is matched, so lowercase the column first.
    '''
    # One translation table replaces all punctuation in a single pass.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    df = df.copy()
    def replace_punct(text):
        tokens = text.translate(punct_to_space).split()
        # Filtering tokens (instead of replacing ' rt ' with '') keeps the
        # surrounding words separated.
        return " ".join(token for token in tokens if token != 'rt')
    df[column_name] = df[column_name].apply(replace_punct)
    return df

In [25]:
def rmstopwords_df(df, column_name):
    '''
    Return a copy of ``df`` where the English stopwords have been removed from
    the string column ``column_name``.

    A stopword is only removed where it stands between two spaces; the match
    (both spaces included) is replaced by a single space.
    '''
    # NOTE(review): `stopwords` is not imported in the visible part of this
    # file - presumably nltk.corpus.stopwords; confirm before using.
    english_stopwords = stopwords.words('english')

    def strip_stopwords(sentence):
        cleaned = sentence
        for stopword in english_stopwords:
            cleaned = cleaned.replace(f' {stopword} ', ' ')
        return cleaned

    result = df.copy()
    result[column_name] = result[column_name].apply(strip_stopwords)
    return result

In [26]:
def rmemojis_df(df, column_name=None):
    '''
    Return a copy of ``df`` with every non-ASCII character removed.

    With ``column_name=None`` (the default) ALL columns are converted to
    ``str`` and cleaned, so non-string columns come back as strings.
    When ``column_name`` is given, only that column is converted and cleaned
    and the other columns are returned unchanged.

    Emojis disappear because they are not ASCII, but so does every other
    non-ASCII character (accented letters, Cyrillic, ...): transliterate to
    the latin alphabet first if that text must survive.
    '''
    def to_ascii(series):
        # encode with errors ignored drops what ASCII cannot represent
        return series.astype(str).str.encode('ascii', 'ignore').str.decode('ascii')

    df = df.copy()
    if column_name is None:
        return df.astype(str).apply(to_ascii)
    df[column_name] = to_ascii(df[column_name])
    return df

In [61]:
# Cleaning pipeline on the "content" column; order matters (URLs go first).
nw_df = (
    df.pipe(rmurl_df, "content")
    .pipe(lower_df, "content")
    .pipe(rmnumbers_df, "content")
    .pipe(rmpunct_df, "content")
    .pipe(rmemojis_df)
)

### To vec

In [55]:
nw_df.content[:10]

24722    the northern ireland protocol must be protecte...
24723    as a member of the special committee on beatin...
24724    this month in we are shining a light on childr...
24725    yesterday i told that in europe we need to kee...
24726    this is the letter sent to an taoiseach to be ...
24727    the guidance on preparing for the end of the t...
24728    many irish importers and exporters rely on the...
24729    worried amp disappointed by the failure to eng...
24730    the withdrawal agreement is only way to protec...
24731    the public consultation is open until septembe...
Name: content, dtype: object

In [62]:
test = nw_df.content[:10].map(vect_tweet)

In [63]:
nw_df.shape

(28284, 7)

In [64]:
28284 / 10 * 6 / 60 /60

4.714