In [64]:
import pandas as pd
import numpy as np
import io
from googletrans import Translator, LANGUAGES
from time import sleep
import string

df_full = pd.read_pickle("../../delphes/data/extended_tweet_df")
print("df_full imported")

df = df_full[df_full["country"] == "Poland"]
for i in ["Bulgaria", "Italy", "Spain", "Finland", "Sweden", "Latvia", "Germany", "Greece", "Luxembourg", "Estonia", "Belgium", "Romania", "France", "Denmark", "Lithuania", "Netherlands", "Slovakia", "Hungary", "Slovenia", "Croatia", "Portugal", "Malta", "Cyprus", "Ireland", "Czechia", "Austria"]:
    df = pd.concat([df, df_full[df_full["country"] == i]])
print("Countries selected")
    
def rmurl_df(df, column_name):
    '''
    This function removes all the URLs, the #hashtag and the @user of a column made of strings.
    Be careful to apply it BEFORE all the other preprocessing steps (if not it wont'
    be recognized as a URL)
    '''
    df = df.copy()
    df[column_name] = df[column_name].str.replace('http\S+|www.\S+|@\S+|#\S+', '', case=False)
    return df

def lower_df(df, column_name):
    '''
    This function lowercases a column made of strings and return the dataframe.
    '''
    df = df.copy()
    df[column_name] = df[column_name].str.lower()
    return df

def rmnumbers_df(df, column_name):
    '''
    This function removes all the digits of a column made of strings.
    '''
    df = df.copy()
    def remove_numbers(text):
        return ''.join(word for word in text if not word.isdigit())
    df[column_name] = df[column_name].apply(remove_numbers)
    return df

def rmpunct_df(df, column_name):
    '''
    This function removes all the punctuations, all the "rt" and remove multiple spaces
    of a column made of strings.
    '''
    punct = string.punctuation
    df = df.copy()
    def replace_punct(text):
        for punctu in punct:
            text = text.replace(punctu, ' ')
            text = text.replace(' rt ','')
            text = " ".join(text.split())
        return text
    df[column_name] = df[column_name].apply(replace_punct)
    return df

def rmemojis_df(df):
    '''
    This function removes all the emojis of a column made of strings.
    Be careful to translate in latin alphabet before applying this function :
    it also removes cyrillic alphabet.
    '''
    df = df.copy()
    df = df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
    return df

languages = ["polish", "bulgarian", "italian", "spanish", "finnish",
             "swedish", "greek", "estonian", "french", "slovak",
             "hungarian", "slovenian", "croatian", "portuguese", "english",
             "czech", "german"]

countries = ["Poland", "Bulgaria", "Italy", "Spain", "Finland", "Sweden",
       "Latvia", "Germany", "Greece", "Luxembourg", "Estonia", "Belgium",
       "Romania", "France", "Denmark", "Lithuania", "Netherlands",
       "Slovakia", "Hungary", "Slovenia", "Croatia", "Portugal", "Malta",
       "Cyprus", "Ireland", "Czechia", "Austria"]

coun_lan = {
    "Poland": "polish",
    "Bulgaria": "bulgarian",
    "Italy": "italian",
    "Spain": "spanish",
    "Finland": "finnish",
    "Sweden": "swedish",
    "Greece": "greek",
    "Estonia": "estonian",
    "Belgium": "french",
    "Slovakia": "slovak",
    "Hungary": "hungarian",
    "Slovenia": "slovenian",
    "Croatia": "croatian",
    "Portugal": "portuguese",
    "Malta": "english",
    "Czechia": "czech",
    "Austria": "german",
    "Ireland": "english",
    "France": "french",
    "Germany": "german"
}

nw_df = rmurl_df(df, "content")
nw_df = lower_df(nw_df, "content")
nw_df = rmnumbers_df(nw_df, "content")
nw_df = rmpunct_df(nw_df, "content").reset_index()
nw_df["lang"] = nw_df["country"].map(coun_lan)

print("Data cleaned")

def load_vec(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

nmax = 100000  # maximum number of word embeddings to load
emb_dict = {}
for lang in languages:
    path = f"../../raw_data/vectors_{lang}.txt" #Select here
    embeddings, id2word, word2id = load_vec(path, nmax)
    emb_dict[lang] = [embeddings, id2word, word2id]

print("Dict created")


def multilang_word_vector(word, emb_dict, lang):
    try:
        if word in emb_dict.get(lang)[2].keys():
            return emb_dict[lang][0][emb_dict[lang][2][word]]
        return []
    except:
        return []

nw_df["vectors"] = None
for index, row in nw_df.iterrows():
    res = []
    for i in nw_df["content"][index]:
        res.append(multilang_word_vector(i, emb_dict, nw_df["lang"][index]))
    nw_df["vectors"][index] = res

nw_df = nw_df[nw_df["vectors"].str.len() != 0]
nw_df = nw_df[nw_df["vectors"].str.len() <= 280]

print("Finished")
print(f"{len(nw_df)} tweets vetorised")


df_full imported
Countries selected
Data cleaned
Dict created


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Finished
134724 tweets vetorised


In [65]:
mean_df = nw_df.vectors.apply(lambda x: pd.DataFrame(x).mean())

In [66]:
%%time
res = pd.concat([nw_df[["mep_id", "name", "country", "group", "nat_group", "twitter", "content", "lang"]], mean_df], axis=1)

CPU times: user 80.2 ms, sys: 9.5 ms, total: 89.7 ms
Wall time: 89.1 ms


In [67]:
res

Unnamed: 0,mep_id,name,country,group,nat_group,twitter,content,lang,0,1,...,290,291,292,293,294,295,296,297,298,299
0,197490,Magdalena ADAMOWICZ,Poland,Group of the European People's Party (Christia...,Independent,Adamowicz_Magda,bardzo dziękuję niezależne od władzy wolne med...,polish,-0.012661,0.002554,...,0.007531,-0.041275,0.029452,0.030989,0.010538,-0.049243,-0.025033,0.058314,0.019128,0.022892
1,197490,Magdalena ADAMOWICZ,Poland,Group of the European People's Party (Christia...,Independent,Adamowicz_Magda,komisja przyjęła arcyważne projekty pilotażowe...,polish,-0.018363,0.009202,...,0.010615,-0.039067,0.031028,0.025055,0.008500,-0.053187,-0.030836,0.053642,0.019144,0.015897
2,197490,Magdalena ADAMOWICZ,Poland,Group of the European People's Party (Christia...,Independent,Adamowicz_Magda,komisja przyjęła projekty pilotażowe mojego ws...,polish,-0.019916,0.006455,...,0.005200,-0.037808,0.030887,0.027038,0.009016,-0.054631,-0.028361,0.054603,0.024452,0.018427
3,197490,Magdalena ADAMOWICZ,Poland,Group of the European People's Party (Christia...,Independent,Adamowicz_Magda,w tym dniu w tym miejscu w tej godzinie proszę...,polish,-0.010152,0.005874,...,0.016721,-0.040829,0.033006,0.023900,0.013247,-0.052095,-0.029642,0.054064,0.021562,0.014025
4,197490,Magdalena ADAMOWICZ,Poland,Group of the European People's Party (Christia...,Independent,Adamowicz_Magda,bóg nie potrzebuje być przez nikogo broniony i...,polish,-0.017805,0.005459,...,0.012571,-0.039137,0.037016,0.028174,0.013037,-0.056093,-0.035508,0.053920,0.019459,0.015463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137295,197652,Angelika WINZIG,Austria,Group of the European People's Party (Christia...,Österreichische Volkspartei,AngelikaWinzig,italiens budgetpolitik gefährdet nicht nur ita...,german,-0.032662,-0.020359,...,-0.006590,-0.010474,0.038501,0.017754,-0.031848,-0.114873,-0.063869,-0.001768,0.014983,0.010953
137296,197652,Angelika WINZIG,Austria,Group of the European People's Party (Christia...,Österreichische Volkspartei,AngelikaWinzig,danke martin für die organisation des eu somme...,german,-0.035984,-0.029438,...,-0.009227,-0.008789,0.036854,0.021920,-0.032733,-0.112913,-0.060042,-0.002471,0.018086,0.008367
137297,197652,Angelika WINZIG,Austria,Group of the European People's Party (Christia...,Österreichische Volkspartei,AngelikaWinzig,tadition amp moderne top betriebe netzwerktref...,german,-0.028948,-0.023442,...,-0.006545,-0.011271,0.042163,0.016308,-0.039746,-0.117086,-0.060991,-0.004830,0.018593,0.007534
137298,197652,Angelika WINZIG,Austria,Group of the European People's Party (Christia...,Österreichische Volkspartei,AngelikaWinzig,hier mein statement zur budgetrede von finanzm...,german,-0.034614,-0.024161,...,-0.010266,-0.012133,0.036686,0.015777,-0.028820,-0.107009,-0.058627,0.000139,0.018164,0.005121


In [68]:
res.to_hdf("full_mean_df_hdf.h5", key="h5")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->axis0] [items->None]

  f(store)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['mep_id', 'name', 'country', 'group', 'nat_group', 'twitter', 'content',
       'lang'],
      dtype='object')]

  encoding=encoding,
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block1_items] [items->None]

  f(store)


In [12]:
nw_df[:int(len(nw_df)/100)].to_csv("test_5.csv")

In [50]:
%%time
res = []
for row in nw_df.itertuples()[:int(len(nw_df)/100)]:
    res.append(pd.DataFrame(row))

CPU times: user 1min 49s, sys: 601 ms, total: 1min 49s
Wall time: 1min 50s
