In [3]:
import pandas as pd
import numpy as np
import io
from googletrans import Translator, LANGUAGES
from time import sleep
import string

df_full = pd.read_pickle("../../delphes/data/extended_tweet_df")
print("df_full imported")

df = df_full[df_full["country"] == "Poland"]
for i in ["Bulgaria", "Italy", "Spain", "Finland", "Sweden", "Latvia", "Germany", "Greece", "Luxembourg", "Estonia", "Belgium", "Romania", "France", "Denmark", "Lithuania", "Netherlands", "Slovakia", "Hungary", "Slovenia", "Croatia", "Portugal", "Malta", "Cyprus", "Ireland", "Czechia", "Austria"]:
    df = pd.concat([df, df_full[df_full["country"] == i]])
print("Countries selected")
    
def rmurl_df(df, column_name):
    '''
    This function removes all the URLs, the #hashtag and the @user of a column made of strings.
    Be careful to apply it BEFORE all the other preprocessing steps (if not it wont'
    be recognized as a URL)
    '''
    df = df.copy()
    df[column_name] = df[column_name].str.replace('http\S+|www.\S+|@\S+|#\S+', '', case=False)
    return df

def lower_df(df, column_name):
    '''
    This function lowercases a column made of strings and return the dataframe.
    '''
    df = df.copy()
    df[column_name] = df[column_name].str.lower()
    return df

def rmnumbers_df(df, column_name):
    '''
    This function removes all the digits of a column made of strings.
    '''
    df = df.copy()
    def remove_numbers(text):
        return ''.join(word for word in text if not word.isdigit())
    df[column_name] = df[column_name].apply(remove_numbers)
    return df

def rmpunct_df(df, column_name):
    '''
    This function removes all the punctuations, all the "rt" and remove multiple spaces
    of a column made of strings.
    '''
    punct = string.punctuation
    df = df.copy()
    def replace_punct(text):
        for punctu in punct:
            text = text.replace(punctu, ' ')
            text = text.replace(' rt ','')
            text = " ".join(text.split())
        return text
    df[column_name] = df[column_name].apply(replace_punct)
    return df

def rmemojis_df(df):
    '''
    This function removes all the emojis of a column made of strings.
    Be careful to translate in latin alphabet before applying this function :
    it also removes cyrillic alphabet.
    '''
    df = df.copy()
    df = df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
    return df


def erase_fewletter_df(df, column_name):
    '''
    One or two letters words are deleted from the dataset.
    '''
    df = df.copy()
    def tester(text):
        text = ' '.join( [w for w in text.split() if len(w)>2] )
        return text
    df[column_name] = df[column_name].apply(tester)
    return df



languages = ["polish", "bulgarian", "italian", "spanish", "finnish",
             "swedish", "greek", "estonian", "french", "slovak",
             "hungarian", "slovenian", "croatian", "portuguese", "english",
             "czech", "german"]

countries = ["Poland", "Bulgaria", "Italy", "Spain", "Finland", "Sweden",
       "Latvia", "Germany", "Greece", "Luxembourg", "Estonia", "Belgium",
       "Romania", "France", "Denmark", "Lithuania", "Netherlands",
       "Slovakia", "Hungary", "Slovenia", "Croatia", "Portugal", "Malta",
       "Cyprus", "Ireland", "Czechia", "Austria"]

coun_lan = {
    "Poland": "polish",
    "Bulgaria": "bulgarian",
    "Italy": "italian",
    "Spain": "spanish",
    "Finland": "finnish",
    "Sweden": "swedish",
    "Greece": "greek",
    "Estonia": "estonian",
    "Belgium": "french",
    "Slovakia": "slovak",
    "Hungary": "hungarian",
    "Slovenia": "slovenian",
    "Croatia": "croatian",
    "Portugal": "portuguese",
    "Malta": "english",
    "Czechia": "czech",
    "Austria": "german",
    "Ireland": "english",
    "France": "french",
    "Germany": "german"
}

nw_df = rmurl_df(df, "content")
nw_df = lower_df(nw_df, "content")

nw_df = erase_fewletter_df(nw_df, "content")

nw_df = rmnumbers_df(nw_df, "content")
nw_df = rmpunct_df(nw_df, "content").reset_index()
nw_df["lang"] = nw_df["country"].map(coun_lan)

print("Data cleaned")

def load_vec(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

nmax = 100000  # maximum number of word embeddings to load
emb_dict = {}
for lang in languages:
    path = f"../../raw_data/vectors_{lang}.txt" #Select here
    embeddings, id2word, word2id = load_vec(path, nmax)
    emb_dict[lang] = [embeddings, id2word, word2id]

print("Dict created")


def multilang_word_vector(word, emb_dict, lang):
    try:
        if word in emb_dict.get(lang)[2].keys():
            return emb_dict[lang][0][emb_dict[lang][2][word]]
        return []
    except:
        return []

nw_df["vectors"] = None
for index, row in nw_df.iterrows():
    res = []
    for i in nw_df["content"][index]:
        res.append(multilang_word_vector(i, emb_dict, nw_df["lang"][index]))
    nw_df["vectors"][index] = res

nw_df = nw_df[nw_df["vectors"].str.len() != 0].copy()
nw_df = nw_df[nw_df["vectors"].str.len() <= 280].copy()

print("Finished")
print(f"{len(nw_df)} tweets vetorised")


df_full imported
Countries selected
Data cleaned
Dict created


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Finished
134228 tweets vetorised


In [4]:
mean_df = nw_df.vectors.apply(lambda x: pd.DataFrame(x).mean())

KeyboardInterrupt: 

In [None]:
%%time
res = pd.concat([nw_df[["mep_id", "name", "country", "group", "nat_group", "twitter", "content", "lang"]], mean_df], axis=1)

In [None]:
res

In [None]:
res.to_hdf("full_mean_df_hdf.h5", key="h5")

In [None]:
nw_df[:int(len(nw_df)/100)].to_csv("test_5.csv")

In [None]:
%%time
res = []
for row in nw_df.itertuples()[:int(len(nw_df)/100)]:
    res.append(pd.DataFrame(row))