# Prétraitement des données

## Suppression des éléments indésirables de nos tweets

In [438]:
# Imports nécessaires
import pandas as pd
import numpy as np
import string 
import re
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

In [55]:
# Lecture et stockage de la base de données
tweet_df = pd.read_csv('../../delphes/data/final2_clean.csv', index_col=0)
tweet_df.head()

Unnamed: 0,mep_id,name,country,group,nat_group,twitter,content
0,197490,Magdalena ADAMOWICZ,Poland,Group of the European People's Party (Christia...,Independent,Adamowicz_Magda,"['W tym dniu, w tym miejscu, w tej godzinie pr..."
1,189525,Asim ADEMOV,Bulgaria,Group of the European People's Party (Christia...,Citizens for European Development of Bulgaria,AdemovAsim,['RT @ECinBulgaria: 📢 Остана 1⃣ седмица! Преди...
2,124831,Isabella ADINOLFI,Italy,Non-attached Members,Movimento 5 Stelle,Isa_Adinolfi,"[""Sembra un film, ma purtroppo è realtà: le im..."
6,125045,Clara AGUILERA,Spain,Group of the Progressive Alliance of Socialist...,Partido Socialista Obrero Español,ClaraAguilera7,['RT @ClaraAguilera7: Debate e importantes vot...
7,204335,Alviina ALAMETSÄ,Finland,Group of the Greens/European Free Alliance,Vihreä liitto,alviinaalametsa,['Toimeentulotukea korotetaan 75e koronakriisi...


In [56]:
# Remove the undesirable elements in the entire dataframe
def rmurl_df(df, column_name):
    '''
    This function removes all the URLs, the #hashtag and the @user of a column made of strings.
    Be careful to apply it BEFORE all the other preprocessing steps (if not it wont'
    be recognized as a URL)
    '''
    df = df.copy()
    df[column_name] = df[column_name].str.replace('http\S+|www.\S+|@\S+|#\S+', '', case=False)
    return df



In [57]:
# Lowercase the tweet's column
def lower_df(df, column_name):
    '''
    This function lowercases a column made of strings.
    '''
    df = df.copy()
    df[column_name] = df[column_name].str.lower()
    return df

In [58]:
# Remove the numbers in the tweet's column
def rmnumbers_df(df, column_name):
    '''
    This function removes all the digits of a column made of strings.
    '''
    df = df.copy()
    def remove_numbers(text):
        return ''.join(word for word in text if not word.isdigit())
    df[column_name] = df[column_name].apply(remove_numbers)
    return df

In [517]:
# Remove the undesirable punctuations in the tweet's column
def rmpunct_df(df, column_name):
    '''
    This function removes all the punctuations, all the "rt" and remove multiple spaces
    of a column made of strings.
    '''
    punct = string.punctuation
    df = df.copy()
    def replace_punct(text):
        for punctu in punct:
            text = text.replace(punctu, ' ')
        text = text.replace(' rt ','')
        text = " ".join(text.split())
        return text
    df[column_name] = df[column_name].apply(replace_punct)
    return df

In [573]:
# Remove the stopwords in the tweet's column
def rmstopwords_df(df, column_name):
    '''
    This function removes all the stopwords of a column made of strings.
    '''
    df = df.copy()
    stop_words = stopwords.words('english')
    def remove_stopwords(text):
        for word in stop_words:
            text = text.replace(f' {word} ', ' ')
        return text
    df[column_name] = df[column_name].apply(remove_stopwords)
    return df

In [574]:
# Remove the undesirable emojis in the entire dataframe
def rmemojis_df(df):
    '''
    This function removes all the emojis of a column made of strings.
    Be careful to translate in latin alphabet before applying this function : 
    it also removes cyrillic alphabet
    '''
    df = df.copy()
    df = df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
    return df

In [577]:
clean_df = rmurl_df(tweet_df, 'content')
clean_df = lower_df(clean_df, 'content')
clean_df = rmnumbers_df(clean_df, 'content')
clean_df = rmpunct_df(clean_df, 'content')
clean_df = rmstopwords_df(clean_df, 'content')
clean_df = rmemojis_df(clean_df)

In [578]:
clean_df[clean_df['country'] == 'Ireland'].head()

Unnamed: 0,mep_id,name,country,group,nat_group,twitter,content
136,124988,Deirdre CLUNE,Ireland,Group of the European People's Party (Christia...,Fine Gael Party,deirdreclunemep,the guidance preparing end transition period s...
146,197654,Ciarn CUFFE,Ireland,Group of the Greens/European Free Alliance,Green Party,ciarancuffe,it crucial follows phil hogan replacement big ...
208,197720,Frances FITZGERALD,Ireland,Group of the European People's Party (Christia...,Fine Gael Party,FitzgeraldFrncs,leaving london talks w continue difficult neg...
209,124985,Luke Ming FLANAGAN,Ireland,Group of the European United Left - Nordic Gre...,Independent,lukeming,the meat sector truly give toss n nit would co...
335,96668,Sen KELLY,Ireland,Group of the European People's Party (Christia...,Fine Gael Party,SeanKellyMEP,eu member sates reinstate solvency support ins...


In [579]:
testbase_df = clean_df[clean_df['country'] == 'Ireland']

## Premier test Word2Vec

In [580]:
testbase_df = testbase_df[['mep_id', 'content']]

In [581]:
from gensim.models import Word2Vec

In [582]:
sentences = testbase_df['content']

In [602]:
sentences_train = []
for sentence in sentences:
    sentences_train.append(sentence.split())

In [603]:
word2vec = Word2Vec(sentences=sentences_train)

In [604]:
def embed_sentence(word2vec, sentence):
    y = []
    for word in sentence:
        if word in word2vec.wv.vocab.keys():
           y.append(word2vec[word])
    return np.array(y)

def embedding(word2vec, sentences):
    
    y = []
    for sentence in sentences:
        y.append(embed_sentence(word2vec, sentence))
    return y

In [605]:
X_train = embedding(word2vec,sentences_train)

  """


In [606]:
from tensorflow.keras.utils import to_categorical
y_train = np.arange(0,10,1)
y_cat_train = to_categorical(y_train, num_classes=10)

In [607]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

def init_model():

    model = Sequential()
    model.add(layers.Masking(mask_value = -1000))
    model.add(layers.LSTM(13))
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(10, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop', 
              metrics=['accuracy'])
    return model

model = init_model()

In [608]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train_pad = pad_sequences(X_train, padding='post',value=-1000, dtype='float32')

In [609]:
X_train_pad.shape

(10, 2319, 100)

In [610]:
y_cat_train.shape

(10, 10)

In [613]:
model = init_model()
model.fit(X_train_pad, y_cat_train, epochs=100)

Train on 10 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
E

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x26511230d48>

In [620]:
examples2 = ['morning Deputy Dara Calleary tendered resignation Minister Agriculture Food Marine following attendance Oireachtas golf dinner Wednesday evening attendance event wrong error judgement part accepted resignation People country have made difficult personal sacrifices family lives businesses comply COVID-19 regulations event should gone ahead manner given Government decision last Tuesday Dara Calleary since first elected Dail Eireann remains committed dedicated public representative error judgement character made right decision country particularly light continued efforts suppress COVID-19',
            'guidance anger detect deputy']

In [621]:
examples2[0] = examples2[0].lower()

In [622]:
examples=[]
for example in examples2:
    examples.append(example.split())

In [623]:
examples[1]

['guidance', 'anger', 'detect', 'deputy']

In [624]:
X_test = embedding(word2vec, examples)
X_test_pad = pad_sequences(X_test, padding='post',value=-1000, dtype='float32')

  """


In [625]:
model.predict(X_test_pad)

array([[0.10833959, 0.08891478, 0.07604273, 0.12035228, 0.10818799,
        0.09217536, 0.08763052, 0.10064937, 0.10162988, 0.11607757],
       [0.10745704, 0.10042856, 0.10160366, 0.08884913, 0.09296093,
        0.1151541 , 0.08331128, 0.11196935, 0.11317366, 0.08509227]],
      dtype=float32)