# Prétraitement des données

## Suppression des ponctuations indésirables

In [54]:
# Imports nécessaires
import pandas as pd
import numpy as np
import string 
import re

In [55]:
# Load the tweets dataset; the first CSV column becomes the row index
tweet_df = pd.read_csv(
    filepath_or_buffer='../../delphes/data/final2_clean.csv',
    index_col=0,
)
tweet_df.head(n=5)

Unnamed: 0,mep_id,name,country,group,nat_group,twitter,content
0,197490,Magdalena ADAMOWICZ,Poland,Group of the European People's Party (Christia...,Independent,Adamowicz_Magda,"['W tym dniu, w tym miejscu, w tej godzinie pr..."
1,189525,Asim ADEMOV,Bulgaria,Group of the European People's Party (Christia...,Citizens for European Development of Bulgaria,AdemovAsim,['RT @ECinBulgaria: 📢 Остана 1⃣ седмица! Преди...
2,124831,Isabella ADINOLFI,Italy,Non-attached Members,Movimento 5 Stelle,Isa_Adinolfi,"[""Sembra un film, ma purtroppo è realtà: le im..."
6,125045,Clara AGUILERA,Spain,Group of the Progressive Alliance of Socialist...,Partido Socialista Obrero Español,ClaraAguilera7,['RT @ClaraAguilera7: Debate e importantes vot...
7,204335,Alviina ALAMETSÄ,Finland,Group of the Greens/European Free Alliance,Vihreä liitto,alviinaalametsa,['Toimeentulotukea korotetaan 75e koronakriisi...


In [56]:
# Remove the URLs, @mentions and #hashtags in the tweet's column
def rmurl_df(df, column_name):
    '''
    Remove every URL, @user mention and #hashtag from a column made of strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_name : str
        Name of the text column to clean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every match in ``column_name`` has been
        replaced by an empty string (surrounding whitespace is left as is).

    Be careful to apply it BEFORE all the other preprocessing steps: once the
    punctuation is gone, URLs, mentions and hashtags can no longer be recognized.
    '''
    # Raw string so the backslashes reach the regex engine untouched. The dot
    # after "www" is escaped so it only matches a literal "." and ordinary
    # words that merely start with "www" are kept.
    pattern = r'http\S+|www\.\S+|@\S+|#\S+'
    df = df.copy()
    # regex=True is spelled out because pandas >= 2.0 treats the pattern as a
    # literal string by default, which would silently leave every URL in place.
    df[column_name] = df[column_name].str.replace(pattern, '', case=False, regex=True)
    return df



In [57]:
# Put the tweet's column in lowercase
def lower_df(df, column_name):
    '''
    Return a copy of ``df`` whose string column ``column_name`` is lowercased.
    The frame passed in is left untouched.
    '''
    lowered_column = df[column_name].str.lower()
    result = df.copy()
    result[column_name] = lowered_column
    return result

In [58]:
# Remove the digits in the tweet's column
def rmnumbers_df(df, column_name):
    '''
    Remove every digit character from a column made of strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_name : str
        Name of the text column to clean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which each string of ``column_name`` has lost the
        characters for which ``str.isdigit()`` is true. Cells that are not
        strings (for instance a missing value) are returned unchanged.
    '''
    def remove_digits(text):
        # A missing cell is not a string (NaN is a float, None is NoneType);
        # iterating over it would raise a TypeError, so it is passed through.
        if not isinstance(text, str):
            return text
        return ''.join(char for char in text if not char.isdigit())

    df = df.copy()
    df[column_name] = df[column_name].apply(remove_digits)
    return df

In [65]:
# Remove the undesirable punctuations in the tweet's column
def rmpunct_df(df, column_name):
    '''
    Remove the ASCII punctuation, the standalone "rt" tokens and the repeated
    whitespace from a column made of strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_name : str
        Name of the text column to clean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which each string of ``column_name`` has every
        character of ``string.punctuation`` turned into a space, every token
        equal to ``rt`` dropped, and the remaining tokens joined by single
        spaces. Cells that are not strings (for instance a missing value) are
        returned unchanged.

    Only the lowercase token ``rt`` is dropped, so lowercase the column first.
    '''
    # One translation table mapping each punctuation mark to a space, so the
    # whole replacement is a single pass over the text.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    def replace_punct(text):
        # A missing cell is not a string; leave it as it is instead of crashing.
        if not isinstance(text, str):
            return text
        tokens = text.translate(punct_to_space).split()
        # Filtering tokens (rather than replacing ' rt ' by '') keeps the words
        # around a retweet marker separated and also drops a leading/trailing "rt".
        return ' '.join(token for token in tokens if token != 'rt')

    df = df.copy()
    df[column_name] = df[column_name].apply(replace_punct)
    return df

In [66]:
# Remove the emojis (and every other non-ASCII character)
def rmemojis_df(df, column_name=None):
    '''
    Remove the emojis by dropping every character that is not ASCII.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_name : str, optional
        Name of the only column to clean. When omitted (default), EVERY column
        of the frame is cast to ``str`` and cleaned.

    Returns
    -------
    pandas.DataFrame
        A new frame. With ``column_name`` given, the other columns keep their
        values and dtypes; without it, all columns hold ASCII-only strings.

    Be careful: this is not limited to emojis. Any non-ASCII character is
    dropped, which includes the cyrillic alphabet and accented latin letters,
    so transliterate to plain latin characters before applying this function.
    '''
    def to_ascii(series):
        # Encoding to ASCII with errors='ignore' silently discards whatever
        # cannot be represented; decoding turns the bytes back into str.
        return series.str.encode('ascii', 'ignore').str.decode('ascii')

    if column_name is None:
        # Default: whole-frame cleaning, every column cast to str first.
        return df.astype(str).apply(to_ascii)

    df = df.copy()
    df[column_name] = to_ascii(df[column_name])
    return df

In [67]:
# Full cleaning pipeline. Order matters: URLs/mentions/hashtags are removed
# first (while they are still recognizable) and the ASCII filter comes last.
# Note that the last step is applied to the whole frame, not only to 'content'.
clean_df = (
    tweet_df
    .pipe(rmurl_df, 'content')
    .pipe(lower_df, 'content')
    .pipe(rmnumbers_df, 'content')
    .pipe(rmpunct_df, 'content')
    .pipe(rmemojis_df)
)

In [68]:
# Cleaned rows of the Irish MEPs. The mask is built from clean_df itself:
# the previously used `test_df` is never defined in this notebook, so the
# cell raised a NameError on a fresh kernel.
clean_df[clean_df['country'] == 'Ireland']

Unnamed: 0,mep_id,name,country,group,nat_group,twitter,content
136,124988,Deirdre CLUNE,Ireland,Group of the European People's Party (Christia...,Fine Gael Party,deirdreclunemep,the guidance on preparing for the end of the t...
146,197654,Ciarn CUFFE,Ireland,Group of the Greens/European Free Alliance,Green Party,ciarancuffe,it is crucial who follows phil hogan his repla...
208,197720,Frances FITZGERALD,Ireland,Group of the European People's Party (Christia...,Fine Gael Party,FitzgeraldFrncs,leaving london after talks w will continue th...
209,124985,Luke Ming FLANAGAN,Ireland,Group of the European United Left - Nordic Gre...,Independent,lukeming,the meat sector truly do not give a toss n nit...
335,96668,Sen KELLY,Ireland,Group of the European People's Party (Christia...,Fine Gael Party,SeanKellyMEP,eu member sates should reinstate the solvency ...
404,28115,Mairead McGUINNESS,Ireland,Group of the European People's Party (Christia...,Fine Gael Party,MaireadMcGMEP,happening this morning at am edt join us durin...
405,205452,Chris MACMANUS,Ireland,Group of the European United Left - Nordic Gre...,Sinn Fin,macmanuschris,the decision by electric ireland to hike its e...
478,197889,Grace O'SULLIVAN,Ireland,Group of the Greens/European Free Alliance,Green Party,graceosllvn,thought id bring a fresh new look to brussels ...
668,197817,Mick WALLACE,Ireland,Group of the European United Left - Nordic Gre...,Independents for change,wallacemick,there s huge frustration with lack of consiste...
669,197863,Maria WALSH,Ireland,Group of the European People's Party (Christia...,Fine Gael Party,mariawalsheu,congratulations lady amazing news great post ...


In [69]:
# Cleaned text of the row labelled 136, to compare with the raw text
clean_df.loc[136, 'content']

'the guidance on preparing for the end of the transition period with sector specific information many irish importers and exporters rely on the uk as a key route a land bridge to and from eu says worried amp disappointed by the failure to engage on what he terms fair play or level playing field uk the withdrawal agreement is only way to protect good friday agreement in all its dimensions and therefore to protect t the public consultation is open until september th this is your chance to and give in the digital services act will be important in amp as i outlined today in my latest in by allowing the parochialism of irelands domestic politics to dominate its engagement in europ  farm to fork is a new eu strategy to protect biodiversity and reduce pesticides and food waste making europe s food very welcome news for pubs  n nwe are worried about the state of play of the negotiations with we do not see how we can have a better agree the eu is working continuously to find ways of ensuring a 

In [53]:
# Raw (uncleaned) text of the row labelled 136
tweet_df.loc[136, 'content']

'[\'The @EU_Commission guidance on preparing for the end of the transition period, with sector specific information https://t.co/aHB3lwzQsx\', \'Many Irish importers and exporters rely on the UK as a key route, a land bridge to and from EU @MichelBarnier says… https://t.co/2SMROKEe7q\', \'.@MichelBarnier worried&amp;disappointed by the failure to engage on what he terms fair play or level playing field. UK… https://t.co/o3aScFy7lV\', \'RT @eurireland: The Withdrawal Agreement ... is only way to protect Good Friday Agreement in all its dimensions. And therefore to protect t…\', \'The @EU_Commission public consultation is open until September 8th, this is your chance to #haveyoursay and give in… https://t.co/ucDGV6DxVB\', \'The Digital Services Act will be important in #transport &amp; #tourism, as I outlined today in @EP_Transport… https://t.co/iZ874cgtGD\', \'RT @EoinDrea: My latest in @IrishTimesOpEd  By allowing the parochialism of Ireland’s domestic politics to dominate its engageme