In [51]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
pd.options.mode.chained_assignment = None
nltk.download('stopwords')

full_df = pd.read_csv("/content/twcs.csv", nrows=5000)
df = full_df[["text"]]
df["text"] = df["text"].astype(str)
full_df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [53]:
#Mise en minuscules
df["text_cleaned"] = df["text"].str.lower()
df.head()

Unnamed: 0,text,text_lower,text_cleaned
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,@115712 i understand. i would like to assist y...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,@sprintcare i have sent several private messag...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,@115712 please send us a private message so th...
4,@sprintcare I did.,@sprintcare i did.,@sprintcare i did.


In [54]:
#Suppression des ponctuations
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

df["text_cleaned"] = df["text"].apply(lambda text: remove_punctuation(text))
df.head()

Unnamed: 0,text,text_lower,text_cleaned
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 I understand I would like to assist you...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcare and how do you propose we do that
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcare I have sent several private message...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 Please send us a Private Message so tha...
4,@sprintcare I did.,@sprintcare i did.,sprintcare I did


In [55]:
#Suppression des mots vides
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

df["text_cleaned"] = df["text_cleaned"].apply(lambda text: remove_stopwords(text))
df.head()

Unnamed: 0,text,text_lower,text_cleaned
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 I understand I would like assist We wou...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcare propose
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcare I sent several private messages one...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 Please send us Private Message assist J...
4,@sprintcare I did.,@sprintcare i did.,sprintcare I


In [56]:
#Suppression des mots fréquents
from collections import Counter
cnt = Counter()
for text in df["text_cleaned"].values:
    for word in text.split():
        cnt[word] += 1

cnt.most_common(10)

[('I', 1437),
 ('us', 752),
 ('DM', 514),
 ('help', 479),
 ('Please', 376),
 ('We', 338),
 ('Hi', 293),
 ('Thanks', 287),
 ('get', 279),
 ('please', 247)]

In [57]:
FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
    """custom function to remove the frequent words"""
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

df["text_cleaned"] = df["text_cleaned"].apply(lambda text: remove_freqwords(text))
df.head()

Unnamed: 0,text,text_lower,text_cleaned
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 understand would like assist would need...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcare propose
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcare sent several private messages one r...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 send Private Message assist Just click ...
4,@sprintcare I did.,@sprintcare i did.,sprintcare


In [58]:
#Suppression des mots rares
n_rare_words = 10
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
def remove_rarewords(text):
    """custom function to remove the rare words"""
    return " ".join([word for word in str(text).split() if word not in RAREWORDS])

df["text_cleaned"] = df["text_cleaned"].apply(lambda text: remove_rarewords(text))
df.head()

Unnamed: 0,text,text_lower,text_cleaned
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 understand would like assist would need...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcare propose
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcare sent several private messages one r...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 send Private Message assist Just click ...
4,@sprintcare I did.,@sprintcare i did.,sprintcare


In [59]:
#Stemming
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

df["text_cleaned"] = df["text_cleaned"].apply(lambda text: stem_words(text))
df.head()

Unnamed: 0,text,text_lower,text_cleaned
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 understand would like assist would need...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcar propos
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcar sent sever privat messag one respond...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 send privat messag assist just click ‘m...
4,@sprintcare I did.,@sprintcare i did.,sprintcar


In [60]:
#Stemming
import spacy

nlp = spacy.load("en_core_web_sm")

def lemmatize_words(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

df["text_cleaned"] = df["text_cleaned"].apply(lambda text: lemmatize_words(text))
df.head()

Unnamed: 0,text,text_lower,text_cleaned
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 understand would like assist would need...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcar propos
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcar send sever privat messag one respond...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 send privat messag assist just click ' ...
4,@sprintcare I did.,@sprintcare i did.,sprintcar


In [61]:
#Suppression des émojis , Suppression des émoticônes
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

df["text_cleaned"] = df["text_cleaned"].apply(lambda text: remove_emoji(text))
df.head()


Unnamed: 0,text,text_lower,text_cleaned
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 understand would like assist would need...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcar propos
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcar send sever privat messag one respond...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 send privat messag assist just click ' ...
4,@sprintcare I did.,@sprintcare i did.,sprintcar


In [62]:
#Suppression des URL
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)
df["text_cleaned"] = df["text_cleaned"].apply(lambda text: remove_urls(text))
df.head()

Unnamed: 0,text,text_lower,text_cleaned
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 understand would like assist would need...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcar propos
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcar send sever privat messag one respond...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 send privat messag assist just click ' ...
4,@sprintcare I did.,@sprintcare i did.,sprintcar


In [63]:
#Suppression des balises HTML
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)
df["text_cleaned"] = df["text_cleaned"].apply(lambda text: remove_html(text))
df.head()

Unnamed: 0,text,text_lower,text_cleaned
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 understand would like assist would need...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcar propos
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcar send sever privat messag one respond...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 send privat messag assist just click ' ...
4,@sprintcare I did.,@sprintcare i did.,sprintcar


In [64]:
df

Unnamed: 0,text,text_lower,text_cleaned
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 understand would like assist would need...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcar propos
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcar send sever privat messag one respond...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 send privat messag assist just click ' ...
4,@sprintcare I did.,@sprintcare i did.,sprintcar
...,...,...,...
4995,"@117290 Hm, they should definitely resume if a...","@117290 hm, they should definitely resume if a...",117290 hm definit resum alreadi enabl when dis...
4996,"@117290 That's where they need to be on, so ch...","@117290 that's where they need to be on, so ch...",117290 that need check home alway happen 22
4997,@115940 Can you guys maybe like....FIX your st...,@115940 can you guys maybe like....fix your st...,115940 can guy mayb stuff for time last one ep...
4998,@117291 Hulu is only available in the U.S. rig...,@117291 hulu is only available in the u.s. rig...,hulu avail we right well sure share interest o...
