In [1]:
import random
import time
import re
import pickle
import os
import string
from typing import Dict
import html
import json

import numpy as np
import pandas as pd

# Pre processing
## keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
## nltk
import nltk
from nltk.corpus import words, stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize, wordpunct_tokenize, RegexpTokenizer

nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from urllib.parse import urljoin

In [3]:
RAW_PATH = 'data/1_raw/'
TOOLS_PATH = 'data/tools/'

## Chargement des données

In [4]:
DATASET_COLUMNS = ["label", "id", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"

df = pd.read_csv(RAW_PATH + 'sentiment140/training.1600000.processed.noemoticon.csv',
                 encoding =DATASET_ENCODING,
                 names=DATASET_COLUMNS)


In [5]:
contractions = pd.read_csv(TOOLS_PATH + 'contractions.csv',
                           index_col='Contraction')
contractions.index = contractions.index.str.lower()
contractions.Meaning = contractions.Meaning.str.lower()
contractions_dict = contractions.to_dict()['Meaning']
del contractions_dict['s']

In [6]:
# emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
#           ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
#           ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
#           ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
#           '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
#           '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
#           ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

with open(TOOLS_PATH + 'emojis.json', 'r') as file:
    emojis = json.load(file)

In [7]:
df['label'] = df['label'].replace(4,1)

In [8]:
print("Dataset size:", len(df))

Dataset size: 1600000


In [9]:
df.head(5)

Unnamed: 0,label,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [10]:
df.sample(5)

Unnamed: 0,label,id,date,flag,user,text
1042015,1,1957148701,Thu May 28 23:39:18 PDT 2009,NO_QUERY,kassymay,it's friday gonna go force myself to study ho...
1597803,1,2193029327,Tue Jun 16 07:53:56 PDT 2009,NO_QUERY,allora,@jumpsun you'll rarely see me without a smile ...
139891,0,1880908014,Fri May 22 02:55:56 PDT 2009,NO_QUERY,AlexaGrace16,@birdb went to bed early missed u on msn ta...
98997,0,1793477150,Thu May 14 02:50:43 PDT 2009,NO_QUERY,WiseTC,i have to catch a bus today
364463,0,2048169043,Fri Jun 05 14:46:13 PDT 2009,NO_QUERY,UKSolarCar,"So we had a flat tire, but we were able to dri..."


# 1. Traitements simples

## 1.1. Doublons

On a vu qu'il y a 1685 tweets répétés deux fois avec une version avec le label 0, et une version avec le label 4.

In [11]:
df.shape

(1600000, 6)

In [12]:
duplicates = df[df.duplicated(subset=['id'], keep=False)]

# Filtrer le DataFrame pour exclure les doublons
df = df[~df['id'].isin(duplicates['id'])]

In [13]:
df.shape

(1596630, 6)

## 1.2. Selection des variables

In [14]:
tweets_df = df[['label','text']]

## 1.3. Echantillonnage équilibré

In [15]:
def balanced_sample_df(df, labels, size, random_state=None):
    new_df = pd.DataFrame()
    for label in labels:
        new_df = pd.concat([new_df, df[df.label==label].sample(size//len(labels), random_state=random_state)])
    return new_df.reset_index(drop=True)

In [16]:
# SAMPLE_SIZE = 1600
# tweets_df = balanced_sample_df(tweets_df, [0, 1], SAMPLE_SIZE, random_state=0)

# 2. Préparation des données textuelles

## 2.1. Substitutions

On effectue dans un premier temps un certain nombre de subsitutions :

1. **Remplacement des URLs:** les liens débutant par **'http' or 'https' or 'www'** sont remplacés par **'<url\>'**.
2. **Remplacement des utilisateurs:** on remplace les @Usernamespar le mot **'<user\>'**. ['@Kaggle' to '<user\>'].
3. **Remplacement des lettres consécutives:** 3 or more consecutive letters are replaced by 2 letters. ['Heyyyy' to 'Heyy']
4. **Remplacement des Emojis:** on remplace les emojis par leur sens. [':)' to '<smile\>']
5. **Remplacement des Contractions:**: on remplace les contractions par leur forme développée. ["can't" to 'can not']
6. **Remplacement des caractères spéciaux:** on remplace les caractères qui ne sont pas des chiffres, lettres, caractères prédéfinis par un espace.


In [17]:
def substitution(text: str, replacements: Dict[str, str]) -> str:
    text = text.lower()
    for contraction, replacement in replacements.items():
        text = text.replace(contraction, replacement)   
    return text

def replace_html_entities(text: str) -> str:
    return html.unescape(text)

def substitute_url(text: str, replacement: str = 'url') -> str:
    text = re.sub(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)", replacement, text)
    return text
    
def substitute_user(text: str, replacement: str = 'user') -> str:
    text = re.sub('@[^\s]+', replacement, text)
    return text

def filter_non_alphabet(text: str) -> str:
    text = re.sub("[^a-zA-Z0-9]", " ", text)
    return text
    
def replace_three_same_letters(text: str) -> str:
    text = re.sub(r"(.)\1\1+", r"\1\1", text)
    return text

In [20]:
def substitute_text(text):
    text = text.lower()
    text = replace_html_entities(text)
    text = substitution(text, contractions_dict)
    text = substitution(text, emojis)
    text = substitute_url(text)
    text = substitute_user(text)
    text = filter_non_alphabet(text)
    text = replace_three_same_letters(text)
    return text

**Tests**

In [18]:
text_with_entities = "This is an example &amp; text with &lt;HTML&gt; entities."
processed_text = replace_html_entities(text_with_entities)
processed_text

'This is an example & text with <HTML> entities.'

In [19]:
k = random.randint(0, len(tweets_df.text))
print(f"Preprocessing steps for tweet id={k}\n")
tweet = tweets_df.text.iloc[k].lower()

print('0.', tweet, "\n")

tweet = substitution(tweet, contractions_dict)
print('1', tweet, "\n")

tweet = replace_html_entities(tweet)
print('2.', tweet, "\n")

tweet = substitution(tweet, emojis)
print('3.', tweet, "\n")

tweet = substitute_url(tweet)
print('4.', tweet, "\n")

tweet = substitute_user(tweet)
print('5.', tweet, "\n")

tweet = filter_non_alphabet(tweet)
print('6.', tweet, "\n")

tweet = replace_three_same_letters(tweet)
print('7.', tweet, "\n")

Preprocessing steps for tweet id=623132

0. i've been conscripted to a work politics  war i don't want to fight in. why can't i be switzerland  

1 i have been conscripted to a work politics  war i do not want to fight in. why cannot i be switzerland  

2. i have been conscripted to a work politics  war i do not want to fight in. why cannot i be switzerland  

3. i have been conscripted to a work politics  war i do not want to fight in. why cannot i be switzerland  

4. i have been conscripted to a work politics  war i do not want to fight in. why cannot i be switzerland  

5. i have been conscripted to a work politics  war i do not want to fight in. why cannot i be switzerland  

6. i have been conscripted to a work politics  war i do not want to fight in  why cannot i be switzerland  

7. i have been conscripted to a work politics  war i do not want to fight in  why cannot i be switzerland  



In [21]:
substitute_text(tweets_df.text[k])

'man my signal sux downstairs  on my way back upstairs  brb '

**Application sur les données**

In [22]:
%%time
text_substituted = [substitute_text(tweet) for tweet in tweets_df.text]

CPU times: total: 1min 2s
Wall time: 1min 2s


In [23]:
text_substituted[:10]

['user url  aww  that is a bummer  you shoulda got david carr of third day to do it  wink',
 'is upset that he cannot update his facebook by texting it  and might cry as a result  school today also  blah ',
 'user i dived many times for the ball  managed to save 50  the rest go out of bounds',
 'my whole body feels itchy and like its on fire ',
 'user no  it is not behaving at all  i am mad  why am i here  because i cannot see you all over there  ',
 'user not the whole crew ',
 'need a hug ',
 'user hey  long time no see  yes  rains a bit  only a bit  lol  i am fine thanks  how is you  ',
 'user nope they did not have it ',
 'user que me muera  ']

## 2.2. Tokenisation

In [26]:
def tokenize_text(text, filtered_words, stem_or_lem=''):
    raw_tokens_list = word_tokenize(text)
    tokens_list = []
    if stem_or_lem in ["stem", 'lem']:
        for token in raw_tokens_list:
            if token in filtered_words:
                continue
            if stem_or_lem == "stem":
                stemmer = PorterStemmer()
                tokens_list.append(stemmer.stem(token))
            else:
                lemmatizer = WordNetLemmatizer()
                tokens_list.append(lemmatizer.lemmatize(token))
    else:
        tokens_list = raw_tokens_list
    return tokens_list

In [27]:
filtered_words = stopwords.words('english')

In [28]:
%%time
tweets_df['tokens'] = [tokenize_text(tweet, filtered_words=filtered_words) for tweet in text_substituted]

CPU times: total: 2min 47s
Wall time: 2min 47s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [29]:
%%time
tweets_df['tokens_lem'] = [tokenize_text(tweet, filtered_words=filtered_words) for tweet in text_substituted]

CPU times: total: 2min 28s
Wall time: 2min 28s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [30]:
%%time
tweets_df['tokens_stem'] = [tokenize_text(tweet, filtered_words=filtered_words, stem_or_lem='stem') for tweet in text_substituted]

CPU times: total: 8min 47s
Wall time: 8min 49s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# 3. Exports

Pour des gains de place, on exporte les données sous forme de texte plutot que de tokens

In [33]:
tweets_df['clean'] = tweets_df['tokens'].apply(lambda x: " ".join(x))
tweets_df['clean_lem'] = tweets_df['tokens_lem'].apply(lambda x: " ".join(x))
tweets_df['clean_stem'] = tweets_df['tokens_stem'].apply(lambda x: " ".join(x))

In [34]:
PROJECT_PATH = os.getcwd()[2:]
PROJECT_PATH = PROJECT_PATH.replace("\\", "/") + '/'

In [35]:
DATA_PATH = 'data/2_preprocessed/'
full_path = urljoin(PROJECT_PATH, DATA_PATH)
full_path

'/Users/USER/PycharmProjects/2_OC_IA/sentiment_analysis/src/data/2_preprocessed/'

In [36]:
if not os.path.exists(full_path):
    os.makedirs(full_path)

In [None]:
# df_file = f'tweets_tokens_df_{str(tweets_df.shape[0])}.csv'
# df_file

# tweets_df[['label', 'text', 'tokens', 'tokens_lem', 'tokens_stem']].to_csv(urljoin(full_path, df_file))

In [37]:
df_file = f'tweets_join_df_{str(tweets_df.shape[0])}.csv'
df_file

tweets_df[['label', 'text', 'clean', 'clean_lem', 'clean_stem']].to_csv(urljoin(full_path, df_file))