In [1]:
import pandas as pd
import re
import string
from unicodedata import normalize
from pickle import dump
import os

## Loading, cleaning and saving data

In [2]:
# Project root. Set the SARCASM_BASE_DIR environment variable to run the
# notebook on another machine; without it the original author's local path
# is used. The relative "data/..." and "output/..." paths used further down
# are resolved against this directory because of the chdir below.
BASE_DIR = os.environ.get(
    "SARCASM_BASE_DIR",
    "/Users/sghenimi/SANDBOX/ScratchML/2019-04-16-nlp_sarcasm",
)
os.chdir(BASE_DIR)

In [3]:
# Load the raw documents into memory
def to_load_doc_json(filename):
    """Read a JSON-lines file and return it without the ``article_link`` column.

    Parameters
    ----------
    filename
        Path of a file holding one JSON object per line.

    Returns
    -------
    pandas.DataFrame
        Every column read from the file except ``article_link``.

    Raises
    ------
    KeyError
        If the file has no ``article_link`` column.
    """
    frame = pd.read_json(filename, lines=True)
    return frame.drop(columns=["article_link"])

In [4]:
# Clean every text of a (text, label) DataFrame
def to_clean(data):
    """Return a copy of ``data`` whose first column holds normalised text.

    The first column of ``data`` is treated as the text and the second one as
    the label. Each text goes through the following steps, in order:

    1. Unicode NFD normalisation, then every non-ASCII character is dropped
       (so an accented letter keeps its base letter);
    2. split on whitespace into tokens;
    3. lower-casing;
    4. removal of every ``string.punctuation`` character;
    5. removal of every character outside ``string.printable``;
    6. removal of tokens that are not purely alphabetic (e.g. containing
       digits, or left empty by the previous steps);
    7. the surviving tokens are joined back with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least two columns; column 0 must contain ``str`` values.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``data`` and a fresh 0..n-1 index.
        Column 0 holds the cleaned text, column 1 the label copied unchanged;
        any further column is left empty (NaN), as only the first two columns
        are carried over.

    Raises
    ------
    TypeError
        If a value of the text column is not a string (e.g. NaN).
    IndexError
        If ``data`` has fewer than two columns.
    """
    # Regex matching every character that is NOT in string.printable
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # Translation table deleting all punctuation characters
    table = str.maketrans('', '', string.punctuation)

    text_col, label_col = data.columns[0], data.columns[1]

    # Rows are collected in a plain list and the frame is built once at the
    # end: DataFrame.append was removed in pandas 2.0 and growing a frame row
    # by row copies it on every iteration.
    records = []
    # Columns are read by position (iloc), not by label.
    for text, label in zip(data.iloc[:, 0], data.iloc[:, 1]):
        # Normalise to ASCII: decompose accents, then drop non-ASCII bytes
        text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        # Tokenise on whitespace and lower-case each token
        tokens = [word.lower() for word in text.split()]
        # Strip punctuation from each token
        tokens = [word.translate(table) for word in tokens]
        # Strip non-printable characters
        tokens = [re_print.sub('', word) for word in tokens]
        # Keep only purely alphabetic tokens (drops numbers and empty strings)
        tokens = [word for word in tokens if word.isalpha()]
        # Store the tokens back as one space-separated string
        records.append({text_col: ' '.join(tokens), label_col: label})

    return pd.DataFrame(records, columns=data.columns)

In [5]:
# Save the cleaned sentences to a file
def to_save_cleaned_data(data, filename):
    """Pickle ``data`` to ``filename`` and print a confirmation line.

    Parameters
    ----------
    data
        Any picklable object (here, the cleaned DataFrame).
    filename
        Destination path; an existing file is overwritten.

    Raises
    ------
    OSError
        If the file cannot be opened for writing.
    """
    # The context manager closes (and flushes) the file even if dump() fails;
    # the bare open() used previously left the handle to the garbage collector.
    with open(filename, 'wb') as handle:
        dump(data, handle)
    print('Saved: %s' % filename)

In [6]:
# Read the raw JSON-lines dataset (path is relative to the current working directory)
data = to_load_doc_json("data/data-raw.json")

In [7]:
# Preview the first rows of the loaded frame
data.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [8]:
# Re-bind `data` to the cleaned frame; the raw frame is no longer referenced after this cell
data = to_clean(data)

In [9]:
# Persist the cleaned frame as a pickle file
to_save_cleaned_data(data, "data/data-cleaned.pkl")

Saved: data/data-cleaned.pkl


## Train/test split

In [14]:
from pickle import load
from pickle import dump
from data.const import N_ROWS, TEST_SIZE
from sklearn.model_selection import train_test_split

In [15]:
# Load the cleaned data
def load_clean_data(filename):
    """Unpickle and return the object stored in ``filename``.

    Parameters
    ----------
    filename
        Path of a pickle file.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Raises
    ------
    OSError
        If the file cannot be opened for reading.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # The context manager closes the file deterministically; the bare open()
    # used previously left the handle to the garbage collector.
    with open(filename, 'rb') as handle:
        return load(handle)

# Save a list of cleaned sentences to a file
def save_clean_data(data, filename):
    """Pickle ``data`` to ``filename`` and print a confirmation line.

    Parameters
    ----------
    data
        Any picklable object.
    filename
        Destination path; an existing file is overwritten.

    Raises
    ------
    OSError
        If the file cannot be opened for writing.
    """
    # The context manager closes (and flushes) the file even if dump() fails;
    # the bare open() used previously left the handle to the garbage collector.
    with open(filename, 'wb') as handle:
        dump(data, handle)
    print('Saved: %s' % filename)

In [20]:
# Load the cleaned data and keep only the first N_ROWS rows
data = load_clean_data("data/data-cleaned.pkl")[:N_ROWS]

In [21]:
# Train / test split: absolute number of test rows, i.e. the TEST_SIZE
# multiple of the row count truncated to an integer
test_size = int(TEST_SIZE * len(data))
test_size

5341

In [22]:
# Split into train and test sets. `test_size` is an absolute row count here,
# the seed is fixed (random_state=0) so the split is reproducible, and the
# split is stratified on the last column of `data`.
train, test = train_test_split(data, 
                               test_size=test_size, 
                               random_state=0, 
                               stratify=data.iloc[:,-1])

In [23]:
# Sanity check: (rows, columns) of each subset
train.shape, test.shape

((21368, 2), (5341, 2))

In [24]:
# Save three files: the full (truncated) data set, the train subset and the
# test subset. N_ROWS is embedded in each file name.
save_clean_data(data,  'output/data-cleaned-r{}-both.pkl'.format(N_ROWS))
save_clean_data(train, 'output/data-cleaned-r{}-train.pkl'.format(N_ROWS))
save_clean_data(test,  'output/data-cleaned-r{}-test.pkl'.format(N_ROWS))

Saved: output/data-cleaned-r26709-both.pkl
Saved: output/data-cleaned-r26709-train.pkl
Saved: output/data-cleaned-r26709-test.pkl
