# Importation des librairies

In [1]:
import pandas as pd
import multiprocessing
import nltk
import re

In [2]:
from multiprocessing import Pool
from nltk.corpus import stopwords # Les stopwords sont des mots qui n'apportent pas de sens à la phrase
from nltk.stem import PorterStemmer # Le stemmer permet de réduire les mots à leur racine
from sklearn.feature_extraction.text import CountVectorizer # Permet de transformer le texte en vecteur (bag of words)
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fetch the NLTK resources into the local NLTK data directory.
# 'stopwords' backs the `nltk.corpus.stopwords` import used for preprocessing.
# NOTE(review): 'punkt' is not referenced by the code visible in this section — confirm it is still needed.
nltk.download('stopwords')
nltk.download('punkt')

# Lecture des données

In [4]:
# Load the training set from CSV (path is relative to the notebook's working directory).
df_train = pd.read_csv('../data/train.csv')

In [6]:
# Print the URL of every row whose label is 'bad'.
is_bad = df_train['label'] == 'bad'
print(df_train.loc[is_bad, 'url'])

3                                      http://www.ff-b2b.de/
64         http://dracula-land.sexomultiple.com/cg-amateu...
114              http://www.angelfire.com/darkside/ms_sassy/
144                                  http://www.sexystat.com
167                               http://www.bizarre-sex.ws/
                                 ...                        
1199725                          http://www.coppiaperta.com/
1199728    http://www.hardcore-xxx-sex-sites.com/amanda-f...
1199827                     http://twowomen.com/a-team_babe/
1199859    http://www.lesbians-dykes.net/cute-latina-lesb...
1199910                       http://sichicago.blogspot.com/
Name: url, Length: 27253, dtype: object


In [7]:
# Write the URLs of the rows labelled 'bad' to their own CSV file (index omitted).
bad_urls = df_train.loc[df_train['label'] == 'bad', 'url']
bad_urls.to_csv('../data/bad_url.csv', index=False)

In [41]:
# Keep only the `content` and `label` columns.
# An explicit copy makes df_train an independent frame rather than a slice of
# the loaded data, so later column assignments do not raise pandas'
# SettingWithCopyWarning.
df_train = df_train[['content', 'label']].copy()

In [42]:
# Display the first 5 rows of the training dataframe.
df_train.head()

Unnamed: 0,content,label
0,Named themselves charged particles in a manly ...,good
1,And filipino field \n \n \n \n \n \n \n \n the...,good
2,"Took in cognitivism, whose adherents argue for...",good
3,fire cumshot sodomize footaction tortur failed...,bad
4,"Levant, also monsignor georges. In 1800, lists...",good


In [43]:
# Load the test set from CSV (path is relative to the notebook's working directory).
df_test = pd.read_csv('../data/test.csv')

In [44]:
# Keep only the `content` and `label` columns.
# An explicit copy makes df_test an independent frame rather than a slice of
# the loaded data, so later column assignments do not raise pandas'
# SettingWithCopyWarning.
df_test = df_test[['content', 'label']].copy()

In [45]:
# Display the first 5 rows of the test dataframe.
df_test.head()

Unnamed: 0,content,label
0,"Decay suggest in 1315.. Current constitution, ...",good
1,breast addict nudger whash ky darkie catholics...,good
2,Nato's military stoic philosophy says to accep...,good
3,Night being newton. according to the formation...,good
4,34 per two children. if we exercise simple pra...,good


# Préparation des données

In [46]:
def preprocessing_content(df):
    """Normalise the ``content`` column of ``df`` for bag-of-words modelling.

    Each document goes through, in order: lowercasing, punctuation removal,
    digit removal, dropping of English stop words and of tokens shorter than
    three characters, then Porter stemming. The surviving tokens are joined
    with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a text column named ``content``. Missing values are
        treated as empty documents.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df`` whose
        ``content`` column holds the cleaned text. ``df`` is not modified.
    """
    punctuation = re.compile(r'[^\w\s]')
    digits = re.compile(r'\d+')

    # The NLTK list is lowercase and contains apostrophes ("don't"). Strip
    # punctuation from it exactly as from the text so both sides compare equal.
    stop_words = {punctuation.sub('', word) for word in stopwords.words('english')}
    stemmer = PorterStemmer()

    def clean(text):
        # Lowercase and strip punctuation/digits BEFORE filtering, otherwise
        # "The" or "the," would never match the stop-word set.
        text = punctuation.sub('', text.lower())
        text = digits.sub('', text)
        return ' '.join(
            stemmer.stem(token)
            for token in text.split()
            if len(token) > 2 and token not in stop_words
        )

    # A single pass over the column; NaN/None become empty documents instead
    # of crashing on `.split()`.
    cleaned = df['content'].fillna('').astype(str).map(clean)

    # `assign` returns a new frame, leaving the caller's frame untouched.
    return df.assign(content=cleaned)

In [None]:
# Run the preprocessing function on the training set and rebind df_train to its result.
df_train = preprocessing_content(df_train)

In [47]:
# Run the preprocessing function on the test set and rebind df_test to its result.
df_test = preprocessing_content(df_test)

In [None]:
# Save the preprocessed training dataframe to CSV, without the index column.
df_train.to_csv('../data/train_preprocessed.csv', index=False)

In [None]:
# Save the preprocessed test dataframe to CSV, without the index column.
df_test.to_csv('../data/test_preprocessed.csv', index=False)