
<a href="http://www.inokufu.com"><img src = "http://www.inokufu.com/wp-content/uploads/elementor/thumbs/logo_inokufu_vector_full-black-om2hmu9ob1jytetxemkj1ij8g7tt3hzrtssivh2fl2.png" width = 400> </a>


<h1 align=center><font size = 5>Exploratory Data Analysis : Data Processing</font></h1>

## Introduction

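This notebook gathers the helper functions we use for text data processing: cleaning (URLs, HTML, quotes, whitespace), normalization (lower-casing, accent and punctuation removal), stop-word filtering, stemming and lemmatization.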

Our EDA approach follows the **CRISP-DM data science methodology**. For more information about this approach, see this [Wikipedia page](https://en.wikipedia.org/wiki/Cross-industry_standard_process_for_data_mining).

In [None]:
import numpy as np 
np.set_printoptions(threshold=10000,suppress=True) 
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.image as img
from matplotlib import rcParams

import json
import unicodedata

import seaborn as sns
from cycler import cycler

from bs4 import BeautifulSoup

import spacy
import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math

import fr_core_news_sm
from spacy_langdetect import LanguageDetector

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
from nltk.stem import SnowballStemmer

from gensim.models import Word2Vec
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, roc_auc_score

import gensim
import time 

import multiprocess
import multiprocessing

import import_ipynb

In [None]:
# Load the `fr_core_news_sm` spaCy pipeline once at module level;
# preprocess_lemma() calls `nlp(...)` to obtain token lemmas.
nlp = fr_core_news_sm.load()

In [None]:
def remove_urls(data):
    """Replace every URL-like span in ``data`` with a single space.

    A span is matched when it contains ``://``, optionally preceded by
    ``http`` or ``https``, followed by any run of URL characters that ends
    on a word boundary (so trailing punctuation such as a final ``.`` is
    left in place).

    Compared with the previous pattern, the character class also accepts
    ``-``, ``#``, ``~``, ``:``, ``+`` and ``@``. Without them, URLs such as
    ``https://mon-site.fr/page-1#top`` were only partially removed and the
    remainder (``-site.fr/page-1#top``) leaked into the cleaned text.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        ``data`` with each matched URL replaced by ``' '``.
    """
    # The scheme stays optional to remain backward-compatible with the
    # original pattern, which also stripped scheme-less "://..." spans.
    url_pattern = r'(?:https?)?://[\w./?=&%#~:+@-]*\b'
    return re.sub(url_pattern, ' ', data)

def remove_html(data):
    """Strip HTML markup from ``data`` and return the visible text.

    The parser is pinned to Python's built-in ``"html.parser"``. When no
    parser is given, BeautifulSoup picks whichever one happens to be
    installed (lxml, html5lib, ...), so the same input could be parsed
    differently from one environment to another, and the warning it emits
    is silenced by this notebook's global ``warnings.filterwarnings``.

    Parameters
    ----------
    data : str
        Text that may contain HTML tags or entities.

    Returns
    -------
    str
        Text nodes joined with a single space, with leading and trailing
        whitespace removed.
    """
    soup = BeautifulSoup(data, "html.parser")
    return soup.get_text(separator=" ").strip()

def remove_quote(data):
    """Return ``data`` with every straight apostrophe (') turned into a space."""
    fragments = data.split("'")
    return " ".join(fragments)

def remove_special_quote(data):
    """Return ``data`` with every typographic apostrophe (’) turned into a space."""
    apostrophe_to_space = str.maketrans("’", " ")
    return data.translate(apostrophe_to_space)

def remove_back_quote(data):
    """Return ``data`` with every backtick (`) turned into a space."""
    return re.sub("`", " ", data)

def remove_multiple_space(data):
    """Collapse every run of whitespace in ``data`` to one space.

    Leading and trailing whitespace disappear as well, because the text is
    split on whitespace and the resulting pieces are re-joined.
    """
    chunks = data.split()
    return ' '.join(chunks)

def remove_interrogation_reverse(data):
    """Return ``data`` with every inverted question mark (¿) turned into a space."""
    return "".join(" " if char == "¿" else char for char in data)

def convert_lower_case(data):
    """Lower-case ``data`` with ``np.char.lower``.

    The value returned is whatever ``np.char.lower`` produces (a NumPy
    array object, not a plain ``str``); the pipelines in this notebook
    wrap the result in ``str(...)`` before using it further.
    """
    lowered = np.char.lower(data)
    return lowered

def remove_antislash(data):
    """Replace newline, tab and carriage-return characters with spaces.

    Each control character is replaced one-for-one, so consecutive
    occurrences yield consecutive spaces.
    """
    control_to_space = str.maketrans("\n\t\r", "   ")
    return data.translate(control_to_space)

def remove_accents(data):
    """Strip diacritics from ``data``.

    The text is decomposed with Unicode NFD normalization, then every
    character whose Unicode category is ``Mn`` (non-spacing mark) is
    dropped, leaving the base letters.
    """
    decomposed = unicodedata.normalize('NFD', data)
    base_chars = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return ''.join(base_chars)

def remove_punctuation(data):
    """Keep only the ASCII letters of each token in ``data``.

    ``str(data)`` is tokenized with ``word_tokenize``; every character
    outside ``a-z`` / ``A-Z`` is deleted from each token, and tokens that
    become empty are discarded.

    Returns
    -------
    str
        The surviving tokens, each one prefixed by a single space (so a
        non-empty result starts with a space); ``""`` when nothing is left.
    """
    stripped_tokens = (
        re.sub('[^a-zA-Z]+', '', token) for token in word_tokenize(str(data))
    )
    return "".join(" " + word for word in stripped_tokens if len(word) > 0)

def stemming(data):
    """Stem every token of ``data`` with the French Snowball stemmer.

    ``str(data)`` is tokenized with ``word_tokenize`` and each token is
    passed through ``SnowballStemmer('french').stem``.

    Returns
    -------
    str
        The stems, each one prefixed by a single space (so a non-empty
        result starts with a space); ``""`` when there are no tokens.
    """
    french_stemmer = SnowballStemmer('french')
    stems = [french_stemmer.stem(token) for token in word_tokenize(str(data))]
    return "".join(" " + stem for stem in stems)

def remove_stop_words(data):
    """Drop French stop words and tokens of two characters or fewer.

    ``str(data)`` is tokenized with ``word_tokenize``. A token is kept only
    if it is not a French stop word and its stripped length exceeds 2.

    Stop words are matched both in their original NLTK spelling and in
    their accent-stripped form. Both ``preprocess`` and ``preprocess_lemma``
    call ``remove_accents`` before this function, so with the raw NLTK list
    alone accented stop words ("été", "même", "étaient", ...) reached this
    step as "ete", "meme", "etaient" and were never filtered out.

    Parameters
    ----------
    data : object
        Text to filter; it is converted with ``str`` before tokenization.

    Returns
    -------
    str
        The surviving tokens, each one prefixed by a single space (so a
        non-empty result starts with a space); ``""`` when nothing is left.
    """
    french_stop_words = stopwords.words('french')
    # A set gives O(1) membership tests instead of scanning the list for
    # every token.
    blocked = set(french_stop_words)
    # Add accent-folded variants (NFD decomposition minus non-spacing marks,
    # the same folding that remove_accents applies).
    blocked.update(
        ''.join(
            char
            for char in unicodedata.normalize('NFD', stop_word)
            if unicodedata.category(char) != 'Mn'
        )
        for stop_word in french_stop_words
    )
    kept = [
        token
        for token in word_tokenize(str(data))
        if token not in blocked and len(token.strip()) > 2
    ]
    return "".join(" " + token for token in kept)

def remove_small_words(data):
    """Drop every token whose stripped length is 2 characters or fewer.

    ``str(data)`` is tokenized with ``word_tokenize``.

    Returns
    -------
    str
        The surviving tokens, each one prefixed by a single space (so a
        non-empty result starts with a space); ``""`` when nothing is left.
    """
    long_enough = [
        token for token in word_tokenize(str(data)) if len(token.strip()) > 2
    ]
    return "".join(" " + token for token in long_enough)

In [None]:
def preprocess(string_to_test):
    """Run the full cleaning + stemming pipeline on one piece of text.

    The stages are applied strictly in the order listed below, and the
    output of every stage is converted back to ``str`` before the next one
    runs (``convert_lower_case`` in particular returns a NumPy object).

    Parameters
    ----------
    string_to_test : str
        Raw text to normalize.

    Returns
    -------
    str
        The cleaned, stemmed text, without leading or trailing whitespace.
    """
    stages = (
        # Markup and layout clean-up.
        remove_urls,
        remove_html,
        remove_antislash,
        # Case folding.
        convert_lower_case,
        # Quote-like characters and whitespace normalization.
        remove_quote,
        remove_special_quote,
        remove_back_quote,
        remove_interrogation_reverse,
        remove_multiple_space,
        # Character-level normalization.
        remove_accents,
        remove_punctuation,
        # Token filtering and stemming.
        remove_stop_words,
        str.strip,
        remove_small_words,
        stemming,
        str.strip,
    )

    text = string_to_test
    for stage in stages:
        text = str(stage(text))
    return text

In [None]:
def preprocess_lemma(string_to_test):
    """Run the cleaning pipeline with a lemmatization step before stemming.

    Same stages as ``preprocess``, except that after whitespace
    normalization the text is passed through the module-level ``nlp``
    pipeline and every token is replaced by its ``lemma_``. The output of
    every stage is converted back to ``str`` before the next one runs.

    Parameters
    ----------
    string_to_test : str
        Raw text to normalize.

    Returns
    -------
    str
        The cleaned, lemmatized and stemmed text, without leading or
        trailing whitespace.
    """
    before_lemma = (
        remove_urls,
        remove_html,
        remove_antislash,
        convert_lower_case,
        remove_quote,
        remove_special_quote,
        remove_back_quote,
        remove_interrogation_reverse,
        remove_multiple_space,
    )
    after_lemma = (
        remove_accents,
        remove_punctuation,
        remove_stop_words,
        str.strip,
        remove_small_words,
        stemming,
        str.strip,
    )

    text = string_to_test
    for stage in before_lemma:
        text = str(stage(text))

    # Each lemma is followed by one space; the trailing space is removed by
    # the later stages.
    text = "".join(token.lemma_ + " " for token in nlp(text))

    for stage in after_lemma:
        text = str(stage(text))
    return text


<hr>

Author [Guillaume Lefebvre](https://www.linkedin.com/in/guillaume-lefebvre-22117610b/) - For more information, contact us at contact@inokufu.com - Copyright &copy; 2020 [Inokufu](http://www.inokufu.com)

<a href="http://www.inokufu.com"><img src = "http://www.inokufu.com/wp-content/uploads/elementor/thumbs/logo_inokufu_vector_full-black-om2hmu9ob1jytetxemkj1ij8g7tt3hzrtssivh2fl2.png" width = 400> </a>


