
<a href="http://www.inokufu.com"><img src = "http://www.inokufu.com/wp-content/uploads/elementor/thumbs/logo_inokufu_vector_full-black-om2hmu9ob1jytetxemkj1ij8g7tt3hzrtssivh2fl2.png" width = 400> </a>


<h1 align=center><font size = 5>Exploratory Data Analysis : Titre</font></h1>

## Introduction

In this notebook, we conduct an Exploratory Data Analysis (EDA) focused on preprocessing the textual data collected from [Udemy](https://udemy.com)'s API and from the LO files. The goal is to better understand how to process this data correctly, so that cleaned and tokenized text can be given to Word2Vec models. 

Our EDA approach follows the **Data Science Methodology CRISP-DM**. For more info about this approach, check this [Wikipedia page](https://en.wikipedia.org/wiki/Cross-industry_standard_process_for_data_mining)

## Table of Contents

<div class="alert alert-block alert-info" style="margin-top: 20px">

<font size = 3>

1. <a href="#item1">Data Collection</a>

2. <a href="#item2">Creation of models</a>

3. <a href="#item3">Conclusion</a>    

</font>
</div>
<a id='the_destination'></a>

## 1. Data Collection <a id='item1'></a>

In [None]:
import numpy as np 
np.set_printoptions(threshold=10000,suppress=True) 
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.image as img
from matplotlib import rcParams

import json
import unicodedata

import seaborn as sns
from cycler import cycler

from bs4 import BeautifulSoup

import spacy
import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math

import fr_core_news_sm
from spacy_langdetect import LanguageDetector

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
from nltk.stem import SnowballStemmer

from gensim.models import Word2Vec
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, roc_auc_score

import gensim
import time 

import multiprocess
import multiprocessing

import import_ipynb
from GL_20200327_Fonctions_Preprocessing import remove_urls, remove_html, remove_antislash, convert_lower_case
from GL_20200327_Fonctions_Preprocessing import remove_quote, remove_back_quote, remove_interrogation_reverse
from GL_20200327_Fonctions_Preprocessing import remove_accents, remove_punctuation, remove_stop_words 
from GL_20200327_Fonctions_Preprocessing import remove_small_words, stemming, preprocess, preprocess_lemma 

print('Libraries imported.')

### Udemy data from Json files (coming from Udemy's API)

In [None]:
# Opening the JSON files containing Udemy's data.
# The export is split into files suffixed 10, 20, ..., 260, plus a last file
# suffixed 264; every file holds a JSON list that is appended to `udemy_json`.

udemy_json = []
for page in list(range(10, 261, 10)) + [264]:

    # The encoding is given explicitly: without it, open() falls back to the
    # platform locale, which can corrupt accented French text (e.g. on Windows).
    with open('./data/20200317_Udemy_FR/20200317_Udemy_FR_' + str(page) + '.json',
              encoding='utf-8') as f:
        # extend() grows the list in place instead of rebuilding it for every file
        udemy_json.extend(json.load(f))

In [None]:
# Extract, for every course returned by the API, only the fields we keep.
# `rating_dist` stays empty: 'rating_distribution' is not collected.
rating_dist = []

desc = [course['description'] for course in udemy_json]
avg_rating = [course['avg_rating'] for course in udemy_json]
title = [course['title'] for course in udemy_json]
num_sub = [course['num_subscribers'] for course in udemy_json]

# A falsy 'price_detail' is recorded as a price of 0.0
price_detail = [
    course['price_detail']['amount'] if course['price_detail'] else 0.0
    for course in udemy_json
]

# One list per row, then transposed so that every list becomes a column
df_json = pd.DataFrame([desc, price_detail, title, num_sub, avg_rating]).transpose()

In [None]:
# Name the columns, in the same order as the lists given to the DataFrame
# constructor, then preview the first rows.
# Possible further improvement: collect some other variables.
df_json.columns = ["description", "price_detail", "title", "num_sub", "avg_rating"]
df_json.head(3)

In [None]:
# Courses whose average rating is >= this threshold are labelled 1, the others 0
RATING_THRESHOLD = 4.252

# Removing null average ratings, because a 0 usually means "not filled"
# rather than a deliberately bad mark, then resetting the index
df_json = df_json.loc[df_json['avg_rating'] != 0].reset_index(drop=True)

# Preprocessing the descriptions column-wise (instead of row by row with
# iterrows/.at): once with the stemmer, once with the lemmatizer
stemmed_descriptions = df_json['description'].apply(lambda text: str(preprocess(text)))
lemmatized_descriptions = df_json['description'].apply(lambda text: str(preprocess_lemma(text)))

# Number of occurrences of each token of the lemmatized description.
# Columns are added in the order counter, stem_description, lemma_description,
# rating_01 so that the layout of the exported dataframe stays the same.
df_json['counter'] = lemmatized_descriptions.apply(lambda text: Counter(word_tokenize(text)))
df_json['stem_description'] = stemmed_descriptions
df_json['lemma_description'] = lemmatized_descriptions

# The raw title is replaced by its processed (stemmed) version
df_json['title'] = df_json['title'].apply(lambda text: str(preprocess(text)))

# Binary target for classification; built directly as integers (0/1)
df_json['rating_01'] = (df_json['avg_rating'] >= RATING_THRESHOLD).astype(int)

In [None]:
# Number of rows in each class of the binary target
print(df_json['rating_01'].value_counts())

In [None]:
# Preview of the first rows of the processed dataframe
df_json.head(3)

In [None]:
# Splitting data into the lemmatized descriptions, the titles and
# the binary rating target (cast to int)
corpus = df_json['lemma_description']
title = df_json['title']
Y = df_json['rating_01'].astype(int)

In [None]:
# Writing the processed Udemy dataframe to a file, and timing the export

# The rating label is stored as an integer
df_json['rating_01'] = df_json['rating_01'].astype(int)

start = time.time()
df_json.to_csv('./data/20200408_Processed_Data/20200408_Processed_Udemy_Json.csv', sep=';')
elapsed = round(time.time() - start, 2)

print("Done in {} seconds".format(elapsed))

### Formations data from LO csv files (coming from extract of DB)

In [None]:
# Opening all the formation files (LO1.csv ... LO14.csv) and stacking them
path = './data/20200308 LO/LO'
file_extension = '.csv'

keys = []
frames = [pd.read_csv(path + str(i) + file_extension) for i in range(1, 15)]

df_formation = pd.concat(frames)
df_formation.shape

#### Processing Descriptions of Formations

In [None]:
# One-column dataframe of the distinct descriptions

desc = df_formation[['description']].drop_duplicates()

In [None]:
# Discarding missing descriptions, then renumbering the rows from 0

desc = desc[desc['description'].notnull()].reset_index(drop=True)

In [None]:
# Stemming and lemmatizing every description, with a progress indicator
total_rows = len(desc)

for position, (row_label, record) in enumerate(desc.iterrows()):

    # Refresh the progress display every 50 rows
    if not position % 50:
        progress = round((position / total_rows) * 100, 2)
        print("Loading...", progress, "%", end="\r")

    raw_text = record['description']
    stemmed_text = preprocess(raw_text)
    lemmatized_text = preprocess_lemma(raw_text)

    # *_bis holds the stemmed text, *_ter the lemmatized text
    desc.at[row_label, 'description_bis'] = str(stemmed_text)
    desc.at[row_label, 'description_ter'] = str(lemmatized_text)

print("Loading... 100.00 %", end="\r")

#### Processing Objectives of Formations

In [None]:
# One-column dataframe of the distinct objectives

obj = df_formation[['objectifs']].drop_duplicates()

In [None]:
# Discarding missing objectives, then renumbering the rows from 0

obj = obj[obj['objectifs'].notnull()].reset_index(drop=True)

In [None]:
# Processing all the objectives (stemmed and lemmatized versions)

# The progress is computed against the number of objectives (`obj`), not the
# number of descriptions, so that the displayed percentage is correct.
total = len(obj)

for count, (index, row) in enumerate(obj.iterrows()):

    if count % 50 == 0:
        # Calculating the percentage of processed data
        calcul_percent = round((count / total) * 100, 2)
        print("Loading...", calcul_percent, "%", end="\r")

    # Processing objectives of formations:
    # objectifs_bis holds the stemmed text, objectifs_ter the lemmatized text
    preprocessed_data = preprocess(row['objectifs'])
    preprocessed_lemma_data = preprocess_lemma(row['objectifs'])
    obj.at[index, 'objectifs_bis'] = str(preprocessed_data)
    obj.at[index, 'objectifs_ter'] = str(preprocessed_lemma_data)

print("Loading... 100.00 %", end="\r")

#### Keeping only the processed data and removing duplicated and null values

In [None]:
# Only the lemmatized columns (*_ter) are kept; the stemmed columns
# ('description_bis' / 'objectifs_bis') are the alternative, not used here.
# For each series: drop nulls, drop duplicates, renumber from 0.

desc_formation = (
    desc['description_ter']
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

obj_formation = (
    obj['objectifs_ter']
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

#### Writing processed data to files containing Text 

In [None]:
# Exporting the processed descriptions (still plain text) and timing the write

start = time.time()
desc_formation.to_csv('./data/20200408_Processed_Data/20200408_Processed_Descriptions_Text.csv', sep=';')
elapsed = round(time.time() - start, 2)

print("Done in {} seconds".format(elapsed))

In [None]:
# Exporting the processed objectives (still plain text) and timing the write

start = time.time()
obj_formation.to_csv('./data/20200408_Processed_Data/20200408_Processed_Objectives_Text.csv', sep=';')
elapsed = round(time.time() - start, 2)

print("Done in {} seconds".format(elapsed))

In [None]:
# Exporting the processed Udemy descriptions (still plain text) and timing the write

start = time.time()
corpus.to_csv('./data/20200408_Processed_Data/20200408_Processed_Udemy_Descriptions_Text.csv', sep=';')
elapsed = round(time.time() - start, 2)

print("Done in {} seconds".format(elapsed))

In [None]:
# Exporting the processed Udemy titles (still plain text) and timing the write

start = time.time()
title.to_csv('./data/20200408_Processed_Data/20200408_Processed_Udemy_Title_Text.csv', sep=';')
elapsed = round(time.time() - start, 2)

print("Done in {} seconds".format(elapsed))

#### Tokenizing data 

In [None]:
# Tokenizing the Udemy descriptions and titles with gensim's simple_preprocess.
# NOTE: simple_preprocess lowercases, tokenizes and discards tokens outside its
# default length bounds; it does not remove stop words by itself.

corpus = corpus.apply(gensim.utils.simple_preprocess)
title = title.apply(gensim.utils.simple_preprocess)

In [None]:
# Tokenizing the formation descriptions and objectives with gensim's simple_preprocess.
# NOTE: simple_preprocess lowercases, tokenizes and discards tokens outside its
# default length bounds; it does not remove stop words by itself.

desc_formation = desc_formation.apply(gensim.utils.simple_preprocess)
obj_formation = obj_formation.apply(gensim.utils.simple_preprocess)

#### Concatenating all processed and tokenized data

In [None]:
# Stacking the three tokenized series into one corpus, numbered from 0

final_corpus = pd.concat([desc_formation, obj_formation, corpus], ignore_index=True)

In [None]:
# Preview of the first token lists of the final corpus
final_corpus.head(3)

#### Writing processed data to files containing Tokens

In [None]:
# Exporting the tokenized Udemy descriptions and timing the write

start = time.time()
corpus.to_csv('./data/20200408_Processed_Data/20200408_Processed_Udemy_Descriptions_Tokens.csv', sep=';')
elapsed = round(time.time() - start, 2)

print("Done in {} seconds".format(elapsed))

In [None]:
# Exporting the tokenized Udemy titles and timing the write

start = time.time()
title.to_csv('./data/20200408_Processed_Data/20200408_Processed_Udemy_Titles_Tokens.csv', sep=';')
elapsed = round(time.time() - start, 2)

print("Done in {} seconds".format(elapsed))

In [None]:
# Exporting the tokenized formation descriptions and timing the write

start = time.time()
desc_formation.to_csv('./data/20200408_Processed_Data/20200408_Processed_Descriptions_Tokens.csv', sep=';')
elapsed = round(time.time() - start, 2)

print("Done in {} seconds".format(elapsed))

In [None]:
# Exporting the tokenized formation objectives and timing the write

start = time.time()
obj_formation.to_csv('./data/20200408_Processed_Data/20200408_Processed_Objectives_Tokens.csv', sep=';')
elapsed = round(time.time() - start, 2)

print("Done in {} seconds".format(elapsed))

### New LO from CPF

In [None]:
# Opening the single CSV file holding the new LO (MonCompteFormation)
path = './data/'
file_name = '20200407_LO_MonCompteFormation'
file_extension = '.csv'

df = pd.read_csv(''.join([path, file_name, file_extension]))

df.shape

In [None]:
# One-column dataframe of the distinct descriptions

desc = df[['description']].drop_duplicates()

In [None]:
# Size of the description frame after removing duplicates
desc.shape

In [None]:
# Discarding missing descriptions, then renumbering the rows from 0

desc = desc[desc['description'].notnull()].reset_index(drop=True)

In [None]:
# Size of the description frame after removing null values
desc.shape

In [None]:
# Stemming and lemmatizing every description, with a progress indicator
total_rows = len(desc)

for position, (row_label, record) in enumerate(desc.iterrows()):

    # Refresh the progress display every 50 rows
    if not position % 50:
        progress = round((position / total_rows) * 100, 2)
        print("Loading...", progress, "%", end="\r")

    raw_text = record['description']
    stemmed_text = preprocess(raw_text)
    lemmatized_text = preprocess_lemma(raw_text)

    # *_bis holds the stemmed text, *_ter the lemmatized text
    desc.at[row_label, 'description_bis'] = str(stemmed_text)
    desc.at[row_label, 'description_ter'] = str(lemmatized_text)

print("Loading... 100.00 %", end="\r")

In [None]:
# One-column dataframe of the distinct objectives

obj = df[['objectifs']].drop_duplicates()

In [None]:
# Discarding missing objectives, then renumbering the rows from 0

obj = obj[obj['objectifs'].notnull()].reset_index(drop=True)

In [None]:
# Size of the objectives frame after removing duplicates and null values
obj.shape

In [None]:
# Processing all the objectives (stemmed and lemmatized versions)

# The progress is computed against the number of objectives (`obj`), not the
# number of descriptions, so that the displayed percentage is correct.
total = len(obj)

for count, (index, row) in enumerate(obj.iterrows()):

    if count % 50 == 0:
        # Calculating the percentage of processed data
        calcul_percent = round((count / total) * 100, 2)
        print("Loading...", calcul_percent, "%", end="\r")

    # Processing objectives of formations:
    # objectifs_bis holds the stemmed text, objectifs_ter the lemmatized text
    preprocessed_data = preprocess(row['objectifs'])
    preprocessed_lemma_data = preprocess_lemma(row['objectifs'])
    obj.at[index, 'objectifs_bis'] = str(preprocessed_data)
    obj.at[index, 'objectifs_ter'] = str(preprocessed_lemma_data)

print("Loading... 100.00 %", end="\r")

In [None]:
# Only the lemmatized columns (*_ter) are kept; the stemmed columns
# ('description_bis' / 'objectifs_bis') are the alternative, not used here.
# For each series: drop nulls, drop duplicates, renumber from 0.

desc_formation = (
    desc['description_ter']
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

obj_formation = (
    obj['objectifs_ter']
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Exporting the processed descriptions of the new LO (still plain text) and timing the write

start = time.time()
desc_formation.to_csv('./data/20200408_Processed_Data/20200410_Processed_Descriptions_Text_NewLO.csv', sep=';')
elapsed = round(time.time() - start, 2)

print("Done in {} seconds".format(elapsed))

In [None]:
# Exporting the processed objectives of the new LO (still plain text) and timing the write

start = time.time()
obj_formation.to_csv('./data/20200408_Processed_Data/20200410_Processed_Objectives_Text_NewLO.csv', sep=';')
elapsed = round(time.time() - start, 2)

print("Done in {} seconds".format(elapsed))

In [None]:
# Tokenizing the new LO descriptions and objectives with gensim's simple_preprocess.
# NOTE: simple_preprocess lowercases, tokenizes and discards tokens outside its
# default length bounds; it does not remove stop words by itself.

desc_formation = desc_formation.apply(gensim.utils.simple_preprocess)
obj_formation = obj_formation.apply(gensim.utils.simple_preprocess)

In [None]:
# Exporting the tokenized descriptions of the new LO and timing the write

start = time.time()
desc_formation.to_csv('./data/20200408_Processed_Data/20200410_Processed_Descriptions_Tokens_NewLO.csv', sep=';')
elapsed = round(time.time() - start, 2)

print("Done in {} seconds".format(elapsed))

In [None]:
# Exporting the tokenized objectives of the new LO and timing the write

start = time.time()
obj_formation.to_csv('./data/20200408_Processed_Data/20200410_Processed_Objectives_Tokens_NewLO.csv', sep=';')
elapsed = round(time.time() - start, 2)

print("Done in {} seconds".format(elapsed))


<hr>

Author [Guillaume Lefebvre](https://www.linkedin.com/in/guillaume-lefebvre-22117610b/) - For more information, contact us at contact@inokufu.com - Copyright &copy; 2020 [Inokufu](http://www.inokufu.com)

<a href="http://www.inokufu.com"><img src = "http://www.inokufu.com/wp-content/uploads/elementor/thumbs/logo_inokufu_vector_full-black-om2hmu9ob1jytetxemkj1ij8g7tt3hzrtssivh2fl2.png" width = 400> </a>


