In [2]:
import pandas as pd
import sklearn.model_selection as sms
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from spacy.lang.fr import French
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import pickle

In [3]:
# Fetch the NLTK resources used further down: WordNet (needed by WordNetLemmatizer)
# and the stop-word lists (needed by the stop-word filter in spacy_tokenizer).
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/danbonan/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danbonan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Construction d'un modèle n-gram

In [4]:
# Word order will not be taken into account.
# Load the data.
path_data = "db-text-for-bloc-labelization-nlp.csv"
df = pd.read_csv(path_data)
# The CSV holds one text block per row. As the preview below shows, the columns are
# `label`, `text` and `file`, plus an unnamed index column.

In [5]:
# Preview the first rows of the freshly loaded data.
df.head()

Unnamed: 0,label,text,file
0,info_perso,Maeva DENISE SAUTDELIGNE 2 Rue Robert Schuman ...,CV ASSISTANTE DE DIRECTION _ DENISE Maëva.jpg
1,skills,LOGICIELS MAITRISES: WORD -- EXCEL -- SAGE -- ...,CV ASSISTANTE DE DIRECTION _ DENISE Maëva.jpg
2,skills,COMPETENCES SAUTDELIGNE SAUTDELIGNE VV VV VV ...,CV ASSISTANTE DE DIRECTION _ DENISE Maëva.jpg
3,experience_pro,EXPERIENCES PROFESSIONNELLE SAUTDELIGNE SAUTD...,CV ASSISTANTE DE DIRECTION _ DENISE Maëva.jpg
4,formation,PROT AR OL AVEO) BS SAUTDELIGNE SAUTDELIGNE >...,CV ASSISTANTE DE DIRECTION _ DENISE Maëva.jpg


In [6]:
# French spaCy language object (spacy.lang.fr.French); spacy_tokenizer calls it to
# split each text block into tokens.
nlp = French()

## Some data cleaning

In [7]:
def spacy_tokenizer(sentence, stopwords):
    """Tokenize and clean one text block.

    Steps:
      1. Split `sentence` into tokens with the module-level spaCy pipeline `nlp`.
      2. Drop punctuation tokens, number-like tokens and the literal token 'n'.
      3. Lemmatize each remaining token with NLTK's WordNetLemmatizer, then lowercase it.
      4. Drop stop words and the 'sautdeligne' marker found in the raw text
         (presumably a line-break placeholder — TODO confirm).

    # Inputs
      sentence: str, the raw text block.
      stopwords: either an object exposing `.words()` (e.g. `nltk.corpus.stopwords`)
        or an already-built collection of stop words (list/set of str).
    # Outputs
      list of str, the cleaned tokens in their original order.
    """
    # Build the stop-word lookup ONCE per call. The previous version called
    # `stopwords.words()` inside the comprehension, i.e. it re-read the whole
    # stop-word corpus for every single token and then searched it as a list.
    words = stopwords.words() if hasattr(stopwords, 'words') else stopwords
    stop_words = set(words)

    doc = nlp(sentence)

    # NOTE(review): WordNetLemmatizer is backed by the English WordNet and is applied
    # before lowercasing, so it probably leaves most French / capitalised words
    # untouched — confirm whether a French lemmatizer was intended.
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(str(token)).lower()
        for token in doc
        if not token.is_punct and not token.like_num and token.text != 'n'
    ]

    # Single filtering pass: remove stop words and the 'sautdeligne' marker.
    return [
        lemma for lemma in lemmas
        if lemma not in stop_words and lemma != 'sautdeligne'
    ]

# Vectorization parameters
# Range (inclusive) of n-gram sizes for tokenizing text.
NGRAM_RANGE = (1, 2)
# Upper bound on the number of features kept.
TOP_K = 20000
# Whether the text is split into word n-grams or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'
# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2

In [8]:
# Drop every row that has a missing value in any column.
# NOTE(review): the index is not reset, so later label-based lookups such as
# df['new_text'][7] rely on that row label still being present.
df = df.dropna()

## La cellule suivante met du temps à s'exécuter (environ 5 min)

In [9]:
# Clean every text block with spacy_tokenizer; each entry of new_text is a list of tokens.
text = df["text"].tolist()
new_text = [spacy_tokenizer(block, stopwords) for block in text]

In [10]:
# Re-join each token list into one space-separated string and store it next to the raw text.
df['new_text'] = list(map(' '.join, new_text))

In [11]:
# Preview the first two rows, now including the cleaned `new_text` column.
df.head(2)

Unnamed: 0,label,text,file,new_text
0,info_perso,Maeva DENISE SAUTDELIGNE 2 Rue Robert Schuman ...,CV ASSISTANTE DE DIRECTION _ DENISE Maëva.jpg,maeva denise rue robert schuman villeneuve-l...
1,skills,LOGICIELS MAITRISES: WORD -- EXCEL -- SAGE -- ...,CV ASSISTANTE DE DIRECTION _ DENISE Maëva.jpg,logiciels maitrises word excel sage powerpoint...


In [12]:
# Number of rows left after the rows with missing values were dropped.
print(len(df), ' rows in our DB')

1116  rows in our DB


In [13]:
# Inspect one cleaned sample (the row labelled 7) to see what the cleaning leaves behind.
df['new_text'][7]

'competences   langues anglais langue maternelle   creole jamaique sierra leone   teed     poular guinee   >   sos   boc   susu guinee     bot   mende sierra leone   informatique logiciels     bot   pack office dax   lv talent ardian tetrawin simm   akio     bot     bot     bot   ss     ss     ss  '

#### Il reste pas mal de data cleaning à faire

## Modeling

In [14]:
# Hold-out split by position (no shuffling): the first 820 rows are used for training,
# every remaining row for validation.
data_train, data_valid = df.iloc[:820], df.iloc[820:]

In [15]:
def ngram_vectorize(train_texts, train_labels, val_texts):
    """Vectorize texts as TF-IDF n-gram vectors and keep the best-scoring features.

    Each text becomes one TF-IDF vector over the n-gram vocabulary learned from the
    training texts (n-gram sizes are given by NGRAM_RANGE). At most TOP_K features,
    ranked by the ANOVA F-score (`f_classif`) against the training labels, are kept.

    # Inputs
      train_texts: iterable of training text strings.
      train_labels: training labels, one per training text.
      val_texts: iterable of validation text strings.
    # Outputs
      x_train, x_val: float32 feature matrices for the training and validation texts.
      vectorizer: the fitted TfidfVectorizer.
      selector: the fitted SelectKBest feature selector.
    """
    # Local import: numpy is not part of the import cell at the top of this notebook.
    import numpy as np

    # Parameters passed to the TF-IDF vectorizer.
    kwargs = {
        'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
        # TF-IDF weights are real-valued. The previous 'int32' is not an accepted
        # float dtype: scikit-learn warns and silently falls back to float64, so
        # asking for float64 explicitly gives the same numbers without the warning.
        'dtype': np.float64,
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': TOKEN_MODE,  # Split text into word tokens.
        'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn the vocabulary from the training texts only, then vectorize both splits
    # with it so that no information from the validation texts leaks into the vocabulary.
    x_train = vectorizer.fit_transform(train_texts)
    x_val = vectorizer.transform(val_texts)

    # Select the top 'k' of the vectorized features (k is capped by the vocabulary size).
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    return x_train, x_val, vectorizer, selector

In [16]:
# Vectorize the cleaned text of both splits; the fitted vectorizer and selector are
# kept because they are pickled and reused for prediction further down.
x_train, x_val, vectorizer, selector = ngram_vectorize(data_train["new_text"], data_train["label"], data_valid["new_text"])



## Choix du modèle
### a - Naive Bayes

In [17]:
# Targets for both splits, then a multinomial Naive Bayes classifier fitted on the
# training vectors and used to predict the validation vectors.
y_train, y_test = data_train["label"], data_valid["label"]
NB_clf = MultinomialNB().fit(x_train, y_train)
predicted = NB_clf.predict(x_val)

In [18]:
# Per-class precision / recall / F1 of the Naive Bayes predictions on the validation split.
print(metrics.classification_report(y_test, predicted))

                precision    recall  f1-score   support

experience_pro       0.81      1.00      0.89        68
     formation       1.00      0.91      0.95        65
    info_perso       1.00      0.95      0.98        65
        skills       0.95      0.88      0.91        98

      accuracy                           0.93       296
     macro avg       0.94      0.93      0.93       296
  weighted avg       0.94      0.93      0.93       296



### b - Bagging

In [19]:
# Bagging ensemble of 50 decision trees, trained on all available cores (n_jobs=-1).
# random_state is pinned so that the bootstrap samples, and therefore the trained model
# that gets pickled below and the metrics printed here, are reproducible across runs.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=50,
    n_jobs=-1,
    random_state=42,
)
bag_clf.fit(x_train, y_train)
predicted = bag_clf.predict(x_val)
print(metrics.classification_report(y_test, predicted))

                precision    recall  f1-score   support

experience_pro       0.95      0.93      0.94        68
     formation       0.95      0.97      0.96        65
    info_perso       0.90      0.98      0.94        65
        skills       0.99      0.94      0.96        98

      accuracy                           0.95       296
     macro avg       0.95      0.95      0.95       296
  weighted avg       0.95      0.95      0.95       296



## Save/Load model

In [29]:
# File names for the three pickled objects: the classifier, the TF-IDF vectorizer
# and the feature selector.
nlp_model = 'model_block_classification.pkl'
vectorizer_model = 'vectorizer_block_classification.pkl'
selector_model = 'selector_block_classification.pkl'

In [30]:
# Save the model to disk, together with the fitted vectorizer and selector that are
# needed to preprocess new text the same way at prediction time.
# Each file is opened in a `with` block so the handle is flushed and closed right away
# (the previous inline open(...) calls never closed their files).
for artifact, path in (
    (bag_clf, nlp_model),
    (vectorizer, vectorizer_model),
    (selector, selector_model),
):
    with open(path, 'wb') as fh:
        pickle.dump(artifact, fh)

In [28]:
# Load the model from disk.
# The path is `nlp_model` (the file written by the save cell); the previous `filename`
# is not defined anywhere in this notebook and raised NameError on a fresh kernel.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open(nlp_model, 'rb') as fh:
    loaded_model = pickle.load(fh)
# Sanity check: the reloaded model must be able to predict on the validation vectors.
result = loaded_model.predict(x_val)

## Apply model on new data

In [25]:
def new_predict(model, sample, vectorizer, selector):
    """Predict the block label(s) of new text with the trained pipeline.

    # Inputs
      model: fitted classifier exposing `.predict` (e.g. the unpickled model).
      sample: text to classify; it is wrapped in a pandas Series before vectorizing.
      vectorizer: fitted vectorizer returned by training (exposes `.transform`).
      selector: fitted feature selector returned by training (exposes `.transform`).
    # Outputs
      whatever `model.predict` returns for the preprocessed sample (the block label(s)).
    """
    # Same preprocessing chain as at training time: vectorize, then select features.
    features = selector.transform(vectorizer.transform(pd.Series(sample)))
    return model.predict(features)

In [26]:
# Try the reloaded model on the cleaned text of the row labelled 0
# (new_predict wraps the sample in a Series itself).
new_predict(loaded_model, df.loc[0, 'new_text'], vectorizer, selector)

array(['info_perso'], dtype=object)

In [4]:
import numpy as np
# Rebuild by hand the array shown as the output of the new_predict call above.
l = np.array(['info_perso'], dtype=object)

In [6]:
# Convert the NumPy array into a plain Python list.
list(l)

['info_perso']