In [1]:
import numpy as np
import nltk
import pandas as pd
import spacy
import re

#data scaling
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split , KFold, cross_val_score, LeaveOneOut
from sklearn.naive_bayes import MultinomialNB
from sklearn.calibration import CalibratedClassifierCV

# Please run the following command if needed: python -m spacy download fr_core_news_sm

In [2]:
# preprocessing
data = pd.read_csv("TrainingSet1.csv")

print( data )

#temporaire pour moins de données
#data = data.head()

intentions = data["intention"].unique()
data["intention"] = data["intention"].replace({"arrosage" : np.where( intentions == 'arrosage'),
                                              "soleil": np.where( intentions == 'soleil'), 
                                              "tailler" : np.where( intentions == 'tailler'),
                                              "temperature" : np.where( intentions == 'temperature'),
                                              "cadeaux" : np.where( intentions == 'cadeaux'),
                                              "varietes" : np.where( intentions == 'varietes'),
                                              "entretien" : np.where( intentions == 'entretien'),
                                               "utilisation" : np.where( intentions == 'utilisation'),
                                               "planter" : np.where( intentions == 'planter'),
                                               "maladies" : np.where( intentions == 'maladies'),
                                               "anecdotes" : np.where( intentions == 'anecdotes')
                                              })

                                              sentence  intention
0         Est ce que j'ai besoin d'arroser ma plante ?   arrosage
1                    Quand dois-je arroser ma plante ?   arrosage
2                          Dois-je arroser ma plante ?   arrosage
3                  Comment puis-je arroser ma plante ?   arrosage
4               Ma plante a besoin de beaucoup d'eau ?   arrosage
..                                                 ...        ...
226                       je veux savoir des anecdotes  anecdotes
227                            donne moi des anecdotes  anecdotes
228  quels sont les trucs cool à savoir sur ma plante?  anecdotes
229     dis moi des trucs cool à savoir sur ma plante?  anecdotes
230  raconte moi des trucs cool à savoir sur ma pla...  anecdotes

[231 rows x 2 columns]


In [3]:
y = list(data["intention"])

In [4]:
liste = data["sentence"].tolist()
j=0;

#dictionnaire des mots connus
words = list()

nlp_fr = spacy.load('fr_core_news_sm')

# process sentences
for i in data["sentence"]:
    # convert all letters to lower case
    i = i.lower()
    i = i.replace('-', ' ')
    
    regex = re.compile("plante([^r]|$)")
    i = regex.sub('', i)
    tokens = nlp_fr(i)

    new_sentence = ''
    for token in tokens:
            if (token.lemma_ != 't'): 
                new_sentence += str(token.lemma_) + ' '
                words.append(str(token.lemma_))
    
    if( new_sentence.strip() != "" ):
        liste[j] = new_sentence.strip();
        j += 1
    


In [12]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(liste).todense() #renvoie le bag of words

In [10]:
loo = LeaveOneOut()
loo.get_n_splits(X)

231

In [13]:
compt_svm = 0

for train_index, test_index in loo.split(X): 
    X_train = []
    y_train = []
    for index in train_index:
        X_train+=X[index].tolist()
        y_train.append(y[index])
    X_test = X[test_index].tolist()
    y_test = y[int(test_index)]
    
    clf_svm = CalibratedClassifierCV(svm.LinearSVC())
    clf_svm.fit(X_train, y_train)

    if int(clf_svm.predict(X_test)[0])==y_test:  # si prediction svm correcte
         compt_svm += 1 

In [14]:
print("Précision SVM :", compt_svm/len(X))

Précision SVM : 0.8614718614718615


In [15]:
from joblib import dump, load
dump(clf_svm, 'clf_svm.joblib')
dump(intentions, 'intentions.joblib')
dump(vectorizer, 'vectorizer.joblib')

['vectorizer.joblib']

In [16]:
# Avant de les charger à nouveau
clf_svm = load('clf_svm.joblib') 
intentions = load('intentions.joblib')

In [28]:
# Test with a simple sentence
sample_sentences = [ "j'aimerais entendre une anecdote", "quand dois je arroser ma plante ?", "les feuilles jaunissent", 
    "ma plante pousse mal",  "comment se débarasser des insectes ?" ]

for s in sample_sentences:
    # Pre-processing
    s = s.lower()

    regex = re.compile("plante([^r]|$)")
    s = regex.sub('', s)

    nlp_fr = "";
    nlp_fr = spacy.load('fr_core_news_sm')

    tokens = nlp_fr(s)

    words = list()

    # Lemmatize
    for token in tokens:
        if (token.lemma_ != 't'): 
            words.append(str(token.lemma_))


    j = 0;
    vector = vectorizer.get_feature_names()

    # Create vector
    for word in vector:
        vector[j] = words.count(word);           
        j += 1



    p = clf_svm.predict([vector])
    print(intentions[ int(p[0]) ])

anecdotes
arrosage
temperature
entretien
maladies


In [29]:
score = clf_svm.predict_proba([vector])
best = score[0][int(p[0])]

In [30]:
score

array([[0.03050785, 0.12579004, 0.05771501, 0.10133643, 0.05731796,
        0.0357291 , 0.12378126, 0.0875627 , 0.06567768, 0.28919623,
        0.02538573]])

In [31]:
score = np.delete(score, np.where(score == best))
second_best = np.amax(score)

print(best)
print(second_best)

0.2891962349602977
0.12579004165533272
