In [10]:
import joblib
import pandas as pd
import numpy as np
import string
import unicodedata
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords 
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from joblib import dump, load
import pickle

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# Import CSV with text clean and spellchecked
file = '/content/drive/MyDrive/OFF_Drive/raw_data/ocr_labeled_spellcheck.csv'
data = pd.read_csv(file)


In [4]:
data = data.copy()

In [5]:
data.shape

(434896, 6)

In [6]:
data.head()

Unnamed: 0,barcode,clean_text,fr_text,source,pnns_groups_1,pnns_groups_2
0,3199660476748,ne eleve abattu en bretagne les eleveurs de br...,NE\nELEVE\nABATTU\nEN BRETAGNE\nLES ÉLEVEURS\n...,/319/966/047/6748/1.json,fish meat eggs,meat
1,3199660219192,mer chant local decoupes de poulet conditionne...,Ker\nchant\n100% LOCAL\nDecoupes de\nPOULET\nC...,/319/966/021/9192/1.json,fish meat eggs,meat
2,3199660219192,cuis dej plat sat merchant ex origine france v...,1CUIS.DEJ. PLT SAT\nKERCHANT FX 1\nORIGINE Fra...,/319/966/021/9192/2.json,fish meat eggs,meat
3,3199660747848,mer chant local decoupes de poulet cooduogne p...,Ker\nChant\n100% LOGAL\nDécoupes de\nPOULET\nC...,/319/966/074/7848/1.json,fish meat eggs,meat
4,3196203800091,preparation deshydratee aromatisee et coloree ...,"PREPARATION DÉSHYDRATÉE, AROMATISÉE ET COLOREE...",/319/620/380/0091/1.json,sugary snacks,sweets


In [7]:
# Give a number to each catégories (for classification report)
category_codes_pnns_2 = {n: i for i, n in enumerate(sorted(data['pnns_groups_2'].unique()))}


In [8]:
# Holdout
X = data[['clean_text']]

y = data['pnns_groups_2']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 1)

In [11]:
# Pipeline with stop words in tdidf with french nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('french'))
ridge_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 2))),
                ('Ridge', RidgeClassifier()),
            ])
  
# train the model using X_dtm & y
ridge_pipeline.fit(X_train.squeeze(), y_train)
# Squeeze is to get the correct shape for X (if none I have a 1 column after my rows)
# compute the testing accuracy
prediction = ridge_pipeline.predict(X_test.squeeze())
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))
print(classification_report(y_test, prediction, target_names=category_codes_pnns_2))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Test accuracy is 0.8454805356061593
                                  precision    recall  f1-score   support

                      appetizers       0.88      0.80      0.84      3575
artificially sweetened beverages       0.80      0.59      0.68       615
              biscuits and cakes       0.86      0.91      0.89     11401
                           bread       0.91      0.83      0.87      3311
               breakfast cereals       0.90      0.90      0.90      2149
                         cereals       0.84      0.82      0.83      5257
                          cheese       0.89      0.89      0.89      5937
              chocolate products       0.90      0.90      0.90      2554
                  dairy desserts       0.87      0.77      0.82      2943
            dressings and sauces       0.87      0.83      0.85      5200
                    dried fruits      

In [12]:
# Cross validation
from sklearn.model_selection import cross_validate

# 5-Fold Cross validate model
cv_results = cross_validate(ridge_pipeline, X_train.squeeze(), y_train, cv=5)

# Scores
cv_results['test_score']

# Mean of scores
cv_score = cv_results['test_score'].mean()

cv_score

0.8380137086094722

In [13]:
# all the scores
cv_results['test_score']



array([0.83726965, 0.83948691, 0.83889299, 0.83784183, 0.83657715])

In [14]:
model = ridge_pipeline.fit(X_train.squeeze(), y_train)

In [15]:
#Export model, joblib format
joblib.dump(model, '/content/drive/My Drive/OFF_Drive/raw_data/models/best_ridge_model_pnns_2_on_spellcheck.joblib') 

['/content/drive/My Drive/OFF_Drive/raw_data/models/best_ridge_model_pnns_2_on_spellcheck.joblib']

In [None]:
#to load jobil model
#loaded_model = joblib.load('/content/drive/My Drive/OFF_Drive/raw_data/models/ridge_only_pnns_2.joblib')
#result = loaded_model.score(X_test.squeeze(), y_test)
#loaded_model.predict([['clean_text'][0]])

In [16]:

# #Export model, pikle format
pkl_filename = '/content/drive/My Drive/OFF_Drive/raw_data/models/best_ridge_model_pnns_2_on_spellcheck.pkl'
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

# Load from file
#with open(pkl_filename, 'rb') as file:
    #pickle_model = pickle.load(file)
    

#Ypredict = pickle_model.predict(Xtest)