In [1]:
!pip install pandas
!pip install nltk
!pip install scikit-learn
!pip install tiktoken

import pandas as pd
import tiktoken
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def tokenizer(X):
    enc = tiktoken.encoding_for_model("gpt-4")
    stop_words = set(stopwords.words('english'))
    num_tokens = 0
    num_sentences = 0
    tokenized_documents = []
    for text in X:
        tokenized_document = enc.encode(text)

        filtered_tokens = [token for token in tokenized_document if token not in stop_words]

        num_tokens += len(filtered_tokens)
        num_sentences += text.count('.') + text.count('!') + text.count('?')
        
        tokenized_documents.append(filtered_tokens)

    print("Number of tokens: ", num_tokens)
    print("Number of sentences: ", num_sentences)
    
    return tokenized_documents

import sklearn.feature_extraction.text as sk_text

def vectorizer(tokenized_documents):
    # Conversion des bytes en chaînes de caractères si nécessaire
    tokenized_documents_str = []
    for document in tokenized_documents:
        if isinstance(document, list):
            # Convertir les bytes en chaînes de caractères
            tokenized_document_str = [token.decode('utf-8') if isinstance(token, bytes) else str(token) for token in document]
            tokenized_documents_str.append(tokenized_document_str)
        else:
            # Si le document n'est pas une liste, le convertir en liste de chaîne de caractères
            tokenized_documents_str.append([str(document)])

    # Initialiser le vectoriseur
    vectorizer = sk_text.CountVectorizer(lowercase=False, preprocessor=None, tokenizer=lambda x: x)
    
    # Adapter et transformer les données
    X = vectorizer.fit_transform(tokenized_documents_str)
    
    return X, vectorizer



[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# NLTK TOKENIZER AND TF-IDF VECTORIZER

In [2]:
import sys
sys.path.append('../')
from tokenizer import tokenizer
from vectorizer import vectorizer

import sklearn.model_selection
import numpy as np
import pandas as pd
import sklearn as sk

[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
data = pd.read_csv('../../_data/Reviews.csv') # Loading the dataset
X, y = data['Text'], data['Score']

In [4]:
# # FOR TESTING : only select first 20000 samples
# X, y = X[:20000], y[:20000]

In [5]:
tokenized_documents = tokenizer(X)

Number of tokens:  58325048
Number of sentences:  3661772


In [6]:
X, vect = vectorizer(tokenized_documents)



In [7]:
# most frequent words
print("Top 10 most frequent words in the dataset")
print(vect.get_feature_names_out()[:10])

# least frequent words
print("Top 10 least frequent words in the dataset")
print(vect.get_feature_names_out()[-10:])

Top 10 most frequent words in the dataset
['0' '1' '10' '100004' '100005' '10001' '100011' '100012' '100014'
 '100016']
Top 10 least frequent words in the dataset
['99975' '9998' '99981' '99984' '99985' '99986' '9999' '99990' '99992'
 '99994']


In [8]:
assert X.shape[0] == y.shape[0], "Le nombre d'échantillons dans X et y est différent."

# Divisez les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

print(X_train[0])

(454763, 53351) (113691, 53351) (454763,) (113691,)
  (0, 34404)	1
  (0, 2531)	1
  (0, 23716)	1
  (0, 19965)	2
  (0, 22878)	1
  (0, 23523)	1
  (0, 899)	2
  (0, 5502)	1
  (0, 1672)	1
  (0, 30112)	1
  (0, 0)	1
  (0, 51388)	1
  (0, 39869)	1
  (0, 20900)	1
  (0, 932)	2
  (0, 52850)	1
  (0, 36993)	1
  (0, 14313)	1
  (0, 6293)	1
  (0, 24280)	1
  (0, 2731)	1
  (0, 656)	1
  (0, 1347)	1


# Naive Bayes Classifier
### Model starts here

In [11]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes
nb = MultinomialNB()
naive_bayes = nb.fit(X_train, y_train)
predicted = naive_bayes.predict(X_test)

# Metrics
from sklearn import metrics
print(metrics.confusion_matrix(y_test, predicted))
print(metrics.classification_report(y_test, predicted))

[[ 6373  1573   728   386  1266]
 [ 1211  2081  1023   586   954]
 [ 1036   967  2989  1710  1783]
 [  834   563  1499  6223  7004]
 [ 2762   772  1549  7658 60161]]
              precision    recall  f1-score   support

           1       0.52      0.62      0.57     10326
           2       0.35      0.36      0.35      5855
           3       0.38      0.35      0.37      8485
           4       0.38      0.39      0.38     16123
           5       0.85      0.83      0.84     72902

    accuracy                           0.68    113691
   macro avg       0.50      0.51      0.50    113691
weighted avg       0.69      0.68      0.69    113691



In [12]:
example = '''
awful
'''

# tokenize and vectorize it, then try to predict
test = tokenizer([example])
test_tfidf = vect.transform(test)
print(naive_bayes.predict(test_tfidf))

Number of tokens:  4
Number of sentences:  0
[5]
