# traditional machine learning methods for text classification

In [1]:
from datasets import load_dataset

# Load the train / validation / test splits of the GLUE SST-2 dataset,
# one `load_dataset` call per split, in that order.
train_dataset, val_dataset, test_dataset = (
    load_dataset("glue", "sst2", split=split_name)
    for split_name in ("train", "validation", "test")
)

Reusing dataset glue (/home/wzm289/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/wzm289/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/wzm289/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [14]:
# Display the training split summary (feature names and number of rows).
train_dataset

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

## extracting text features

In [15]:
# Bag of words: represent each sentence by how often each vocabulary
# token occurs in it.

from sklearn.feature_extraction.text import CountVectorizer

train_sentences = train_dataset['sentence']

# Learn the vocabulary from the training sentences and build the
# document-term count matrix in a single pass.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_sentences)
X_train_counts.shape

(67349, 13774)

In [17]:
# Raw occurrence counts are a good start, but there is an issue: longer
# documents have higher average counts than shorter ones.
# So we rescale the counts to term frequencies; use_idf=False keeps the
# inverse-document-frequency reweighting switched off.

from sklearn.feature_extraction.text import TfidfTransformer

tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)
X_train_tf.shape


(67349, 13774)

In [27]:
# The separate fit(...) and transform(...) steps can be combined into a
# single fit_transform(...) call.
# Note: unlike `tf_transformer`, this transformer is created with its default
# arguments (use_idf is not disabled), so `X_train_tfidf` is tf-idf weighted
# and is not numerically the same matrix as `X_train_tf`.

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape



(67349, 13774)

## training a classifier

In [20]:
# First classifier: multinomial naive Bayes trained on the tf-idf features.

from sklearn.naive_bayes import MultinomialNB

train_labels = train_dataset['label']
clf = MultinomialNB().fit(X_train_tfidf, train_labels)

## building a pipeline

In [31]:
# Sanity check: classify the first two validation examples by hand.
first_two = val_dataset[:2]
print(first_two)

# Reuse the already-fitted vectorizer and tf-idf transformer
# (transform only, no refitting) before calling the classifier.
X_new_counts = count_vect.transform(first_two['sentence'])
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)
print(predicted)

{'sentence': ["it 's a charming and often affecting journey . ", 'unflinchingly bleak and desperate '], 'label': [1, 0], 'idx': [0, 1]}
[1 0]


## evaluation of performance

In [34]:
# Chain vectorizer -> transformer -> classifier into a single pipeline so
# that raw sentences can be passed straight to fit / predict.

from sklearn.pipeline import Pipeline

text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [35]:
# Fit the whole naive Bayes pipeline on the raw training sentences and labels.
text_clf.fit(train_dataset['sentence'],train_dataset['label'])

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

## evaluation of the performance on the validation set

In [44]:
import numpy as np

# Accuracy of the naive Bayes pipeline on the validation split:
# the fraction of predictions that match the gold labels.
predicted = text_clf.predict(val_dataset['sentence'])
is_correct = predicted == val_dataset['label']
np.mean(is_correct)

0.8004587155963303

In [48]:
from sklearn.linear_model import SGDClassifier

# Same feature extraction as the naive Bayes pipeline, but with a linear
# classifier trained by stochastic gradient descent. With its default loss
# (hinge, per the scikit-learn documentation) SGDClassifier is a linear SVM.
# SGD shuffles the training data, so a fixed random_state is required for the
# validation accuracy to be reproducible across runs.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=42)),
])

In [49]:
# Fit the SGD-based pipeline on the raw training sentences and labels.
text_clf_svm.fit(train_dataset['sentence'],train_dataset['label'])

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier())])

In [50]:
# Accuracy of the SGD pipeline on the validation split.
# `predicted` is overwritten here with the SGD pipeline's predictions.
predicted = text_clf_svm.predict(val_dataset['sentence'])
is_correct = predicted == val_dataset['label']
np.mean(is_correct)

0.8211009174311926

In [52]:
from sklearn import metrics

# Per-class precision / recall / F1 on the validation split for whatever
# predictions are currently stored in `predicted`.
report_text = metrics.classification_report(val_dataset['label'], predicted)
print(report_text)

              precision    recall  f1-score   support

           0       0.83      0.79      0.81       428
           1       0.81      0.85      0.83       444

    accuracy                           0.82       872
   macro avg       0.82      0.82      0.82       872
weighted avg       0.82      0.82      0.82       872



## grid search


In [53]:
from sklearn.model_selection import GridSearchCV

# Search space for the pipeline. Each key is "<step name>__<parameter>",
# where the step names are the ones given when the Pipeline was built.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
)

In [55]:
# Run the grid search over the naive Bayes pipeline (`text_clf`) using only
# the first 400 training examples, with 5-fold cross-validation on all cores.
grid_subset = train_dataset[:400]
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(grid_subset['sentence'], grid_subset['label'])

In [56]:
# Mean cross-validated score of the best parameter combination found by the
# grid search (computed on the 400-example subset it was fitted on).
gs_clf.best_score_

0.6875

## 