# Simple classifier

In [1]:
# Execute the shared helper notebooks inside this notebook's namespace (-i),
# so the names they define are available to the cells below.
# NOTE(review): `small_model`, used later to build the POS vectorizer, is not
# defined anywhere in this notebook - presumably it comes from lang_utils; confirm.
%run -i "../util/file_utils.ipynb"
%run -i "../util/lang_utils.ipynb"

c:\Users\ravik\Documents\nlp_cookbook\Python-Natural-Language-Processing-Cookbook-Second-Edition\Chapter03


In [3]:
from datasets import load_dataset
# Load the first and last 15% of the train and test splits of the
# rotten_tomatoes dataset; the two slices of each split are concatenated.
train_dataset, test_dataset = (
    load_dataset("rotten_tomatoes", split=split_spec)
    for split_spec in ("train[:15%]+train[-15%:]", "test[:15%]+test[-15%:]")
)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 8530/8530 [00:00<00:00, 289815.33 examples/s]
Generating validation split: 100%|██████████| 1066/1066 [00:00<00:00, 52678.98 examples/s]
Generating test split: 100%|██████████| 1066/1066 [00:00<00:00, 57623.57 examples/s]


In [4]:
# Sanity check: number of examples in the train subset, then the test subset.
print(len(train_dataset), len(test_dataset), sep="\n")

2560
320


In [5]:
class POS_vectorizer:
    """Turn a text into a vector of its token count plus part-of-speech counts.

    The wrapped model is called on the raw text and must return a sized,
    iterable document whose tokens expose a ``pos_`` attribute (the parameter
    name suggests a spaCy pipeline).

    Args:
        spacy_model: Callable that maps a string to a document of tokens.
        pos_tags: Optional iterable of POS tag names to count, in the order
            they should appear in the vector. Duplicates are ignored.
            Defaults to ``DEFAULT_POS_TAGS``.
    """

    # Tags counted when the caller does not supply their own list.
    DEFAULT_POS_TAGS = (
        "VERB", "NOUN", "PROPN", "ADJ", "ADV", "AUX", "PRON", "NUM", "PUNCT",
    )

    def __init__(self, spacy_model, pos_tags=None):
        self.model = spacy_model
        if pos_tags is None:
            pos_tags = self.DEFAULT_POS_TAGS
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # vector layout is stable and has one slot per distinct tag.
        self.pos_tags = tuple(dict.fromkeys(pos_tags))

    def vectorize(self, input_text):
        """Return ``[token_count, count(tag_1), ..., count(tag_k)]`` for a text.

        Args:
            input_text: Text passed as-is to the wrapped model.

        Returns:
            A list of ints: the number of tokens in the document followed by
            one count per configured POS tag, in configuration order. Tokens
            whose tag is not configured only contribute to the token count.
        """
        doc = self.model(input_text)
        counts = dict.fromkeys(self.pos_tags, 0)
        for token in doc:
            if token.pos_ in counts:
                counts[token.pos_] += 1
        return [len(doc), *counts.values()]

In [6]:
# Build the vectorizer, then run it on the first training example as a smoke test.
vectorizer = POS_vectorizer(small_model)
sample_text = train_dataset[0]["text"]
vector = vectorizer.vectorize(sample_text)

In [7]:
# Show the sample text followed by its POS-count vector, one per line.
print(sample_text, vector, sep="\n")

the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
[38, 3, 8, 2, 5, 1, 3, 1, 0, 5]


In [8]:
import pandas as pd
import numpy as np
# Convert the datasets to pandas and shuffle the training rows.
# DataFrame.sample returns a new frame instead of shuffling in place, so the
# result must be assigned back (the bare call previously had no effect).
# random_state makes the shuffle reproducible; reset_index keeps a clean
# 0..n-1 index after the rows are reordered.
train_df = (
    train_dataset.to_pandas()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
test_df = test_dataset.to_pandas()

# One POS-count feature vector per text, stored alongside the raw rows.
train_df["vector"] = train_df["text"].apply(vectorizer.vectorize)
test_df["vector"] = test_df["text"].apply(vectorizer.vectorize)

# Stack the per-row vectors into 2-D feature matrices and pull out the labels.
X_train = np.stack(train_df["vector"].values, axis=0)
X_test = np.stack(test_df["vector"].values, axis=0)
y_train = train_df["label"].to_numpy()
y_test = test_df["label"].to_numpy()

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Fit a logistic-regression classifier (C=0.1) on the training matrix;
# fit() hands back the estimator, so construction and fitting can be chained.
clf = LogisticRegression(C=0.1).fit(X_train, y_train)

In [11]:
# Predict the whole test matrix in one call instead of invoking clf.predict
# once per row. X_test was stacked from test_df["vector"], so its rows line up
# with test_df and the predictions can be assigned positionally.
test_df["prediction"] = clf.predict(X_test)
print(classification_report(test_df["label"], test_df["prediction"]))

              precision    recall  f1-score   support

           0       0.58      0.56      0.57       160
           1       0.58      0.59      0.58       160

    accuracy                           0.58       320
   macro avg       0.58      0.58      0.58       320
weighted avg       0.58      0.58      0.58       320



In [12]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [13]:
def load_train_test_dataset_pd(random_state=None):
    """Load the rotten_tomatoes subsets and return them as pandas DataFrames.

    The first and last 15% of the train split, and likewise of the test
    split, are loaded with ``load_dataset``. The training rows are shuffled.

    Args:
        random_state: Optional seed forwarded to ``DataFrame.sample``. Pass an
            int for a reproducible shuffle; the default ``None`` shuffles
            differently on each call.

    Returns:
        Tuple ``(train_df, test_df)`` of DataFrames.
    """
    train_dataset = load_dataset(
        "rotten_tomatoes", split="train[:15%]+train[-15%:]")
    test_dataset = load_dataset(
        "rotten_tomatoes", split="test[:15%]+test[-15%:]")
    # DataFrame.sample returns a new frame rather than shuffling in place, so
    # the result has to be kept (the bare call previously had no effect).
    # reset_index restores the 0..n-1 index callers saw before.
    train_df = (
        train_dataset.to_pandas()
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )
    test_df = test_dataset.to_pandas()
    return (train_df, test_df)

In [14]:
def create_train_test_data(train_df, test_df, vectorize):
    """Vectorize both frames and return feature matrices and label arrays.

    Each frame gains a "vector" column (added in place) holding
    ``vectorize(text)`` for every value of its "text" column.

    Args:
        train_df: DataFrame with "text" and "label" columns.
        test_df: DataFrame with "text" and "label" columns.
        vectorize: Callable mapping one text to a sequence of numbers; all
            results must have the same shape so they can be stacked.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``: the stacked vectors of
        each frame followed by the "label" columns as NumPy arrays.
    """
    def to_vector(text):
        return vectorize(text)

    frames = (train_df, test_df)
    for frame in frames:
        frame["vector"] = frame["text"].apply(to_vector)

    X_train, X_test = (
        np.stack(frame["vector"].values, axis=0) for frame in frames
    )
    y_train, y_test = (frame["label"].to_numpy() for frame in frames)
    return (X_train, X_test, y_train, y_test)

In [15]:
def train_classifier(X_train, y_train):
    """Fit and return a LogisticRegression(C=0.1) classifier.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels, one per row of ``X_train``.

    Returns:
        The fitted classifier.
    """
    # fit() returns the estimator itself, so the chained call yields the model.
    return LogisticRegression(C=0.1).fit(X_train, y_train)

In [16]:
def test_classifier(test_df, clf):
    """Predict labels for the test frame and print a classification report.

    A "prediction" column is added to ``test_df`` in place.

    Args:
        test_df: DataFrame with a "vector" column of feature vectors and a
            "label" column of true labels.
        clf: Fitted classifier exposing ``predict``.

    Returns:
        None. The report is printed.
    """
    # One batched predict call over all rows instead of one call per row;
    # the result is assigned positionally, so row order is preserved.
    test_df["prediction"] = clf.predict(test_df["vector"].tolist())
    print(classification_report(test_df["label"], test_df["prediction"]))
