In [55]:
import pandas as pd
import cudf as cd
import numpy as np
import cupy as cp
import cuml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from scipy.sparse import hstack
from matplotlib import pyplot as plt
import seaborn as sns
import eli5

In [56]:
# All three splits live in the same input directory; keep the path in one place.
DATA_DIR = '../input/amazon-pet-product-reviews-classification'

# `id` becomes the index of every frame. Missing values (in any column) are
# replaced with a single space so that later string operations on the text
# column (e.g. `.split()`) never hit NaN.
train = pd.read_csv(f'{DATA_DIR}/train.csv', index_col='id').fillna(' ')
valid = pd.read_csv(f'{DATA_DIR}/valid.csv', index_col='id').fillna(' ')
test = pd.read_csv(f'{DATA_DIR}/test.csv', index_col='id').fillna(' ')

In [57]:
# Peek at the first five training rows.
train.head(5)

I'll merge the train and validation files and run cross-validation on the combined set.

In [58]:
# Stack the validation rows underneath the training rows (row-wise concat;
# the original `id` index values of both frames are kept as they are).
train_val = pd.concat(objs=[train, valid], axis=0)

In [59]:
# Class balance of the combined train+val labels.
# The labels are passed explicitly as `x=`: a positional vector is deprecated
# in seaborn 0.11 and interpreted as `data` from seaborn 0.12 onwards.
ax = sns.countplot(x=train_val['label'])
ax.set_title('Train+val: Target distribution');

Comparing text lengths in words (log-scaled counts), we can see that test texts are generally shorter.

In [60]:
# Word-count histograms for train+val (left) and test (right).
# One figure with two explicit Axes; each panel is drawn by the same code
# instead of switching the implicit "current axes" with plt.subplot().
fig, axes = plt.subplots(1, 2)
panels = [(train_val, 'Train & val'), (test, 'Test')]
for ax, (frame, panel_title) in zip(axes, panels):
    # Number of whitespace-separated tokens per text.
    word_counts = frame['text'].apply(lambda text: len(text.split()))
    word_counts.plot(kind='hist', ax=ax)
    ax.set_yscale('log')  # long tail of very long texts: log-scale the counts
    ax.set_title(panel_title)
    ax.set_xlabel('Words per text')
fig.tight_layout();

I'll be using a Tf-Idf vectorizer.

In [61]:
# Word-level TF-IDF over unigrams and bigrams, lower-cased, with English stop
# words removed and the vocabulary capped at 150,000 features.
text_transformer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
    max_features=150000,
)

In [62]:
%%time
X_train_text = text_transformer.fit_transform(train_val['text'])
X_test_text = text_transformer.transform(test['text'])

In [63]:
# (rows, features) of the train+val and test feature matrices.
(X_train_text.shape, X_test_text.shape)

As for the model, I picked logistic regression.

In [64]:
# Multinomial logistic regression with the L-BFGS solver; C=50 is the inverse
# regularisation strength, and the seed is fixed for reproducibility.
logit = LogisticRegression(
    C=50.0,
    solver='lbfgs',
    multi_class='multinomial',
    random_state=17,
    n_jobs=4,
)

*Cross-validation*

In [65]:
# Five shuffled, stratified folds with a fixed seed.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=17,
)

In [66]:
%%time
cv_results = cross_val_score(logit, X_train_text, train_val['label'], cv=skf, scoring='f1_micro')

In [67]:
# Per-fold scores on the first line, their mean on the second.
print(cv_results, cv_results.mean(), sep='\n')

Training the model on train + val.

In [68]:
%%time
logit.fit(X_train_text, train_val['label'])

*Interpreting the model weights with ELI5 – they look reasonable.*

In [69]:
# Map weight indices back to the n-grams learned by the vectorizer, then
# render the fitted model's weights with ELI5.
# NOTE(review): top=(50, 5) presumably means 50 positive / 5 negative features
# per class - confirm against the eli5 documentation.
feature_names = list(text_transformer.get_feature_names_out())
eli5.show_weights(estimator=logit, feature_names=feature_names, top=(50, 5))

*Preparing submission.*

In [70]:
# Predicted label for every row of the test feature matrix.
test_preds = logit.predict(X=X_test_text)

In [71]:
# Preview the submission. Predictions are indexed by the real test ids
# (`test` was loaded with index_col='id' and X_test_text keeps its row order)
# rather than by a fresh 0..n-1 range.
pd.DataFrame({'label': test_preds}, index=test.index).head()

In [72]:
# Write the submission file. The `id` column must carry the ids from the test
# file, so the predictions are indexed by `test.index` (X_test_text was built
# from `test['text']` in the same row order) instead of a default RangeIndex.
submission = pd.DataFrame({'label': test_preds}, index=test.index)
submission.to_csv('logit_tf_idf_starter_submission.csv', index_label='id')