In [21]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [22]:
df=pd.read_csv("Final_data.csv")

In [23]:
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Post Text,Category
0,0,Testing requirements in toys and games category,My product is domino sets for adults. I am the...,Amazon Custom
1,1,Food and Drug Administration,"Hello, I am on my way to becoming an Amazon de...",Amazon Custom
2,2,Images Storage for mass upload,When uploading products with the spreadsheet w...,Amazon Custom
3,3,Prepaid Return Label program email clarification,You may have received an email from Amazon inf...,Amazon Custom
4,4,Pulling Amazon Custom Personalizatoin Data fro...,The problem: Currently Amazon gives you links ...,Amazon Custom


In [24]:
subdata = df[["Title", "Category"]]
subdata.head()

Unnamed: 0,Title,Category
0,Testing requirements in toys and games category,Amazon Custom
1,Food and Drug Administration,Amazon Custom
2,Images Storage for mass upload,Amazon Custom
3,Prepaid Return Label program email clarification,Amazon Custom
4,Pulling Amazon Custom Personalizatoin Data fro...,Amazon Custom


In [25]:
subdata['Category'] = subdata['Category'].astype('category')
subdata['Label'] = subdata['Category'].cat.codes.astype('int32')
subdata.sample(5)

Unnamed: 0,Title,Category,Label
1657,Violation of Amazon Marketplace Fair Pricing P...,Account Health,0
399,Question : AMAZONE’s packing service,Fulfillment By Amazon,2
71,Seller Support for Amazon Custom Sellers,Amazon Custom,1
1757,Orders for Out of Stock product,Account Health,0
1690,"Suspended, need advice and clarification",Account Health,0


In [26]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [29]:
tokenized = subdata["Title"].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [30]:
tokenized

0       [101, 5604, 5918, 1999, 10899, 1998, 2399, 469...
1                      [101, 2833, 1998, 4319, 3447, 102]
2         [101, 4871, 5527, 2005, 3742, 2039, 11066, 102]
3       [101, 17463, 14326, 2709, 3830, 2565, 10373, 1...
4       [101, 4815, 9733, 7661, 3167, 21335, 3406, 237...
                              ...                        
1810    [101, 1037, 6731, 4070, 2029, 2322, 1003, 2013...
1811       [101, 2342, 2393, 2007, 4070, 8636, 5574, 102]
1812                  [101, 7775, 4031, 3343, 13302, 102]
1813    [101, 1022, 2243, 2006, 2907, 1011, 14768, 612...
1814                 [101, 4070, 26709, 6593, 21967, 102]
Name: Title, Length: 1815, dtype: object

In [34]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

padded

array([[  101,  5604,  5918, ...,     0,     0,     0],
       [  101,  2833,  1998, ...,     0,     0,     0],
       [  101,  4871,  5527, ...,     0,     0,     0],
       ...,
       [  101,  7775,  4031, ...,     0,     0,     0],
       [  101,  1022,  2243, ...,     0,     0,     0],
       [  101,  4070, 26709, ...,     0,     0,     0]])

In [35]:
np.array(padded).shape

(1815, 46)

In [36]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(1815, 46)

In [38]:
input_ids = torch.tensor(padded).long()
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [39]:
last_hidden_states

(tensor([[[-0.1835, -0.0325, -0.0348,  ..., -0.3988,  0.1470,  0.3601],
          [ 0.4766,  0.0457, -0.1548,  ..., -0.4201,  0.3834, -0.1362],
          [ 0.1143,  0.2136, -0.0381,  ..., -0.4316, -0.3119,  0.0553],
          ...,
          [ 0.0537, -0.0326,  0.1003,  ..., -0.1306,  0.0262,  0.1715],
          [ 0.1237,  0.0145,  0.3692,  ..., -0.2551,  0.1161,  0.0741],
          [ 0.2589,  0.0167,  0.1903,  ..., -0.2201, -0.0424,  0.0101]],
 
         [[-0.2713, -0.0437, -0.6490,  ..., -0.2204,  0.2402,  0.4417],
          [-0.0381,  0.9230, -0.4835,  ..., -0.2055,  0.3211, -0.7571],
          [-0.3356,  0.3825, -0.5847,  ...,  0.3219,  0.2007,  0.2282],
          ...,
          [ 0.2153, -0.1145, -0.0715,  ...,  0.1220, -0.2260,  0.2741],
          [ 0.2495, -0.1307, -0.0541,  ...,  0.0763, -0.2571,  0.3050],
          [ 0.2494, -0.0476, -0.1152,  ...,  0.0678, -0.2254,  0.2886]],
 
         [[-0.3204, -0.2543, -0.0767,  ..., -0.2676, -0.1091,  0.2475],
          [ 0.1039,  0.0143,

In [40]:
features = last_hidden_states[0][:,0,:].numpy()
features

array([[-0.18352728, -0.03250957, -0.03482962, ..., -0.39881077,
         0.1470425 ,  0.36006016],
       [-0.27133703, -0.04370546, -0.6490413 , ..., -0.22036885,
         0.24016523,  0.44169772],
       [-0.3203987 , -0.2542809 , -0.07665624, ..., -0.26764527,
        -0.10913481,  0.24754797],
       ...,
       [-0.176929  ,  0.03628302, -0.4426356 , ..., -0.21058069,
        -0.02595328,  0.26093438],
       [-0.30844954, -0.19637042, -0.2913593 , ..., -0.02660279,
         0.14479697,  0.5965741 ],
       [-0.4690847 , -0.164107  , -0.15658677, ..., -0.18426396,
         0.01099475,  0.34982154]], dtype=float32)

In [50]:
features = last_hidden_states[0][:,0,:].numpy()

In [51]:
labels = subdata['Label']

In [52]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [53]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [54]:
lr_clf.score(test_features, test_labels)

0.7731277533039648