Setup

In [46]:
!pip install transformers



In [47]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

Import DF1 dataframe

In [48]:
from google.colab import files
uploaded = files.upload()
import io

Saving DF1.csv to DF1 (4).csv


In [49]:
# data_7a = pd.read_csv("DF1_team7a.csv")
# data_7b = pd.read_csv("DF1_team7b.csv")
# data_7c = pd.read_csv("DF1_team7c.csv")
# data_7c = data_7c.dropna()
# DF1 = pd.concat([data_7a, data_7b, data_7c], ignore_index=True)
# DF1 = DF1.drop(columns=['Sub-category', 'URL'])
# DF1.to_csv('DF1.csv', index=False)
# files.download('DF1.csv')
DF1 = pd.read_csv("DF1.csv")
DF1.head(1818)

Unnamed: 0,Title,Category,Post Text
0,Testing requirements in toys and games category,Amazon Custom,My product is domino sets for adults. I am the...
1,Food and Drug Administration,Amazon Custom,"Hello, I am on my way to becoming an Amazon de..."
2,Images Storage for mass upload,Amazon Custom,When uploading products with the spreadsheet w...
3,Prepaid Return Label program email clarification,Amazon Custom,You may have received an email from Amazon inf...
4,Pulling Amazon Custom Personalizatoin Data fro...,Amazon Custom,The problem: Currently Amazon gives you links ...
...,...,...,...
1810,A suspended account which 20% from his profits...,Account Health,20% of the profits are for charity. Can I expe...
1811,Need help with account suspension Appeal,Account Health,"Hello members,\nI am requesting for some advic..."
1812,Restricted Product Policy Violations,Account Health,So I am a new seller and just noticed that i h...
1813,8k on hold - sourcing from Home Depot - inauth...,Account Health,"Hi,\nIm a new seller on Amazon and have been s..."


Remove stop words from DF1

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
DF1['Post Text'] = DF1['Post Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
DF1.head()

Unnamed: 0,Title,Category,Post Text
0,Testing requirements in toys and games category,Amazon Custom,My product domino sets adults. I exclusive bra...
1,Food and Drug Administration,Amazon Custom,"Hello, I way becoming Amazon dealer. My accoun..."
2,Images Storage for mass upload,Amazon Custom,When uploading products spreadsheet found best...
3,Prepaid Return Label program email clarification,Amazon Custom,You may received email Amazon informing requir...
4,Pulling Amazon Custom Personalizatoin Data fro...,Amazon Custom,The problem: Currently Amazon gives links cust...


DistilBERT setup

In [50]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Process dataframe (post text)

In [None]:
# Tokenization
tokenized = DF1["Post Text"].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=150)))

In [None]:
# Padding
max_len = 150
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
np.array(padded).shape

(1815, 150)

In [None]:
# Attention mask
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(1815, 150)

In [None]:
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

Save processed post text

In [None]:
features = last_hidden_states[0][:,0,:].numpy()
pd.DataFrame(features).to_csv('BERT_data_nostop.csv', index=False)
files.download('BERT_data_nostop.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Import processed post text (and DF1) from files

In [51]:
uploaded = files.upload()

Saving BERT_data_nostop.csv to BERT_data_nostop (2).csv


In [66]:
features = pd.read_csv("BERT_data_nostop.csv")

Train model: split into training and testing groups

In [71]:
labels = DF1["Category"]
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

Find best parameters for classification (logistic regression)

In [74]:
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)
print('best scores: ', grid_search.best_score_)

best parameters:  {'C': 5.263252631578947}
best scores:  0.8214635854341736


Logistic regression for classification

In [77]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [78]:
lr_clf.score(test_features, test_labels)

0.8744493392070485

KNN for classification

In [79]:
#Import knearest neighbors Classifier model
from sklearn.neighbors import KNeighborsClassifier

#Create KNN Classifier
knn = KNeighborsClassifier(n_neighbors=10, weights='distance')

#Train the model using the training sets
knn.fit(train_features, train_labels)

#Predict the response for test dataset
cat_predict = knn.predict(test_features)

In [80]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
metrics.accuracy_score(test_labels, cat_predict)

0.7533039647577092

Random forest for classification

In [81]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(train_features, train_labels)

cat_predict=clf.predict(test_features)

In [82]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
metrics.accuracy_score(test_labels, cat_predict)

0.7775330396475771