In [1]:
import pandas as pd
import nltk as nl
import numpy as np

# Steps required for sentiment analysis
## 1. Data Cleaning: Tokenization, stopword removal, stemming
## 2. Vectorization: Text vectorization
## 3. Text Classification: Based on content 

# Data Cleaning

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
df = fetch_20newsgroups()

In [4]:
type(df)

sklearn.utils.Bunch

In [5]:
rdf = df.data[:4]
rdf

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

## Stage 1: Convert into lower text

In [6]:
clean_df1 = []
def to_lower(data):
    for words in data:
        clean_df1.append(str.lower(words))

In [7]:
to_lower(rdf)

## Stage 2: Tokenization

In [8]:
ct2 = []
from nltk.tokenize import sent_tokenize,word_tokenize
nl.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/prajualpillai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Sentance Tokenize: The sentances are broken up and stored as a 2d array

In [9]:
sent_tok = []
for sent in clean_df1:
    sent = sent_tokenize(sent)
    sent_tok.append(sent)
sent_tok

[["from: lerxst@wam.umd.edu (where's my thing)\nsubject: what car is this!?",
  'nntp-posting-host: rac3.wam.umd.edu\norganization: university of maryland, college park\nlines: 15\n\n i was wondering if anyone out there could enlighten me on this car i saw\nthe other day.',
  'it was a 2-door sports car, looked to be from the late 60s/\nearly 70s.',
  'it was called a bricklin.',
  'the doors were really small.',
  'in addition,\nthe front bumper was separate from the rest of the body.',
  'this is \nall i know.',
  'if anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.',
  'thanks,\n- il\n   ---- brought to you by your neighborhood lerxst ----'],
 ['from: guykuo@carson.u.washington.edu (guy kuo)\nsubject: si clock poll - final call\nsummary: final call for si clock reports\nkeywords: si,acceleration,clock,upgrade\narticle-i.d.',
  ': shelley.1qvfo9innc3s\norganization

Word Tokenize: Each word in each sentance is broken up and is stored as a 2d array 

In [10]:
# Word Tokenize
ct2 = [word_tokenize(i) for i in clean_df1]


In [11]:
ct2

[['from',
  ':',
  'lerxst',
  '@',
  'wam.umd.edu',
  '(',
  'where',
  "'s",
  'my',
  'thing',
  ')',
  'subject',
  ':',
  'what',
  'car',
  'is',
  'this',
  '!',
  '?',
  'nntp-posting-host',
  ':',
  'rac3.wam.umd.edu',
  'organization',
  ':',
  'university',
  'of',
  'maryland',
  ',',
  'college',
  'park',
  'lines',
  ':',
  '15',
  'i',
  'was',
  'wondering',
  'if',
  'anyone',
  'out',
  'there',
  'could',
  'enlighten',
  'me',
  'on',
  'this',
  'car',
  'i',
  'saw',
  'the',
  'other',
  'day',
  '.',
  'it',
  'was',
  'a',
  '2-door',
  'sports',
  'car',
  ',',
  'looked',
  'to',
  'be',
  'from',
  'the',
  'late',
  '60s/',
  'early',
  '70s',
  '.',
  'it',
  'was',
  'called',
  'a',
  'bricklin',
  '.',
  'the',
  'doors',
  'were',
  'really',
  'small',
  '.',
  'in',
  'addition',
  ',',
  'the',
  'front',
  'bumper',
  'was',
  'separate',
  'from',
  'the',
  'rest',
  'of',
  'the',
  'body',
  '.',
  'this',
  'is',
  'all',
  'i',
  'know',
  '

## Removing punctuations and extra expressions

In [12]:
import re
cl3 = []
for words in ct2:
    c = []
    for w in words:
        res = re.sub(r'[^\w\s]',"",w)
        if res != "":
            c.append(res)
    cl3.append(c)


In [13]:
cl3

[['from',
  'lerxst',
  'wamumdedu',
  'where',
  's',
  'my',
  'thing',
  'subject',
  'what',
  'car',
  'is',
  'this',
  'nntppostinghost',
  'rac3wamumdedu',
  'organization',
  'university',
  'of',
  'maryland',
  'college',
  'park',
  'lines',
  '15',
  'i',
  'was',
  'wondering',
  'if',
  'anyone',
  'out',
  'there',
  'could',
  'enlighten',
  'me',
  'on',
  'this',
  'car',
  'i',
  'saw',
  'the',
  'other',
  'day',
  'it',
  'was',
  'a',
  '2door',
  'sports',
  'car',
  'looked',
  'to',
  'be',
  'from',
  'the',
  'late',
  '60s',
  'early',
  '70s',
  'it',
  'was',
  'called',
  'a',
  'bricklin',
  'the',
  'doors',
  'were',
  'really',
  'small',
  'in',
  'addition',
  'the',
  'front',
  'bumper',
  'was',
  'separate',
  'from',
  'the',
  'rest',
  'of',
  'the',
  'body',
  'this',
  'is',
  'all',
  'i',
  'know',
  'if',
  'anyone',
  'can',
  'tellme',
  'a',
  'model',
  'name',
  'engine',
  'specs',
  'years',
  'of',
  'production',
  'where',
 

## Removing stopwords

In [14]:
nl.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prajualpillai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
from nltk.corpus import stopwords

In [16]:
cl4 = []
for words in cl3:
    w = []
    for word in words:
        if word not in stopwords.words('english'):
            w.append(word)
    cl4.append(w)

In [17]:
cl4

[['lerxst',
  'wamumdedu',
  'thing',
  'subject',
  'car',
  'nntppostinghost',
  'rac3wamumdedu',
  'organization',
  'university',
  'maryland',
  'college',
  'park',
  'lines',
  '15',
  'wondering',
  'anyone',
  'could',
  'enlighten',
  'car',
  'saw',
  'day',
  '2door',
  'sports',
  'car',
  'looked',
  'late',
  '60s',
  'early',
  '70s',
  'called',
  'bricklin',
  'doors',
  'really',
  'small',
  'addition',
  'front',
  'bumper',
  'separate',
  'rest',
  'body',
  'know',
  'anyone',
  'tellme',
  'model',
  'name',
  'engine',
  'specs',
  'years',
  'production',
  'car',
  'made',
  'history',
  'whatever',
  'info',
  'funky',
  'looking',
  'car',
  'please',
  'email',
  'thanks',
  'il',
  'brought',
  'neighborhood',
  'lerxst'],
 ['guykuo',
  'carsonuwashingtonedu',
  'guy',
  'kuo',
  'subject',
  'si',
  'clock',
  'poll',
  'final',
  'call',
  'summary',
  'final',
  'call',
  'si',
  'clock',
  'reports',
  'keywords',
  'si',
  'acceleration',
  'clock',

## Stemming

In [18]:
from nltk.stem.porter import PorterStemmer

In [19]:
port = PorterStemmer()

In [20]:
a = [port.stem(i) for i in ['reading','wash','tilts']]
a

['read', 'wash', 'tilt']

In [21]:
cl5 = []
for words in cl4:
    w = []
    for word in words:
        w.append(port.stem(word))
    cl5.append(w)

In [22]:
cl5

[['lerxst',
  'wamumdedu',
  'thing',
  'subject',
  'car',
  'nntppostinghost',
  'rac3wamumdedu',
  'organ',
  'univers',
  'maryland',
  'colleg',
  'park',
  'line',
  '15',
  'wonder',
  'anyon',
  'could',
  'enlighten',
  'car',
  'saw',
  'day',
  '2door',
  'sport',
  'car',
  'look',
  'late',
  '60',
  'earli',
  '70',
  'call',
  'bricklin',
  'door',
  'realli',
  'small',
  'addit',
  'front',
  'bumper',
  'separ',
  'rest',
  'bodi',
  'know',
  'anyon',
  'tellm',
  'model',
  'name',
  'engin',
  'spec',
  'year',
  'product',
  'car',
  'made',
  'histori',
  'whatev',
  'info',
  'funki',
  'look',
  'car',
  'pleas',
  'email',
  'thank',
  'il',
  'brought',
  'neighborhood',
  'lerxst'],
 ['guykuo',
  'carsonuwashingtonedu',
  'guy',
  'kuo',
  'subject',
  'si',
  'clock',
  'poll',
  'final',
  'call',
  'summari',
  'final',
  'call',
  'si',
  'clock',
  'report',
  'keyword',
  'si',
  'acceler',
  'clock',
  'upgrad',
  'articleid',
  'shelley1qvfo9innc3',


## Lemmetisation

In [25]:
from nltk.stem import WordNetLemmatizer as wnl

In [26]:
wnet = wnl()

In [28]:
nl.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/prajualpillai/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [33]:
lem = []
for words in cl5:
    w = []
    for word in words:
        w.append(wnet.lemmatize(word))
    lem.append(w)
lem

[['lerxst',
  'wamumdedu',
  'thing',
  'subject',
  'car',
  'nntppostinghost',
  'rac3wamumdedu',
  'organ',
  'univers',
  'maryland',
  'colleg',
  'park',
  'line',
  '15',
  'wonder',
  'anyon',
  'could',
  'enlighten',
  'car',
  'saw',
  'day',
  '2door',
  'sport',
  'car',
  'look',
  'late',
  '60',
  'earli',
  '70',
  'call',
  'bricklin',
  'door',
  'realli',
  'small',
  'addit',
  'front',
  'bumper',
  'separ',
  'rest',
  'bodi',
  'know',
  'anyon',
  'tellm',
  'model',
  'name',
  'engin',
  'spec',
  'year',
  'product',
  'car',
  'made',
  'histori',
  'whatev',
  'info',
  'funki',
  'look',
  'car',
  'plea',
  'email',
  'thank',
  'il',
  'brought',
  'neighborhood',
  'lerxst'],
 ['guykuo',
  'carsonuwashingtonedu',
  'guy',
  'kuo',
  'subject',
  'si',
  'clock',
  'poll',
  'final',
  'call',
  'summari',
  'final',
  'call',
  'si',
  'clock',
  'report',
  'keyword',
  'si',
  'acceler',
  'clock',
  'upgrad',
  'articleid',
  'shelley1qvfo9innc3',
 

In [32]:
print(cl5[:1])

[['lerxst', 'wamumdedu', 'thing', 'subject', 'car', 'nntppostinghost', 'rac3wamumdedu', 'organ', 'univers', 'maryland', 'colleg', 'park', 'line', '15', 'wonder', 'anyon', 'could', 'enlighten', 'car', 'saw', 'day', '2door', 'sport', 'car', 'look', 'late', '60', 'earli', '70', 'call', 'bricklin', 'door', 'realli', 'small', 'addit', 'front', 'bumper', 'separ', 'rest', 'bodi', 'know', 'anyon', 'tellm', 'model', 'name', 'engin', 'spec', 'year', 'product', 'car', 'made', 'histori', 'whatev', 'info', 'funki', 'look', 'car', 'pleas', 'email', 'thank', 'il', 'brought', 'neighborhood', 'lerxst']]


# Classification

## Naive Bayes Algorithm

In [37]:
from sklearn.preprocessing import LabelEncoder as le
from sklearn.model_selection import train_test_split as tts

# Sentiment Analysis

## Vectorize

In [76]:
from sklearn.feature_extraction.text import CountVectorizer as cv
cv1 = cv(ngram_range=(1,2))

In [79]:
fin = cv1.fit_transform(cl5[0]).toarray()

In [80]:
fin

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [82]:
print(cv1.get_feature_names())

['15', '2door', '60', '70', 'addit', 'anyon', 'bodi', 'bricklin', 'brought', 'bumper', 'call', 'car', 'colleg', 'could', 'day', 'door', 'earli', 'email', 'engin', 'enlighten', 'front', 'funki', 'histori', 'il', 'info', 'know', 'late', 'lerxst', 'line', 'look', 'made', 'maryland', 'model', 'name', 'neighborhood', 'nntppostinghost', 'organ', 'park', 'pleas', 'product', 'rac3wamumdedu', 'realli', 'rest', 'saw', 'separ', 'small', 'spec', 'sport', 'subject', 'tellm', 'thank', 'thing', 'univers', 'wamumdedu', 'whatev', 'wonder', 'year']


# Using BERT

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [9]:
tk = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [59]:
tk1 = tk.encode("travando...pelo valor ta Boa.",return_tensors='pt')
r = model(tk1)
r.logits
int(torch.argmax(r.logits))+1

3

In [60]:
tk1

tensor([[  101, 11234, 12459, 10351,   119,   119,   119, 12108, 17859, 10546,
         28986,   119,   102]])

In [62]:
df = pd.read_csv("/Users/prajualpillai/Desktop/work/archive/olist_order_reviews_dataset.csv")
df.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [64]:
df1 = df[df["review_comment_message"].isnull()==False]
df1.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53
9,8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4,recomendo,aparelho eficiente. no site a marca do aparelh...,2018-05-22 00:00:00,2018-05-23 16:45:47
12,4b49719c8a200003f700d3d986ea1a19,9d6f15f95d01e79bd1349cc208361f09,4,,"Mas um pouco ,travando...pelo valor ta Boa.\r\n",2018-02-16 00:00:00,2018-02-20 10:52:22
15,3948b09f7c818e2d86c9a546758b2335,e51478e7e277a83743b6f9991dbfa3fb,5,Super recomendo,"Vendedor confiável, produto ok e entrega antes...",2018-05-23 00:00:00,2018-05-24 03:00:01


In [65]:
def senan(txt):
    tk2 = tk.encode(txt,return_tensors="pt")
    r = model(tk2)
    r.logits
    return(int(torch.argmax(r.logits))+1)

In [72]:
d = {"Score":[]}

In [None]:
for i in df1["review_comment_message"]:
    d["Score"].append(senan(i))

In [83]:
d

{'Score': [3,
  5,
  3,
  3,
  4,
  1,
  1,
  5,
  1,
  5,
  5,
  1,
  1,
  4,
  5,
  5,
  5,
  1,
  5,
  5,
  5,
  2,
  1,
  4,
  3,
  4,
  5,
  5,
  5,
  1,
  5,
  1,
  3,
  1,
  5,
  5,
  1,
  4,
  1,
  5,
  4,
  5,
  4,
  2,
  5,
  1,
  5,
  1,
  5,
  5,
  3,
  5,
  1,
  1,
  5,
  5,
  3,
  5,
  1,
  1,
  1,
  5,
  5,
  3,
  3,
  5,
  2,
  1,
  1,
  5,
  3,
  5,
  5,
  1,
  5,
  4,
  5,
  1,
  5,
  3,
  3,
  2,
  5,
  1,
  1,
  5,
  1,
  5,
  1,
  1,
  1,
  5,
  4,
  4,
  1,
  4,
  3,
  5,
  5,
  1,
  5,
  1,
  1,
  5,
  1,
  1,
  5,
  5,
  2,
  2,
  5,
  4,
  4,
  5,
  1,
  1,
  1,
  2,
  5,
  5,
  1,
  5,
  4,
  1,
  5,
  1,
  1,
  4,
  5,
  4,
  3,
  1,
  5,
  1,
  5,
  5,
  5,
  1,
  5,
  5,
  5,
  1,
  1,
  1,
  5,
  2,
  3,
  5,
  2,
  4,
  5,
  2,
  3,
  5,
  1,
  1,
  1,
  2,
  5,
  5,
  1,
  1,
  1,
  4,
  1,
  5,
  4,
  1,
  5,
  3,
  1,
  5,
  4,
  1,
  5,
  4,
  3,
  5,
  5,
  5,
  1,
  4,
  5,
  3,
  1,
  5,
  3,
  4,
  4,
  1,
  5,
  3,
  1,
  1,
  5,
  5,
  5,
  5,
 

# Transfer Learning through BERT

In [8]:
from transformers import AutoModelForPreTraining  # Or BertForPreTraining for loading pretraining heads
model = AutoModelForPreTraining.from_pretrained('neuralmind/bert-base-portuguese-cased')
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)


Downloading: 100%|██████████| 647/647 [00:00<00:00, 157kB/s]
Downloading: 100%|██████████| 438M/438M [01:11<00:00, 6.13MB/s]
Some weights of BertForPreTraining were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading: 100%|██████████| 43.0/43.0 [00:00<00:00, 10.9kB/s]
Downloading: 100%|██████████| 210k/210k [00:00<00:00, 563kB/s]
Downloading: 100%|██████████| 2.00/2.00 [00:00<00:00, 590B/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 26.3kB/s]


In [6]:
from transformers import pipeline  #for accessing pretrained model

In [11]:
x_tr = tk(X_train,truncation=True,padding=True)
x_tst = tk(X_test,truncation=True,padding=True)

NameError: name 'X_train' is not defined

In [10]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

#model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

NameError: name 'train_dataset' is not defined