# NLP Basic Assignment
## 과제 : spam.csv를 활용하여 유의미한 해석을 도출해주세요!

In [None]:
!pip install transformers
!pip install huggingface_hub
!pip install gensim=='3.8.3'
!pip install -U sentence-transformers
!pip install accelerate -U
!pip install datasets
!pip install evaluate

In [3]:
import pandas as pd

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Data
- 보시면 아시다시피 spam.csv는 라벨이 있는 데이터입니다.
- 7주차 주제가 텍스트 기초인만큼 텍스트만 활용하셔도 되고 라벨까지 활용하셔서 모델을 돌려보셔도 좋습니다.

In [5]:
spam = pd.read_csv('/content/drive/MyDrive/tobigs/Data/nlp_basic.csv')

In [6]:
spam.iloc[5]['v2']

"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"

In [7]:
spam.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
len(spam)

5572

## Tokenizing


In [9]:
import nltk

In [10]:
# 예시 코드 코드
from nltk.tokenize import word_tokenize

nltk.download('punkt')
word_tokenize(spam.iloc[5]['v2'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['FreeMsg',
 'Hey',
 'there',
 'darling',
 'it',
 "'s",
 'been',
 '3',
 'week',
 "'s",
 'now',
 'and',
 'no',
 'word',
 'back',
 '!',
 'I',
 "'d",
 'like',
 'some',
 'fun',
 'you',
 'up',
 'for',
 'it',
 'still',
 '?',
 'Tb',
 'ok',
 '!',
 'XxX',
 'std',
 'chgs',
 'to',
 'send',
 ',',
 'å£1.50',
 'to',
 'rcv']

In [11]:
# BERT Tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-tiny')

In [12]:
target_str = spam.iloc[5]['v2']
target_str

"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"

In [24]:
encoded = tokenizer(target_str)
encoded

{'input_ids': [101, 2489, 5244, 2290, 4931, 2045, 9548, 2009, 1005, 1055, 2042, 1017, 2733, 1005, 1055, 2085, 1998, 2053, 2773, 2067, 999, 1045, 1005, 1040, 2066, 2070, 4569, 2017, 2039, 2005, 2009, 2145, 1029, 26419, 7929, 999, 22038, 2595, 2358, 2094, 10381, 5620, 2000, 4604, 1010, 1037, 29646, 2487, 1012, 2753, 2000, 22110, 2615, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [25]:
for id in encoded['input_ids']:
    print(f'"{tokenizer.decode(id)}"', end = ', ')

"[CLS]", "free", "##ms", "##g", "hey", "there", "darling", "it", "'", "s", "been", "3", "week", "'", "s", "now", "and", "no", "word", "back", "!", "i", "'", "d", "like", "some", "fun", "you", "up", "for", "it", "still", "?", "tb", "ok", "!", "xx", "##x", "st", "##d", "ch", "##gs", "to", "send", ",", "a", "##£", "##1", ".", "50", "to", "rc", "##v", "[SEP]", 

## Embedding

- 수업에서 다룬 임베딩 방법에는 One-hot encoding, CBOW, Skip-gram 등이 있었습니다. 다양한 시도와 '비교' 결과를 함께 적어주세요! 파라미터를 조정해가는 과정도 해석에 도움이 될 수 있겠죠 :)

In [None]:
# Embed the sentence using the pre-trained word2vec
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download

#Loading a Word2Vec model
model = KeyedVectors.load_word2vec_format(hf_hub_download(repo_id="Word2vec/wikipedia2vec_arwiki_20180420_100d", filename="arwiki_20180420_100d.txt"))

In [38]:
model.most_similar('hi')

[('go', 0.7739439606666565),
 ('ooh', 0.7566691040992737),
 ('hsb', 0.7495405077934265),
 ('so', 0.7474092841148376),
 ('kore', 0.7431032061576843),
 ('hang', 0.7416202425956726),
 ('starter', 0.7404250502586365),
 ('ashita', 0.7401474714279175),
 ('me', 0.7383584380149841),
 ('tho', 0.7373984456062317)]

In [41]:
for word in target_str.split(' '):
    try:
        # word embedding
        vec = model[word]
        print(f'word: {word}, vec[:10]: {vec[:10]}')
    # some words are not in the vocab
    except Exception as e:
        print(e)

"Key 'FreeMsg' not present"
"Key 'Hey' not present"
word: there, vec[:10]: [-0.8966 -0.0358  0.7854 -0.0293 -0.1218 -0.6679 -0.8995  0.8348  0.4141
 -0.3574]
word: darling, vec[:10]: [-0.7025  0.175   0.0626 -0.0861 -0.4899 -0.2946 -0.3967  0.2077 -0.1214
 -0.0009]
"Key 'it's' not present"
word: been, vec[:10]: [-1.0919 -0.024   1.1496 -0.2857 -0.4854 -0.756  -0.5196  1.1824  0.3442
 -0.3629]
word: 3, vec[:10]: [-0.465  -0.0794  0.6326 -0.69    0.7387  0.244  -0.0094  0.1505 -0.2595
  0.1857]
"Key 'week's' not present"
word: now, vec[:10]: [-0.9067 -0.0573  0.902   0.1639 -0.1583 -0.4109 -0.5007  0.2495 -0.1016
  0.23  ]
word: and, vec[:10]: [-0.7477 -0.1937  0.6352  0.0077  0.0272 -0.4782 -0.5761  0.0088 -0.0032
 -0.2501]
word: no, vec[:10]: [-0.4827  0.5021  1.2702  0.0138  0.2415 -1.0082 -0.7159  0.8782  0.0924
 -0.2948]
word: word, vec[:10]: [-0.7973  0.3063  0.6215 -0.2969  0.205  -0.1138 -0.0753  0.4263  0.4284
  0.7411]
"Key 'back!' not present"
"Key 'I'd' not present"
word: lik

In [29]:
# Embed the sentence using the pre-trained sentence transformer
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

sentence = [target_str]

#Sentences are encoded by calling model.encode()
embedding = model.encode(sentence)
embedding.shape

(1, 384)

## 본인이 도출해낸 해석을 적어주세요!

- 유사도, Wordcloud, 이진 분류 모델, Plot 뭐든 상관없으니 분명하고 인상적인 해석을 적어주시면 됩니다.

In [None]:
# Binary Classification with BERT

In [13]:
# data preprocessing
data = spam.values.tolist()

X = []
Y = []

for y, x in data:
    X.append(x)
    Y.append(y)

print(X[:2])
print(Y[:2])

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...']
['ham', 'ham']


In [14]:
import numpy as np
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [15]:
print(X_train[:2])
print(Y_train[:2])
print(X_test[:2])
print(Y_test[:2])

["No I'm in the same boat. Still here at my moms. Check me out on yo. I'm half naked.", '(Bank of Granite issues Strong-Buy) EXPLOSIVE PICK FOR OUR MEMBERS *****UP OVER 300% *********** Nasdaq Symbol CDGT That is a $5.00 per..']
['ham', 'spam']
['Funny fact Nobody teaches volcanoes 2 erupt, tsunamis 2 arise, hurricanes 2 sway aroundn no 1 teaches hw 2 choose a wife Natural disasters just happens', 'I sent my scores to sophas and i had to do secondary application for a few schools. I think if you are thinking of applying, do a research on cost also. Contact joke ogunrinde, her school is one me the less expensive ones']
['ham', 'ham']


In [16]:
label_dict = {'ham':0, 'spam':1}

In [17]:
Y_train = list(map(lambda x: label_dict[x], Y_train))
Y_test = list(map(lambda x: label_dict[x], Y_test))

In [79]:
import torch
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import numpy as np

class SpamDataset(Dataset):
    def __init__(self, sequences, labels, tokenizer):
        tokenized_sequences = tokenizer(sequences,
                                      max_length = 128,
                                      truncation=True,
                                        padding=True)
        self.input_ids = tokenized_sequences['input_ids']
        self.labels = labels

    def __getitem__(self, index):
        x = torch.LongTensor(self.input_ids[index])
        y = torch.LongTensor([self.labels[index]])
        return x, y

    def __len__(self):
        return len(self.input_ids)

In [80]:
train_dataset = SpamDataset(X_train, Y_train, tokenizer)
test_dataset = SpamDataset(X_test, Y_test, tokenizer)

In [81]:
train_dataset[1]

(tensor([  101,  1006,  2924,  1997,  9753,  3314,  2844,  1011,  4965,  1007,
         11355,  4060,  2005,  2256,  2372,  1008,  1008,  1008,  1008,  1008,
          2039,  2058,  3998,  1003,  1008,  1008,  1008,  1008,  1008,  1008,
          1008,  1008,  1008,  1008,  1008, 17235,  2850,  4160,  6454,  3729,
         13512,  2008,  2003,  1037,  1002,  1019,  1012,  4002,  2566,  1012,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [82]:
num_epochs = 10
batch_size = 32

In [83]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [115]:
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import math

device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
print(device)

model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=2)
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-3, no_deprecation_warning=True)
t_total = math.ceil(len(train_loader)) * num_epochs
scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(t_total/10),
            num_training_steps=t_total
        )

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

cuda


In [116]:
from transformers import Trainer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from torch.nn import CrossEntropyLoss

# Train
for epoch in range(10):
    running_loss = 0.
    for samples in train_loader:
        inputs, labels = samples
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(input_ids = inputs, labels= labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    scheduler.step()
    print(f'loss: {running_loss / len(train_loader)}')


loss: 0.7897333370787757
loss: 0.5062902669821466
loss: 0.38969149855630736
loss: 0.24221681569303785
loss: 0.10892420073172876
loss: 0.056110303316797526
loss: 0.05177069606392511
loss: 0.10835636939487553
loss: 0.13557865659157478
loss: 0.11237032514181919


In [126]:
model.eval()
predictions = []
target_labels = []

# Test
for samples in test_loader:
    inputs, labels = samples
    inputs = inputs.to(device)
    labels = labels.to(device)

    outputs = model(input_ids = inputs, labels= labels)
    logits = outputs.logits

    batch_predictions = [int(torch.argmax(logit).cpu()) for logit in logits]
    batch_labels = [int(example) for example in labels]

    predictions += batch_predictions
    target_labels += batch_labels


In [128]:
import evaluate

def compute_acc(predictions, target_labels):
    return (np.array(predictions) == np.array(target_labels)).mean()


def metrics(predictions, target_labels, average):
    f1_metric = evaluate.load("f1")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")

    acc = compute_acc(predictions, target_labels)
    precision = precision_metric.compute(
        predictions=predictions, references=target_labels, average=average)['precision']
    recall = recall_metric.compute(
        predictions=predictions, references=target_labels, average=average)['recall']
    f1 = f1_metric.compute(
        predictions=predictions, references=target_labels, average=average)['f1']

    return acc, precision, recall, f1

In [129]:
acc, precision, recall, f1 = metrics(predictions, target_labels, 'binary')

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [131]:
print(f'Acc: {acc}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {f1}')

Acc: 0.9865470852017937
Precision: 1.0
Recall: 0.9
F1: 0.9473684210526316
