<a href="https://colab.research.google.com/github/ssooni/sentiment_analysis/blob/main/sentiment_analysis(English).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /dataset (Colab only).
from google.colab import drive
# Flush pending writes and unmount any previous mount before remounting.
drive.flush_and_unmount()
# force_remount=True remounts even if /dataset is already mounted.
drive.mount('/dataset', force_remount=True)

Drive not mounted, so nothing to flush and unmount.
Mounted at /dataset


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext.vocab import Vectors
from torchtext.data import TabularDataset, Field
from torchtext import data, datasets

import random
import os
import numpy as np
import pandas as pd
import json
from gensim.models import Word2Vec, KeyedVectors
from torchtext.vocab import GloVe

# --- Run configuration -------------------------------------------------------
BATCH_SIZE = 64      # examples per mini-batch
lr = 0.00001         # Adam learning rate (name kept lowercase: later cells read `lr`)
EPOCHS = 200         # maximum number of passes over the training split
SEED = 42            # single seed shared by every random number generator below
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")

# Seed every RNG the pipeline touches so that Restart & Run All reproduces the
# same shuffling and the same weight initialisation.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# List the raw dataset directory on the mounted Drive (IPython shell escape).
!ls -la "/dataset/My Drive/friends"

total 2774
-rw------- 1 root root  229392 Aug  6  2018 friends_dev.json
-rw------- 1 root root  544341 Aug  6  2018 friends_test.json
-rw------- 1 root root 2065470 Aug  6  2018 friends_train.json


In [None]:
def reform(filename):
  """Flatten a dialogue JSON file into one row per utterance.

  The file is expected to hold a list of dialogues, each dialogue being a list
  of dicts with the keys 'speaker', 'utterance', 'emotion' and 'annotation'.

  Args:
    filename: path of the JSON file to read (decoded as UTF-8).

  Returns:
    pandas.DataFrame with the columns 'speaker_utterance' (speaker name and
    utterance joined by a single space), 'emotion' and 'annotation', in the
    order the utterances appear in the file. The first rows are also printed.

  Raises:
    KeyError: if an utterance dict lacks one of the four expected keys.
  """
  # JSON is UTF-8 by specification; do not depend on the platform default.
  with open(filename, encoding="utf-8") as json_file:
    dialogues = json.load(json_file)

  header = ['speaker_utterance', 'emotion', 'annotation']
  rows = [
      [turn['speaker'] + " " + turn['utterance'], turn['emotion'], turn['annotation']]
      for dialogue in dialogues
      for turn in dialogue
  ]

  df = pd.DataFrame(rows, columns=header)
  print(df.head())
  return df

In [None]:
# Convert every raw JSON split to a CSV in the working directory, in the
# order train -> dev -> test.
for split in ("train", "dev", "test"):
    split_df = reform(f"/dataset/My Drive/friends/friends_{split}.json")
    split_df.to_csv(f"./friends_{split}.csv", index=None)

                                   speaker_utterance   emotion annotation
0  Chandler also I was the point person on my com...   neutral    4100000
1   The Interviewer You mustve had your hands full.   neutral    5000000
2                   Chandler That I did. That I did.   neutral    5000000
3  The Interviewer So lets talk a little bit abo...   neutral    5000000
4                    Chandler My duties?  All right.  surprise    2000030
                                   speaker_utterance      emotion annotation
0  Phoebe Oh my God, hes lost it. Hes totally l...  non-neutral    0002120
1                                       Monica What?     surprise    1000130
2  Ross Or! Or, we could go to the bank, close ou...      neutral    3000200
3                          Chandler Youre a genius!          joy    0500000
4       Joey Aww, man, now we wont be bank buddies!      sadness    0040100
                                   speaker_utterance      emotion annotation
0  Mark Why do al

In [None]:
# Check that the three CSV files were written to the working directory.
!ls -la

total 1100
drwxr-xr-x 1 root root   4096 Dec 23 11:00 .
drwxr-xr-x 1 root root   4096 Dec 23 10:40 ..
drwxr-xr-x 1 root root   4096 Dec 21 17:29 .config
-rw-r--r-- 1 root root  86609 Dec 23 11:00 friends_dev.csv
-rw-r--r-- 1 root root 212174 Dec 23 11:00 friends_test.csv
-rw-r--r-- 1 root root 805555 Dec 23 11:00 friends_train.csv
drwxr-xr-x 1 root root   4096 Dec 21 17:29 sample_data


In [None]:
# Non-sequential fields that are only referenced by the commented-out
# alternative `fields` list below.
i = Field(sequential=False, unk_token=None)
j = Field(sequential=False, unk_token=None)
speaker = Field(sequential=False, unk_token=None)
# Text field: spaCy tokenisation, lower-cased, wrapped in <sos>/<eos>, batch-first tensors.
utterance = Field(sequential=True, use_vocab=True, tokenize = "spacy", lower=True, init_token = '<sos>', eos_token = '<eos>', batch_first=True)
# Label fields: one categorical value per example, no <unk> entry in the vocabulary.
emotion = Field(sequential=False, use_vocab=True, batch_first=True, unk_token=None)
annotation = Field(sequential=False, use_vocab=True, batch_first=True, unk_token=None)

# Fields are matched to CSV columns by position, so the first CSV column
# (written as 'speaker_utterance' by reform) is exposed as `utterance`.
fields = [('utterance', utterance), ('emotion', emotion), ('annotation', annotation)]
#fields = [('i', i), ('j', j), ('speaker', speaker), ('utterance', utterance), ('emotion', emotion), ('annotation', annotation)]
# skip_header=True drops the header row of each CSV.
train_data, val_data, test_data = TabularDataset.splits(path="./",  train='friends_train.csv', test='friends_test.csv', validation='friends_dev.csv', format='csv', fields=fields, skip_header=True)

### 단어 사전 생성 
1. 3회 이상 나온 단어만 단어 사전에 수록 (`min_freq=3`)
2. 토크나이저는 spaCy를 사용하였음

In [None]:
# Build the vocabularies from the training split only.
annotation.build_vocab(train_data)
# Words seen fewer than 3 times are left out; rows of the embedding matrix are
# taken from the 300-dimensional GloVe 6B vectors.
utterance.build_vocab(train_data, vectors=GloVe(name='6B', dim=300), min_freq=3)
emotion.build_vocab(train_data)

vocab_size = len(utterance.vocab)
n_classes = len(emotion.vocab)

print('단어 집합의 크기 : {}'.format(vocab_size))
print('클래스 : {}'.format(n_classes))
print('{}'.format(utterance.vocab.freqs))

# Persist the vocabulary; the context manager closes the file even when
# torch.save raises.
with open('./vocab_list.pkl', 'wb') as output:
    torch.save(utterance.vocab, output)


단어 집합의 크기 : 2115
클래스 : 8


## GRU 


In [None]:
class GRU(nn.Module):
    """Unidirectional multi-layer GRU classifier over pretrained embeddings.

    Args:
        n_layers: number of stacked GRU layers.
        hidden_dim: size of the GRU hidden state.
        n_vocab: accepted for signature compatibility; the embedding table is
            built from the module-level ``utterance.vocab.vectors`` instead.
        embed_dim: input size of the GRU; must equal the width of the
            pretrained vectors.
        n_classes: number of output classes.
        dropout_p: dropout probability applied to the sequence representation
            before the output layer (active in training mode only).
    """

    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # Embeddings start from the pretrained vectors and are fine-tuned (freeze=False).
        self.embed = nn.Embedding.from_pretrained(utterance.vocab.vectors, freeze=False)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(embed_dim, self.hidden_dim, num_layers=self.n_layers, batch_first=True, bidirectional=False)
        self.out = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, x):
        """Return class logits of shape (batch, n_classes) for token ids ``x`` of shape (batch, seq_len)."""
        x = self.embed(x)
        h_0 = self._init_state(batch_size=x.size(0))
        x, _ = self.gru(x, h_0)
        # Top-layer output at the last time step.
        # NOTE(review): for padded batches this position may be a padding
        # token rather than the true end of the utterance - confirm whether
        # length-aware pooling is wanted.
        h_t = x[:, -1, :]
        # Previously the dropout layer was constructed but never applied, so
        # dropout_p had no effect; nn.Dropout is the identity in eval mode.
        h_t = self.dropout(h_t)
        logit = self.out(h_t)
        return logit

    def _init_state(self, batch_size=1):
        """Return an all-zero initial hidden state of shape (n_layers, batch_size, hidden_dim)."""
        # new_zeros inherits dtype and device from an existing parameter.
        weight = next(self.parameters())
        return weight.new_zeros(self.n_layers, batch_size, self.hidden_dim)

def train(model, optimizer, train_iter):
    """Run one optimisation pass over ``train_iter``.

    Each batch must expose ``utterance`` (model input) and ``emotion``
    (target class indices). The model is put in training mode and updated
    in place with the cross-entropy loss; nothing is returned.
    """
    model.train()
    for minibatch in train_iter:
        inputs = minibatch.utterance.to(DEVICE)
        targets = minibatch.emotion.to(DEVICE)

        optimizer.zero_grad()
        batch_loss = F.cross_entropy(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

def evaluate(model, val_iter):
    """Compute average loss and accuracy of ``model`` over ``val_iter``.

    Args:
        model: module mapping ``batch.utterance`` to class logits.
        val_iter: iterable of batches exposing ``utterance`` and ``emotion``;
            it must also provide ``dataset`` so the example count is known.

    Returns:
        (avg_loss, avg_accuracy) as Python floats: the summed cross-entropy
        divided by the number of examples, and the accuracy in percent.

    Raises:
        ZeroDivisionError: if ``val_iter.dataset`` is empty.
    """
    model.eval()
    corrects, total_loss = 0, 0.0

    # No gradients are needed for evaluation; skipping graph construction
    # saves memory and time.
    with torch.no_grad():
        for batch in val_iter:
            x, y = batch.utterance.to(DEVICE), batch.emotion.to(DEVICE)
            logit = model(x)
            loss = F.cross_entropy(logit, y, reduction='sum')
            total_loss += loss.item()
            predicted = logit.max(1)[1].view(y.size())
            # .item() keeps the counter a plain int instead of a tensor.
            corrects += (predicted == y).sum().item()

    size = len(val_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

## TRAIN
1. Hidden Layer dimension 조정
2. GRU Layer 수를 조정
3. Loss가 제일 작은 모델을 저장

In [None]:
# Track the best configuration seen so far (lowest validation loss).
best_val_loss = None
best_dim = None
best_n_layers = None
# Number of stacked GRU layers. Named explicitly so it no longer rebinds the
# module-level Field called `i`.
n_layers = 5
for hidden_dim in range(100, 150, 50):
    print(hidden_dim, n_layers)
    train_iter, val_iter, test_iter = data.BucketIterator.splits((train_data, val_data, test_data), shuffle=True, batch_size=BATCH_SIZE, repeat=False, sort=False)

    print('훈련 데이터의 미니 배치의 개수 : {}'.format(len(train_iter)))
    print('테스트 데이터의 미니 배치의 개수 : {}'.format(len(test_iter)))
    print('검증 데이터의 미니 배치의 개수 : {}'.format(len(val_iter)))

    # n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p
    model = GRU(n_layers, hidden_dim, vocab_size, 300, n_classes, 0.4).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # [epoch, n_layers, validation loss] for every epoch of this configuration.
    loss_list = list()
    for e in range(1, EPOCHS+1):
        train(model, optimizer, train_iter)
        val_loss, val_accuracy = evaluate(model, val_iter)

        loss_list.append([e, n_layers, val_loss])
        # Save the model whenever the validation loss improves. The explicit
        # `is None` test keeps a legitimate loss of 0.0 from being treated as
        # "no best value yet".
        if best_val_loss is None or val_loss < best_val_loss:
            os.makedirs("snapshot", exist_ok=True)
            best_dim = hidden_dim
            best_n_layers = n_layers
            print("Current Best : ", hidden_dim, best_n_layers)
            print("[%d, Epoch: %d, %d] val loss : %5.2f | val accuracy : %5.2f" % (1, e,hidden_dim, val_loss, val_accuracy))

            torch.save(model.state_dict(), './snapshot/txtclassification.pt')
            best_val_loss = val_loss

100 5
훈련 데이터의 미니 배치의 개수 : 166
테스트 데이터의 미니 배치의 개수 : 44
검증 데이터의 미니 배치의 개수 : 19
Current Best :  100 5
[1, Epoch: 1, 100] val loss :  2.00 | val accuracy : 41.68
Current Best :  100 5
[1, Epoch: 2, 100] val loss :  1.77 | val accuracy : 41.68
Current Best :  100 5
[1, Epoch: 3, 100] val loss :  1.71 | val accuracy : 41.68
Current Best :  100 5
[1, Epoch: 4, 100] val loss :  1.70 | val accuracy : 41.68
Current Best :  100 5
[1, Epoch: 5, 100] val loss :  1.70 | val accuracy : 41.68
Current Best :  100 5
[1, Epoch: 6, 100] val loss :  1.70 | val accuracy : 41.68
Current Best :  100 5
[1, Epoch: 7, 100] val loss :  1.69 | val accuracy : 41.68
Current Best :  100 5
[1, Epoch: 8, 100] val loss :  1.69 | val accuracy : 41.68
Current Best :  100 5
[1, Epoch: 9, 100] val loss :  1.68 | val accuracy : 41.68
Current Best :  100 5
[1, Epoch: 10, 100] val loss :  1.68 | val accuracy : 41.68
Current Best :  100 5
[1, Epoch: 11, 100] val loss :  1.68 | val accuracy : 41.85
Current Best :  100 5
[1, Epoc

KeyboardInterrupt: ignored

In [None]:
# Rebuild the best architecture and restore its weights from the snapshot.
model = GRU(best_n_layers, best_dim, vocab_size, 300, n_classes, 0.4).to(DEVICE)
# map_location lets a snapshot saved on a GPU runtime load on a CPU-only one.
model.load_state_dict(torch.load('./snapshot/txtclassification.pt', map_location=DEVICE))
test_loss, test_acc = evaluate(model, test_iter)
print('테스트 오차: %5.2f | 테스트 정확도: %5.2f' % (test_loss, test_acc))

테스트 오차:  1.36 | 테스트 정확도: 52.46


In [None]:
def reform2(filename):
  """Build context-augmented utterances from a CSV of dialogue turns.

  After the frame's index is reset and the resulting 'level_0' column is
  dropped, the columns are read by position: column 0 is the dialogue id,
  column 2 the speaker and column 3 the utterance. Each output row is
  "<speaker> <utterance>", prefixed by the previous turn of the same dialogue
  when there is one.

  Args:
    filename: path of the CSV file to read (UTF-8).

  Returns:
    pandas.DataFrame with the single column 'speaker_utterance', one row per
    input row. The first rows are also printed.

  Raises:
    KeyError: if resetting the index does not produce a 'level_0' column.
  """
  dataset_ = pd.read_csv(filename, engine="python", encoding="utf-8")
  dataset_ = dataset_.reset_index().drop(columns="level_0")
  arr = dataset_.to_numpy()

  header = ['speaker_utterance']
  reform_list = list()
  current_i = -1
  bf_talk = ""  # previous turn of the current dialogue, with a trailing space
  for d_list in arr:
    # Compare by value: identity (`is not`) only works by accident for small
    # cached integers and would reset the context on every row otherwise.
    if d_list[0] != current_i:
      current_i = d_list[0]
      bf_talk = ""

    talk = d_list[2] + " " + d_list[3]
    reform_list.append([bf_talk + talk])
    bf_talk = talk + " "

  df = pd.DataFrame(reform_list, columns=header)
  print(df.head())
  return df

In [None]:
# Write the context-augmented submission utterances back to Drive as a CSV without an index column.
reform2("/dataset/My Drive/en_data.csv").to_csv("/dataset/My Drive/en_data_utf8.csv", index=None)

                                   speaker_utterance
0               Phoebe Alright, whadyou do with him?
1  Phoebe Alright, whadyou do with him? Monica Oh...
2  Monica Oh! You're awake! Joey Then you gotta c...
3  Joey Then you gotta come clean with Ma! This i...
4  Mr. Tribbiani Yeah, but this is Joey I don't w...


In [None]:
def predict(model, val_iter):
    """Predict an emotion label for every example in ``val_iter``.

    Args:
        model: classifier mapping a (1, seq_len) tensor of token ids to logits.
        val_iter: iterable of examples exposing ``speaker_utterance`` as a
            list of tokens.

    Returns:
        A list of ``[position, label]`` pairs, where ``position`` is the
        0-based order of the example and ``label`` is looked up in
        ``emotion.vocab.itos``.
    """
    model.eval()
    print("predict")
    predict_list = list()
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for count, example in enumerate(val_iter):
            # Let the Field pad and numericalise the tokens so the input gets
            # the same <sos>/<eos> wrapping the model saw in training batches;
            # hand-indexing the tokens skipped those markers and failed on an
            # empty utterance.
            tensor = utterance.process([example.speaker_utterance]).to(DEVICE)
            prediction = model(tensor)
            label_index = prediction.max(1)[1].item()
            predict_list.append([count, emotion.vocab.itos[label_index]])
    return predict_list



In [None]:
# Load the submission CSV; its single column is tokenised with the `utterance` Field and exposed as `speaker_utterance` on each example.
submit_data = TabularDataset(path="/dataset/My Drive/en_data_utf8.csv", format='csv', fields=[('speaker_utterance', utterance)], skip_header=True)

In [None]:
# Predict a label for every submission row and write the submission file.
predictions = predict(model, submit_data)
# Show only a small sample; printing every prediction floods the cell output.
print(predictions[:10])
pd.DataFrame(predictions, columns=["id", "Predicted"]).to_csv("sample.csv", index=None)

predict
[[0, 'neutral'], [1, 'neutral'], [2, 'non-neutral'], [3, 'non-neutral'], [4, 'neutral'], [5, 'neutral'], [6, 'neutral'], [7, 'neutral'], [8, 'neutral'], [9, 'neutral'], [10, 'neutral'], [11, 'neutral'], [12, 'neutral'], [13, 'neutral'], [14, 'non-neutral'], [15, 'non-neutral'], [16, 'non-neutral'], [17, 'non-neutral'], [18, 'non-neutral'], [19, 'neutral'], [20, 'neutral'], [21, 'neutral'], [22, 'non-neutral'], [23, 'non-neutral'], [24, 'non-neutral'], [25, 'non-neutral'], [26, 'non-neutral'], [27, 'anger'], [28, 'anger'], [29, 'anger'], [30, 'non-neutral'], [31, 'neutral'], [32, 'neutral'], [33, 'neutral'], [34, 'non-neutral'], [35, 'non-neutral'], [36, 'neutral'], [37, 'neutral'], [38, 'non-neutral'], [39, 'neutral'], [40, 'neutral'], [41, 'neutral'], [42, 'non-neutral'], [43, 'non-neutral'], [44, 'non-neutral'], [45, 'non-neutral'], [46, 'neutral'], [47, 'neutral'], [48, 'neutral'], [49, 'neutral'], [50, 'neutral'], [51, 'neutral'], [52, 'neutral'], [53, 'neutral'], [54, 'neu

In [None]:
# Trigger a browser download of the submission file (Colab only).
from google.colab import files
files.download('sample.csv') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>