<a href="https://colab.research.google.com/github/sehan25/ESAA-2022-2/blob/main/Project/OB-team2-project2-final-code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Data & library

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
train = pd.read_csv('/content/drive/MyDrive/ESAA/ESAA_OB_2조/Project2/data/train.csv')
test = pd.read_csv('/content/drive/MyDrive/ESAA/ESAA_OB_2조/Project2/data/test.csv')
submit = pd.read_csv('/content/drive/MyDrive/ESAA/ESAA_OB_2조/Project2/data/sample_submission.csv')

In [None]:
train.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,neutral
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,neutral
2,TRAIN_0002,That I did. That I did.,Chandler,0,neutral
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,neutral
4,TRAIN_0004,My duties? All right.,Chandler,0,surprise


In [None]:
test.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID
0,TEST_0000,Why do all the coffee cups have figures below?,Mark,0
1,TEST_0001,"Oh. It's so Monica can follow. Of this way, if...",Rachell,0
2,TEST_0002,You know what?,Rachell,0
3,TEST_0003,"Come on, Lydia, you can do it.",Joeyy,1
4,TEST_0004,To push!,Joeyy,1


## EDA & Data Preprocessing

### Target

In [None]:
# label encoding
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train.loc[:,'Target'] = le.fit_transform(train.loc[:,'Target'])
train.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,4
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,4
2,TRAIN_0002,That I did. That I did.,Chandler,0,4
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,4
4,TRAIN_0004,My duties? All right.,Chandler,0,6


### Dialogue_ID

In [None]:
train['Dialogue_ID'].unique()

array([   0,    1,    2, ..., 1036, 1037, 1038])

#### Train, Test 데이터 합치기

In [None]:
# test의 Dialogue ID 1039부터 부여
test['Dialogue_ID'] = test['Dialogue_ID'] + 1039

In [None]:
# 데이터 전처리 위해 Train,Test 데이터 합치기
ntrain = train.shape[0]
ntest = test.shape[0]

data_all = pd.concat([train, test])
data_all.reset_index(drop=True, inplace=True)
data_all.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,4.0
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,4.0
2,TRAIN_0002,That I did. That I did.,Chandler,0,4.0
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,4.0
4,TRAIN_0004,My duties? All right.,Chandler,0,6.0


### Speaker

In [None]:
# Speaker 별 특정 감정이 표출된 확률과 횟수로 이루어진 데이터프레임 생성
prob = pd.DataFrame((train.groupby('Speaker')['Target'].value_counts() / train.groupby("Speaker")['Target'].count()))
prob['count'] = train.groupby('Speaker')['Target'].value_counts()
prob.rename(columns={'Target':'Prob'}, inplace=True)
prob.sort_values(by='Prob',ascending=False, inplace=True)
prob.reset_index(inplace=True)

In [None]:
prob.head()

Unnamed: 0,Speaker,Target,Prob,count
0,1st Customer,3,1.0,1
1,Lecturer,4,1.0,1
2,Jade,4,1.0,4
3,Janitor,6,1.0,1
4,Jason,4,1.0,2


In [None]:
prob = prob[prob['Prob']==1]
prob

Unnamed: 0,Speaker,Target,Prob,count
0,1st Customer,3,1.0,1
1,Lecturer,4,1.0,1
2,Jade,4,1.0,4
3,Janitor,6,1.0,1
4,Jason,4,1.0,2
...,...,...,...,...
102,Customer,4,1.0,3
103,Annabelle,4,1.0,3
104,Commercial,4,1.0,1
105,Aunt Lillian,5,1.0,1


In [None]:
prob[prob['count'] > 3][prob['Target']=='neutral'].Speaker.unique()

array([], dtype=object)

In [None]:
train[train['Speaker'].map(lambda x: x[:3]=='Dr.')]['Target'].value_counts()

4    37
0    16
3     6
1     1
2     1
5     1
Name: Target, dtype: int64

In [None]:
train[train['Speaker'].map(lambda x: x[:7]=='Fireman')]['Target'].value_counts()

4    8
6    2
Name: Target, dtype: int64

#### 'Job' column 생성

In [None]:
# 분포가 일정하지 않은 직업과 이름을 제외하여 job_list 생성
job_list = ['The Interviewer','The Museum Official', 'Tour Guide', 'Trudie Styler', 'Stage Director', 'Nurse', 'Policeman']

In [None]:
data_all['Job'] = 0
for i in range(len(data_all)):
  if data_all.loc[i,'Speaker'] in job_list:
    data_all.loc[i,'Job'] = 1 

In [None]:
data_all.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target,Job
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,4.0,0
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,4.0,1
2,TRAIN_0002,That I did. That I did.,Chandler,0,4.0,0
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,4.0,1
4,TRAIN_0004,My duties? All right.,Chandler,0,6.0,0


### Utterance

#### '?','!' column 생성

In [None]:
# ?, ! 존재 여부 열 생성
data_all['?'] = data_all['Utterance'].map(lambda x: int('?' in x))
data_all['!'] = data_all['Utterance'].map(lambda x: int('!' in x))

In [None]:
data_all.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target,Job,?,!
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,4.0,0,0,0
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,4.0,1,0,0
2,TRAIN_0002,That I did. That I did.,Chandler,0,4.0,0,0,0
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,4.0,1,0,0
4,TRAIN_0004,My duties? All right.,Chandler,0,6.0,0,1,0


In [None]:
df_text = data_all[['Utterance']]

#### text cleaning

In [None]:
# 약어 정규화
contractions = {"'cause": 'because', "I'd": 'I would', "I'd've": 'I would have', "I'll": 'I will', "I'll've": 'I will have', "I'm": 'I am', "I've": 'I have',
                "ain't": 'is not', "aren't": 'are not', "can't": 'cannot', "could've": 'could have', "couldn't": 'could not',
                "didn't": 'did not', "doesn't": 'does not', "don't": 'do not', "hadn't": 'had not', "hasn't": 'has not', "haven't": 'have not',
                "he'd": 'he would', "he'll": 'he will', "he's": 'he is', "here's": 'here is',
                "how'd": 'how did', "how'd'y": 'how do you', "how'll": 'how will', "how's": 'how is', 
                "i'd": 'i would', "i'd've": 'i would have', "i'll": 'i will', "i'll've": 'i will have', "i'm": 'i am', "i've": 'i have',
                "isn't": 'is not', "it'd": 'it would', "it'd've": 'it would have', "it'll": 'it will', "it'll've": 'it will have', "it's": 'it is',
                "let's": 'let us', "ma'am": 'madam', "mayn't": 'may not', "might've": 'might have', "mightn't": 'might not', "mightn't've": 'might not have',
                "must've": 'must have', "mustn't": 'must not', "mustn't've": 'must not have', "needn't": 'need not', "needn't've": 'need not have',
                "o'clock": 'of the clock', "oughtn't": 'ought not', "oughtn't've": 'ought not have', "sha'n't": 'shall not', "shan't": 'shall not', "shan't've": 'shall not have',
                "she'd": 'she would', "she'd've": 'she would have', "she'll": 'she will', "she'll've": 'she will have', "she's": 'she is',
                "should've": 'should have', "shouldn't": 'should not', "shouldn't've": 'should not have', "so's": 'so as', "so've": 'so have',
                "that'd": 'that would', "that'd've": 'that would have', "that's": 'that is', "there'd": 'there would', "there'd've": 'there would have', "there's": 'there is',
                "they'd": 'they would', "they'd've": 'they would have', "they'll": 'they will', "they'll've": 'they will have', "they're": 'they are', "they've": 'they have',
                "this's": 'this is', "to've": 'to have', "wasn't": 'was not',
                "we'd": 'we would', "we'd've": 'we would have', "we'll": 'we will', "we'll've": 'we will have', "we're": 'we are', "we've": 'we have', "weren't": 'were not',
                "what'll": 'what will', "what'll've": 'what will have', "what're": 'what are', "what's": 'what is', "what've": 'what have', "when's": 'when is', "when've": 'when have',
                "where'd": 'where did', "where's": 'where is', "where've": 'where have', "who'll": 'who will', "who'll've": 'who will have', "who's": 'who is', "who've": 'who have',
                "why's": 'why is', "why've": 'why have', "will've": 'will have', "won't": 'will not', "won't've": 'will not have',
                "would've": 'would have', "wouldn't": 'would not', "wouldn't've": 'would not have',
                "y'all": 'you all', "y'all'd": 'you all would', "y'all'd've": 'you all would have', "y'all're": 'you all are', "y'all've": 'you all have',
                "you'd": 'you would', "you'd've": 'you would have', "you'll": 'you will', "you'll've": 'you will have', "you're": 'you are', "you've": 'you have',

                "'em":'them', "y'know":'you know', "'bout":'about', "thinkin'":'thinking', "name's":'name is', "y'ever":'you ever', "d'know":'do not know', "y'haven't":'you have not',
                "dad's":'dad is', "'em's":'them is', "jason's":'jason is', "everybody's":'everybody is', "let's":'let us', "brother's":'brother is', "fixin's":'fixins', "havin'":'having',
                "c'mon":'come on', "stain's":'stain is', "santa's":'santa is', "rachel's":'rachel is', "joey's":'joey is', "guy's":'guy is', "diego's":'diego is', "goofin'":'goofing',
                "anyone's":'anyone is', "goin'":'going', "doin'":'doing', "robbie's":'robbie is', "monica's":'monica is', "kiddin'":'kidding', "sayin'":'saying', "c'mere":'come here',
                "phone's":'phone is', "sidney's":'sidney is', "phoebe's":'phoebe is', "belt's":'belt is', "so's":'so is', "d'y'see":'do you see', "nobody's":'nobody is',
                "people'll":'people will', "'cha":'have you', "mcdowell's":'mcdowell is', "makin'":'making', "waitin'":'waiting', "yellin'":'yelling', "comin'":'coming',
                "'scuse":'excuse',"everything's":'everything is', "y'see":'you see', "ronni's":'ronni is', "d'you":'do you', "tryin'":'trying', "nothin'":'nothing',
                "translation's":'translation is',"fixin's":'fixins', "somethin'":'something', "thinkin'":'thinking', "roger's":'roger is', "outta":'out of', "wanna":'want to',
                "gonna":'have got to', "kinda":'kind of', "sorta":'sort of', "dunno":'do not know' }

In [None]:
# 불용어
STOPWORDS = {'i','me','my','myself','we','our','ours','ourselves','you','your','yours','yourself','yourselves','he','him','his','himself','she','her','hers','herself',
             'it',"it's",'its','itself','they','them','their','theirs','themselves','this','that',"that'll",'these','those','am','is','are','was','were','be','been','being',
             'have','has','had','having','do','does','did','doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with','about',
             'against','between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under','again','further',
             'then','once','here','there','all','any','both','each','few','more','other','some','such','nor','not','own','same','so','than','s','t',
             'can','will','now','d','ll','m','o','re','ve','y','ain','could','ma','might','shan',"shan't",'would'}

In [None]:
# 철자 확인
!pip install autocorrect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting autocorrect
  Downloading autocorrect-2.6.1.tar.gz (622 kB)
[?25l[K     |▌                               | 10 kB 33.0 MB/s eta 0:00:01[K     |█                               | 20 kB 42.0 MB/s eta 0:00:01[K     |█▋                              | 30 kB 51.5 MB/s eta 0:00:01[K     |██                              | 40 kB 28.3 MB/s eta 0:00:01[K     |██▋                             | 51 kB 32.2 MB/s eta 0:00:01[K     |███▏                            | 61 kB 36.5 MB/s eta 0:00:01[K     |███▊                            | 71 kB 26.7 MB/s eta 0:00:01[K     |████▏                           | 81 kB 28.4 MB/s eta 0:00:01[K     |████▊                           | 92 kB 30.7 MB/s eta 0:00:01[K     |█████▎                          | 102 kB 29.8 MB/s eta 0:00:01[K     |█████▉                          | 112 kB 29.8 MB/s eta 0:00:01[K     |██████▎                        

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
import re
from autocorrect import Speller # 오타 수정
from nltk.corpus import wordnet, stopwords # 원형 복원, 불용어
from nltk.stem import WordNetLemmatizer # 원형 복원

spell = Speller()

def text_cleaning(text):

    # 소문자 변환
    no_capitals = str(text).lower()

    # 약어 정규화
    no_specials = no_capitals.replace("’", "'").replace("‘", "'")
    abbre = ' '.join([contractions[t] if t in contractions else t for t in no_specials.split(" ")])
    
    # 소유격 제거. Ex) roland's -> roland
    abbre = re.sub(r"'s\b","",abbre)

    # 영어 외 문자(숫자, 특수문자 등) 공백으로 변환
    only_english = re.sub("[^a-zA-Z]", " ", abbre) 
    
    # 철자 수정
    #spell = Speller()
    no_typo = spell(only_english)
    no_typo = re.sub('[m]{2,}', 'mm', no_typo) # m이 3개 이상이면 2개로 변경. Ex) ummmmmmm yeah -> umm yeah

    # 원형 복원
    lemmatizer = nltk.stem.WordNetLemmatizer()
    wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
    lemmatizer_words = " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in nltk.pos_tag(no_typo.split())])

    # 불용어 제거
    # no_stops = " ".join([word for word in str(lemmatizer_words).split() if word not in STOPWORDS]) # 229개 제거됨
    # no_stops = " ".join([word for word in str(lemmatizer_words).split() if word not in set(stopwords.words('english'))]) # 602개 제거됨

    # return no_stops
    return lemmatizer_words

In [None]:
data_all['Utterance'] = data_all['Utterance'].map(lambda x: text_cleaning(x))
data_all.head(100)

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target,Job,?,!
0,TRAIN_0000,also i be the point person on my company trans...,Chandler,0,4.0,0,0,0
1,TRAIN_0001,you must have have your hand full,The Interviewer,0,4.0,1,0,0
2,TRAIN_0002,that i do that i do,Chandler,0,4.0,0,0,0
3,TRAIN_0003,so let u talk a little bit about your duty,The Interviewer,0,4.0,1,0,0
4,TRAIN_0004,my duty all right,Chandler,0,6.0,0,1,0
...,...,...,...,...,...,...,...,...
95,TRAIN_0095,oh and in case you be wonder those be my leg o...,Jade,9,4.0,0,0,0
96,TRAIN_0096,can you hold on a moment i have another call i...,Chandler,9,3.0,0,1,0
97,TRAIN_0097,i know,Ross,9,4.0,0,0,0
98,TRAIN_0098,i be back,Chandler,9,4.0,0,0,0


## Modeling

In [None]:
# train, val, test 데이터 세트 생성
X_train = data_all[:data_all[data_all['Dialogue_ID']==831].index.max()+1].drop(['Target'],axis=1) # 831번 Dialogue까지
y_train = data_all[:data_all[data_all['Dialogue_ID']==831].index.max()+1]['Target'].astype('int')

X_val = data_all[data_all[data_all['Dialogue_ID']==831].index.max()+1:
                   data_all[data_all['Dialogue_ID']==1038].index.max()+1].drop(['Target'], axis=1) # 832~1038번 Dialogue까지
y_val = data_all[data_all[data_all['Dialogue_ID']==831].index.max()+1:
                   data_all[data_all['Dialogue_ID']==1038].index.max()+1]['Target'].astype('int')

X_test = data_all[data_all[data_all['Dialogue_ID']==1038].index.max()+1:].drop(['Target'],axis=1)
y_test = data_all[data_all[data_all['Dialogue_ID']==1038].index.max()+1:]['Target']

### LogisticRegression

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# CountVectorizer 객체를 cnt_vect로, LogisticRegression 객체를 lr_clf로 생성하는 Pipeline 생성
pipeline = Pipeline([
    ('cnt_vect', CountVectorizer(ngram_range=(1,2))),
    ('lr_clf', LogisticRegression())])

pipeline.fit(X_train['Utterance'], y_train)
pred = pipeline.predict(X_val['Utterance'])
print('예측 정확도는 {0:.4f}'.format(accuracy_score(y_val,pred)))

예측 정확도는 0.5096


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer 객체를 tfidf_vect로, LogisticRegression 객체를 lr_clf로 생성하는 Pipeline 생성
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(ngram_range=(1,2))),
    ('lr_clf', LogisticRegression())])

pipeline.fit(X_train['Utterance'], y_train)
pred = pipeline.predict(X_val['Utterance'])
print('예측 정확도는 {0:.4f}'.format(accuracy_score(y_val,pred)))

예측 정확도는 0.5101


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(ngram_range=(1,2))),
    ('lr_clf', LogisticRegression())])

params = {'tfidf_vect__ngram_range': [(1,1), (1,2), (1,3)],
          'tfidf_vect__max_df': [100, 300, 700],
          'tfidf_vect__max_features': [300, 500, 1000, None],
          'lr_clf__C': [1, 5, 10]
}

grid_cv_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=0)
grid_cv_pipe.fit(X_train['Utterance'], y_train)
print(grid_cv_pipe.best_params_, grid_cv_pipe.best_score_)
pred = grid_cv_pipe.predict(X_val['Utterance'])
print('예측 정확도는 {0:.4f}'.format(accuracy_score(y_val,pred)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

{'lr_clf__C': 1, 'tfidf_vect__max_df': 700, 'tfidf_vect__max_features': 500, 'tfidf_vect__ngram_range': (1, 1)} 0.520616224944548
예측 정확도는 0.5082


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### SVM

In [None]:
from sklearn.svm import SVC

# TfidfVectorizer 객체를 tfidf_vect로, SVC 객체를 svm으로 생성하는 Pipeline 생성
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(ngram_range=(1,2))),
    ('svm', SVC())])

pipeline.fit(X_train['Utterance'], y_train)
pred = pipeline.predict(X_val['Utterance'])
print('예측 정확도는 {0:.4f}'.format(accuracy_score(y_val,pred)))

예측 정확도는 0.5087


### BERT

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 20.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 38.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 66.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
# import
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import random
import os

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from transformers import BertModel
from torch.optim import Adam

import matplotlib as mpl
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
CFG = {
    'EPOCHS': 3,
    'LEARNING_RATE':1e-5,
    'BATCH_SIZE':8,
    'SEED':41
}

In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

In [None]:
traindata = pd.concat([X_train,y_train],axis=1).reset_index(drop=True)
traindata.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Job,?,!,Target
0,TRAIN_0000,also i be the point person on my company trans...,Chandler,0,0,0,0,4
1,TRAIN_0001,you must have have your hand full,The Interviewer,0,1,0,0,4
2,TRAIN_0002,that i do that i do,Chandler,0,0,0,0,4
3,TRAIN_0003,so let u talk a little bit about your duty,The Interviewer,0,1,0,0,4
4,TRAIN_0004,my duty all right,Chandler,0,0,1,0,6


In [None]:
validdata = pd.concat([X_val,y_val],axis=1).reset_index(drop=True)
validdata.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Job,?,!,Target
0,TRAIN_7858,hey,Ross,832,0,0,1,0
1,TRAIN_7859,i be sorry i be late do i miss anything,Ross,832,0,1,0,4
2,TRAIN_7860,joey stuff,Phoebe,832,0,0,0,4
3,TRAIN_7861,your personal best ross take an,Ross,832,0,1,1,6
4,TRAIN_7862,where be you,Phoebe,832,0,1,0,4


In [None]:
tokenizers = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
class CustomDataset(Dataset):
  
    def __init__(self, data, mode = "train"):
        self.dataset = data
        self.tokenizer = tokenizers
        self.mode = mode
    def __len__(self):
        return len(self.dataset)
  
    def __getitem__(self, idx):
        text = self.dataset['Utterance'][idx]
        inputs = self.tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
        input_ids = inputs['input_ids'][0]
        token_type_ids = inputs['token_type_ids'][0]
        attention_mask = inputs['attention_mask'][0]
    
        if self.mode == "train":
            y = self.dataset['Target'][idx]
            return input_ids, token_type_ids, attention_mask, y
        else:
            return input_ids, token_type_ids, attention_mask

In [None]:
train = CustomDataset(traindata, mode = "train")
valid = CustomDataset(validdata, mode = "train")

train_dataloader = torch.utils.data.DataLoader(train, batch_size= CFG['BATCH_SIZE'], shuffle=True)
val_dataloader = torch.utils.data.DataLoader(valid, batch_size= CFG['BATCH_SIZE'], shuffle=False)

In [None]:
class BaseModel(nn.Module):

    def __init__(self, dropout=0.5, num_classes=len(le.classes_)):

        super(BaseModel, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [None]:
def train(model, optimizer, train_loader, test_loader, device):

    model.to(device)

    criterion = nn.CrossEntropyLoss().to(device)

    best_score = 0
    best_model = "None"
    for epoch_num in range(CFG["EPOCHS"]):

        model.train()
        train_loss = []
        for input_ids, token_type_ids, attention_mask, train_label in tqdm(train_loader):
            optimizer.zero_grad()

            train_label = train_label.to(device)
            input_id = input_ids.to(device)
            mask = attention_mask.to(device)

            output = model(input_id, mask)     
    
            batch_loss = criterion(output, train_label.long()) 
            train_loss.append(batch_loss.item())
            
            batch_loss.backward()
            optimizer.step()

        val_loss, val_score = validation(model, criterion, test_loader, device)
        print(f'Epoch [{epoch_num}], Train Loss : [{np.mean(train_loss) :.5f}] Val Loss : [{np.mean(val_loss) :.5f}] Val F1 Score : [{val_score:.5f}]')

        if best_score < val_score:
            best_model = model
            best_score = val_score
        
    return best_model      

In [None]:
def competition_metric(true, pred):
    return f1_score(true, pred, average="macro")

def validation(model, criterion, test_loader, device):
    model.eval()

    val_loss = []
    model_preds = []
    true_labels = []  
    with torch.no_grad():
        for input_ids, token_type_ids, attention_mask, valid_label in tqdm(test_loader):
            valid_label = valid_label.to(device)
            input_id = input_ids.to(device)
            mask = attention_mask.to(device)

            output = model(input_id, mask)
    
            batch_loss = criterion(output, valid_label.long()) 
            val_loss.append(batch_loss.item())      
            
            model_preds += output.argmax(1).detach().cpu().numpy().tolist()
            true_labels += valid_label.detach().cpu().numpy().tolist()
        val_f1 = competition_metric(true_labels, model_preds)
    return val_loss, val_f1    

In [None]:
model = BaseModel()
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])

infer_model = train(model, optimizer, train_dataloader, val_dataloader, device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/983 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

Epoch [0], Train Loss : [1.47550] Val Loss : [1.33694] Val F1 Score : [0.27211]


  0%|          | 0/983 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

Epoch [1], Train Loss : [1.25035] Val Loss : [1.31764] Val F1 Score : [0.29884]


  0%|          | 0/983 [00:00<?, ?it/s]

  0%|          | 0/267 [00:00<?, ?it/s]

Epoch [2], Train Loss : [1.08876] Val Loss : [1.35607] Val F1 Score : [0.30010]


In [None]:
testdata = X_test.reset_index(drop=True)

In [None]:
testdata

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Job,?,!
0,TEST_0000,why do all the coffee cup have figure below,Mark,1039,0,1,0
1,TEST_0001,oh it be so monica can follow of this way if o...,Rachell,1039,0,1,0
2,TEST_0002,you know what,Rachell,1039,0,1,0
3,TEST_0003,come on medium you can do it,Joeyy,1040,0,0,0
4,TEST_0004,to push,Joeyy,1040,0,0,1
...,...,...,...,...,...,...,...
2605,TEST_2605,yeah i mean go ross no one will even notice it...,Rachell,1318,0,0,1
2606,TEST_2606,they do not listen to me,Rossi,1318,0,1,0
2607,TEST_2607,of course they listen to you everyone listen t...,Rachell,1318,0,0,1
2608,TEST_2608,monica do you really think i should try this t...,Rossi,1318,0,1,0


In [None]:
test = CustomDataset(testdata, mode = "test")
test_dataloader = torch.utils.data.DataLoader(test, batch_size= CFG['BATCH_SIZE'], shuffle=False)

In [None]:
def inference(model, test_loader, device):
    model.to(device)
    model.eval()
    
    test_predict = []
    for input_ids, token_type_ids, attention_mask in tqdm(test_loader):
        input_id = input_ids.to(device)
        mask = attention_mask.to(device)
        y_pred = model(input_id, mask)
        test_predict += y_pred.argmax(1).detach().cpu().numpy().tolist()
    print('Done.')
    return test_predict

In [None]:
preds = inference(infer_model, test_dataloader, device)

  0%|          | 0/327 [00:00<?, ?it/s]

Done.


In [None]:
preds = le.inverse_transform(preds) 

In [None]:
submit['Target'] = preds
submit['Target'].value_counts()

neutral     1651
joy          327
surprise     279
anger        277
sadness       76
Name: Target, dtype: int64

In [None]:
submit.to_csv('불용어X.csv', index=False)

In [None]:
BERT_preds = preds.copy()

### Ensemble (Hard Voting)

#### BERT

In [None]:
preds['BERT'] = BERT_preds
preds

Unnamed: 0,ID,BERT
0,TEST_0000,neutral
1,TEST_0001,neutral
2,TEST_0002,neutral
3,TEST_0003,neutral
4,TEST_0004,neutral
...,...,...
2605,TEST_2605,neutral
2606,TEST_2606,neutral
2607,TEST_2607,joy
2608,TEST_2608,neutral


#### LGBM

In [None]:
# train, val, test 데이터 세트 생성
X_train = data_all[:data_all[data_all['Dialogue_ID']==1038].index.max()+1].drop(['Target'],axis=1) # 831번 Dialogue까지
y_train = data_all[:data_all[data_all['Dialogue_ID']==1038].index.max()+1]['Target']

X_test = data_all[data_all[data_all['Dialogue_ID']==1038].index.max()+1:].drop(['Target'],axis=1)
y_test = data_all[data_all[data_all['Dialogue_ID']==1038].index.max()+1:]['Target']

In [None]:
# 텍스트 벡터화
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=300)
tfidf_vect.fit(X_train['Utterance'])
X_train_tfidt_vect = tfidf_vect.transform(X_train['Utterance'])
X_test_tfidt_vect = tfidf_vect.transform(X_test['Utterance'])

In [None]:
from sklearn.preprocessing import LabelBinarizer

# brand_name, item_condition_id, shipping 각 피처들을 희소 행렬 원-핫 인코딩 변환
lb_speaker = LabelBinarizer(sparse_output=True)
lb_speaker.fit(X_train['Speaker'])
X_speaker = lb_speaker.transform(X_train['Speaker'])
Xt_speaker = lb_speaker.transform(X_test['Speaker'])

lb_dial = LabelBinarizer(sparse_output=True)
lb_dial.fit(X_train['Dialogue_ID'])
X_dial = lb_dial.transform(X_train['Dialogue_ID'])
Xt_dial = lb_dial.transform(X_test['Dialogue_ID'])

lb_job = LabelBinarizer(sparse_output=True)
lb_job.fit(X_train['Job'])
X_job = lb_job.fit_transform(X_train['Job'])
Xt_job = lb_job.fit_transform(X_test['Job'])

lb_qmark = LabelBinarizer(sparse_output=True)
lb_qmark.fit(X_train['?'])
X_qmark = lb_qmark.transform(X_train['?'])
Xt_qmark = lb_qmark.transform(X_test['?'])

lb_emark = LabelBinarizer(sparse_output=True)
lb_emark.fit(X_train['!'])
X_emark = lb_emark.transform(X_train['!'])
Xt_emark = lb_emark.transform(X_test['!'])

In [None]:
# 하나의 scr 희소행렬 데이터프레임으로 변환
import gc
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split, cross_val_score

sparse_matrix_list_train = (X_speaker, X_dial, X_job, X_qmark, X_emark, X_train_tfidt_vect)
sparse_matrix_list_test = (Xt_speaker, Xt_dial, Xt_job, Xt_qmark, Xt_emark, X_test_tfidt_vect)

train_matrix_csr = hstack(sparse_matrix_list_train).tocsr()
test_matrix_csr = hstack(sparse_matrix_list_test).tocsr()

In [None]:
# 레이블 원핫인코딩 데이터프레임
y_train_encoded = pd.DataFrame(y_train)
for i in range(6):
  y_train_new = y_train.apply(lambda x : 1 if x==i else 0)
  y_train_encoded[i] = y_train_new
print(y_train_encoded.head()) 
y_train_encoded.drop('Target',axis=1, inplace=True)

  Target  0  1  2  3  4  5
0      4  0  0  0  0  1  0
1      4  0  0  0  0  1  0
2      4  0  0  0  0  1  0
3      4  0  0  0  0  1  0
4      6  0  0  0  0  0  0


In [None]:
# 모델 학습 및 예측
from lightgbm import LGBMRegressor
lgbm_model = LGBMRegressor(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=156)
lgbm_preds = pd.DataFrame(np.zeros((X_test.shape[0],6)))
for i in range(6):
  lgbm_model.fit(train_matrix_csr, y_train_encoded[i])
  pred = lgbm_model.predict(test_matrix_csr)
  lgbm_preds[i] = pred

gc.collect()
lgbm_preds

Unnamed: 0,0,1,2,3,4,5
0,0.066768,0.019694,0.111896,0.050245,0.597185,0.026922
1,0.080552,0.008541,0.016512,0.072773,0.477810,0.262403
2,0.050377,0.014160,0.016659,0.050245,0.570647,0.032223
3,0.051352,0.006484,0.025538,0.089947,0.681059,0.045253
4,0.218097,0.021068,0.016659,0.334103,0.135974,0.035838
...,...,...,...,...,...,...
2605,0.321678,0.037907,0.024011,0.136459,0.276107,0.085160
2606,0.050377,0.007568,0.016069,0.019941,0.624590,0.046473
2607,0.189195,0.014476,0.016069,0.303799,0.189916,0.060009
2608,0.097340,0.021259,0.008518,0.061996,0.633380,0.203790


In [None]:
lgbm_preds['label'] = lgbm_preds.idxmax(axis=1)
lgbm_preds['label'].value_counts()

4    1760
3     609
0     193
5      42
2       5
1       1
Name: label, dtype: int64

In [None]:
lgbm_preds

Unnamed: 0,0,1,2,3,4,5,label
0,0.066768,0.019694,0.111896,0.050245,0.597185,0.026922,4
1,0.080552,0.008541,0.016512,0.072773,0.477810,0.262403,4
2,0.050377,0.014160,0.016659,0.050245,0.570647,0.032223,4
3,0.051352,0.006484,0.025538,0.089947,0.681059,0.045253,4
4,0.218097,0.021068,0.016659,0.334103,0.135974,0.035838,3
...,...,...,...,...,...,...,...
2605,0.321678,0.037907,0.024011,0.136459,0.276107,0.085160,0
2606,0.050377,0.007568,0.016069,0.019941,0.624590,0.046473,4
2607,0.189195,0.014476,0.016069,0.303799,0.189916,0.060009,3
2608,0.097340,0.021259,0.008518,0.061996,0.633380,0.203790,4


In [None]:
preds['lgbm']=le.inverse_transform(lgbm_preds['label'])

In [None]:
preds

Unnamed: 0,ID,BERT,lgbm
0,TEST_0000,neutral,neutral
1,TEST_0001,neutral,neutral
2,TEST_0002,neutral,neutral
3,TEST_0003,neutral,neutral
4,TEST_0004,neutral,joy
...,...,...,...
2605,TEST_2605,neutral,anger
2606,TEST_2606,neutral,neutral
2607,TEST_2607,joy,joy
2608,TEST_2608,neutral,neutral


#### RF

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(random_state=156)
rf_preds = pd.DataFrame(np.zeros((X_test.shape[0],6)))
for i in range(6):
  rf_model.fit(train_matrix_csr, y_train_encoded[i])
  pred = rf_model.predict(test_matrix_csr)
  rf_preds[i] = pred

rf_preds

Unnamed: 0,0,1,2,3,4,5
0,0.00,0.0,0.00,0.04,0.608,0.00
1,0.00,0.0,0.01,0.00,0.390,0.14
2,0.00,0.0,0.00,0.00,0.980,0.00
3,0.00,0.0,0.00,0.00,1.000,0.00
4,0.00,0.0,0.00,0.00,0.000,0.01
...,...,...,...,...,...,...
2605,0.43,0.0,0.00,0.13,0.040,0.00
2606,0.03,0.0,0.00,0.02,0.730,0.00
2607,0.37,0.0,0.00,0.47,0.070,0.00
2608,0.03,0.0,0.00,0.37,0.070,0.02


In [None]:
rf_preds['label'] = rf_preds.idxmax(axis=1)
rf_preds['label'].value_counts()

4    1735
0     400
3     317
5      93
1      49
2      16
Name: label, dtype: int64

In [None]:
preds['RF']=le.inverse_transform(rf_preds['label'])

In [None]:
preds

Unnamed: 0,ID,BERT,lgbm,RF
0,TEST_0000,neutral,neutral,neutral
1,TEST_0001,neutral,neutral,neutral
2,TEST_0002,neutral,neutral,neutral
3,TEST_0003,neutral,neutral,neutral
4,TEST_0004,neutral,joy,sadness
...,...,...,...,...
2605,TEST_2605,neutral,anger,anger
2606,TEST_2606,neutral,neutral,neutral
2607,TEST_2607,joy,joy,joy
2608,TEST_2608,neutral,neutral,joy


#### Hard Voting

In [None]:
preds

Unnamed: 0,ID,BERT,lgbm,RF
0,TEST_0000,neutral,neutral,neutral
1,TEST_0001,neutral,neutral,neutral
2,TEST_0002,neutral,neutral,neutral
3,TEST_0003,neutral,neutral,neutral
4,TEST_0004,neutral,joy,sadness
...,...,...,...,...
2605,TEST_2605,neutral,anger,anger
2606,TEST_2606,neutral,neutral,neutral
2607,TEST_2607,joy,joy,joy
2608,TEST_2608,neutral,neutral,joy


In [None]:
preds['mode'] = preds.drop('ID',axis=1).mode(axis=1).iloc[:,0]
preds

Unnamed: 0,ID,BERT,lgbm,RF,mode
0,TEST_0000,neutral,neutral,neutral,neutral
1,TEST_0001,neutral,neutral,neutral,neutral
2,TEST_0002,neutral,neutral,neutral,neutral
3,TEST_0003,neutral,neutral,neutral,neutral
4,TEST_0004,neutral,joy,sadness,joy
...,...,...,...,...,...
2605,TEST_2605,neutral,anger,anger,anger
2606,TEST_2606,neutral,neutral,neutral,neutral
2607,TEST_2607,joy,joy,joy,joy
2608,TEST_2608,neutral,neutral,joy,neutral


In [None]:
preds['mode'].value_counts()

neutral    1745
anger       441
joy         340
sadness      42
disgust      34
fear          8
Name: mode, dtype: int64

In [None]:
preds['mode'].isnull().sum()

0

In [None]:
submit['Target'] = preds['mode']
submit

Unnamed: 0,ID,Target
0,TEST_0000,neutral
1,TEST_0001,neutral
2,TEST_0002,neutral
3,TEST_0003,neutral
4,TEST_0004,joy
...,...,...
2605,TEST_2605,anger
2606,TEST_2606,neutral
2607,TEST_2607,joy
2608,TEST_2608,neutral


- score 0.2810615266

In [None]:
# 기존 전처리 없이 진행한 BERT 결과와도 보팅
preds['BERT'] = BERT2['Target']
preds['mode'] = preds.drop('ID',axis=1).mode(axis=1).iloc[:,0]
preds

Unnamed: 0,ID,BERT,lgbm,RF,mode
0,TEST_0000,neutral,neutral,neutral,neutral
1,TEST_0001,neutral,neutral,neutral,neutral
2,TEST_0002,neutral,neutral,neutral,neutral
3,TEST_0003,neutral,neutral,neutral,neutral
4,TEST_0004,anger,joy,sadness,joy
...,...,...,...,...,...
2605,TEST_2605,anger,anger,anger,anger
2606,TEST_2606,surprise,neutral,neutral,neutral
2607,TEST_2607,joy,joy,joy,joy
2608,TEST_2608,neutral,neutral,joy,neutral


In [None]:
preds['mode'].isnull().sum()

0

In [None]:
submit['Target'] = preds['mode']
submit

Unnamed: 0,ID,Target
0,TEST_0000,neutral
1,TEST_0001,neutral
2,TEST_0002,neutral
3,TEST_0003,neutral
4,TEST_0004,joy
...,...,...
2605,TEST_2605,anger
2606,TEST_2606,neutral
2607,TEST_2607,joy
2608,TEST_2608,neutral


- score : 0.2841460898

### BERT (제출모델, 전처리 적용하지 않음)

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 7.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 59.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 42.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0


In [None]:
# import
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import random
import os

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from transformers import BertModel
from torch.optim import Adam

import matplotlib as mpl
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
CFG = {
    'EPOCHS': 3,
    'LEARNING_RATE':1e-5,
    'BATCH_SIZE':8,
    'SEED':41
}

In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

In [None]:
le = LabelEncoder()
le=le.fit(train_df['Target'])
train_df['Target']=le.transform(train_df['Target'])

In [None]:
valid=train_df[train_df['Dialogue_ID'].isin([i for i in range(1016,1039)])].reset_index(drop=True)
train=train_df[~train_df['Dialogue_ID'].isin([i for i in range(1016,1039)])].reset_index(drop=True)

train_len=len(train)
val_len=len(valid)

print(train_len)
print(val_len)

9725
264


In [None]:
tokenizers = BertTokenizer.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class CustomDataset(Dataset):
  
    def __init__(self, data, mode = "train"):
        self.dataset = data
        self.tokenizer = tokenizers
        self.mode = mode
    def __len__(self):
        return len(self.dataset)
  
    def __getitem__(self, idx):
        text = self.dataset['Utterance'][idx]
        inputs = self.tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
        input_ids = inputs['input_ids'][0]
        token_type_ids = inputs['token_type_ids'][0]
        attention_mask = inputs['attention_mask'][0]
    
        if self.mode == "train":
            y = self.dataset['Target'][idx]
            return input_ids, token_type_ids, attention_mask, y
        else:
            return input_ids, token_type_ids, attention_mask

In [None]:
train = CustomDataset(train, mode = "train")
valid = CustomDataset(valid, mode = "train")

train_dataloader = torch.utils.data.DataLoader(train, batch_size= CFG['BATCH_SIZE'], shuffle=True)
val_dataloader = torch.utils.data.DataLoader(valid, batch_size= CFG['BATCH_SIZE'], shuffle=False)

In [None]:
class BaseModel(nn.Module):

    def __init__(self, dropout=0.5, num_classes=len(le.classes_)):

        super(BaseModel, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [None]:
def train(model, optimizer, train_loader, test_loader, device):

    model.to(device)

    criterion = nn.CrossEntropyLoss().to(device)

    best_score = 0
    best_model = "None"
    for epoch_num in range(CFG["EPOCHS"]):

        model.train()
        train_loss = []
        for input_ids, token_type_ids, attention_mask, train_label in tqdm(train_loader):
            optimizer.zero_grad()

            train_label = train_label.to(device)
            input_id = input_ids.to(device)
            mask = attention_mask.to(device)

            output = model(input_id, mask)     
    
            batch_loss = criterion(output, train_label.long()) 
            train_loss.append(batch_loss.item())
            
            batch_loss.backward()
            optimizer.step()

        val_loss, val_score = validation(model, criterion, test_loader, device)
        print(f'Epoch [{epoch_num}], Train Loss : [{np.mean(train_loss) :.5f}] Val Loss : [{np.mean(val_loss) :.5f}] Val F1 Score : [{val_score:.5f}]')

        if best_score < val_score:
            best_model = model
            best_score = val_score
        
    return best_model          

In [None]:
def competition_metric(true, pred):
    return f1_score(true, pred, average="macro")

def validation(model, criterion, test_loader, device):
    model.eval()

    val_loss = []
    model_preds = []
    true_labels = []  
    with torch.no_grad():
        for input_ids, token_type_ids, attention_mask, valid_label in tqdm(test_loader):
            valid_label = valid_label.to(device)
            input_id = input_ids.to(device)
            mask = attention_mask.to(device)

            output = model(input_id, mask)
    
            batch_loss = criterion(output, valid_label.long()) 
            val_loss.append(batch_loss.item())      
            
            model_preds += output.argmax(1).detach().cpu().numpy().tolist()
            true_labels += valid_label.detach().cpu().numpy().tolist()
        val_f1 = competition_metric(true_labels, model_preds)
    return val_loss, val_f1    

In [None]:
model = BaseModel()
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])

infer_model = train(model, optimizer, train_dataloader, val_dataloader, device)

In [None]:
test = CustomDataset(test, mode = "test")
test_dataloader = torch.utils.data.DataLoader(test, batch_size= CFG['BATCH_SIZE'], shuffle=False)

In [None]:
def inference(model, test_loader, device):
    model.to(device)
    model.eval()
    
    test_predict = []
    for input_ids, token_type_ids, attention_mask in tqdm(test_loader):
        input_id = input_ids.to(device)
        mask = attention_mask.to(device)
        y_pred = model(input_id, mask)
        test_predict += y_pred.argmax(1).detach().cpu().numpy().tolist()
    print('Done.')
    return test_predict

In [None]:
preds = inference(infer_model, test_dataloader, device)

In [None]:
preds = le.inverse_transform(preds) 

In [None]:
submission['Target'] = preds
submission.head()

In [None]:
submission.to_csv('./submit.csv', index=False)