In [21]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from tqdm import tqdm

import numpy as np
import pandas as pd
import csv

# NBC수행

1. 코퍼스 가져오기

In [22]:
corpus = pd.read_csv('C:/Users/user/Desktop/globo/ngram_label.csv')
corpus = corpus.dropna()
corpus

Unnamed: 0,date,label,ngram
0,2018-06-01,-1,"áudi,convoc,caminhoneir,nov,ato,víd,líd,movime..."
1,2018-06-01,-1,"açõ,petrobr,caem,quas,15,estatal,perd,r,40,bi,..."
2,2018-06-01,-1,"após,oper,pf,ministéri,suspend,registr,sindic,..."
3,2018-06-01,-1,"após,ped,demissã,parent,pod,ser,nov,presidente..."
4,2018-06-01,-1,"petrobr,confirm,ivan,monteir,nov,president,int..."
...,...,...,...
8524,2019-12-31,0,"trump,anunc,assin,acord,comercial,parcial,chin..."
8525,2019-12-31,0,"govern,transfer,r,11,7,bilhõ,estad,municípi,;,..."
8526,2019-12-31,0,"trump,diz,acord,comercial,eua-chin,assin,15,ja..."
8527,2019-12-31,0,"toffol,suspend,resolu,reduz,valor,dpvat,2020pa..."


In [23]:
# 토크나이저 함수
def my_tokenizer(x):
    return x.split(",")

2. 데이터 파이프 라인

In [24]:
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,1), min_df = 15, tokenizer = my_tokenizer)),
                      ('clf', MultinomialNB(alpha=0.001))])
vect = text_clf.named_steps['vect']
clf = text_clf.named_steps['clf']

In [25]:
# 배깅 30회
accuracy = np.zeros(30)
posterior_list = []

for i in tqdm(range(30)):
    X_train, X_test, y_train, y_test = train_test_split(corpus['ngram'], corpus['label'], random_state = i, train_size = 0.9, shuffle=True)
    text_clf.fit(X_train, y_train)
    posterior_list.append(np.vstack([np.array(vect.get_feature_names()), np.exp(clf.feature_log_prob_)]))
    pred = text_clf.predict(X_test)
    accuracy[i] = np.sum(pred == y_test)

100%|██████████| 30/30 [14:31<00:00, 29.04s/it]


In [26]:
clf.classes_

array([-1,  0,  1], dtype=int64)

In [27]:
pd.DataFrame(posterior_list)

Unnamed: 0,0
0,"[[#, ', ';a, ';afirm, ';diss, ';diz, ';o, 'nã,..."
1,"[[#, ', ';a, ';afirm, ';diss, ';diz, ';o, 'nã,..."
2,"[[#, ', ';a, ';afirm, ';diss, ';diz, ';o, 'nã,..."
3,"[[#, ', ';a, ';afirm, ';diss, ';diz, ';o, 'nã,..."
4,"[[#, ', ';a, ';afirm, ';diss, ';diz, ';o, 'nã,..."
5,"[[#, ', ';a, ';afirm, ';diss, ';diz, ';o, 'nã,..."
6,"[[#, ', ';a, ';afirm, ';diss, ';diz, ';o, 'nã,..."
7,"[[#, ', ';a, ';afirm, ';diss, ';diz, ';o, 'nã,..."
8,"[[#, ', ';a, ';afirm, ';diss, ';diz, ';o, 'nã,..."
9,"[[#, ', ';a, ';afirm, ';diss, ';diz, ';o, 'nã,..."


In [28]:
cv = CountVectorizer(ngram_range=(1,1), min_df = 15, tokenizer = my_tokenizer)

In [29]:
X = cv.fit_transform(corpus['ngram'])

In [30]:
ngram_list = cv.get_feature_names()
len(ngram_list)

28764

In [31]:
polarity_scores = np.zeros((30, len(ngram_list)))

for i, itr in tqdm(enumerate(posterior_list)):
    for idx, n_gram in enumerate(itr[0]):
        tmp_n = ngram_list.index(n_gram)
        p_score = float(itr[3][idx])/float(itr[1][idx])
        polarity_scores[i][tmp_n] = p_score
        
polarity_scores.shape

30it [14:20, 28.67s/it]


(30, 28764)

In [32]:
for i, v in enumerate(polarity_scores):
    for j, w in enumerate(v):
         if polarity_scores[i][j] == 0:
             polarity_scores[i][j] = None

In [33]:
df_p_scores = pd.DataFrame(polarity_scores)
df_p_scores = df_p_scores.fillna(df_p_scores.mean())

In [34]:
df_p_scores.T.isnull().sum()

0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    1
24    1
25    1
26    1
27    1
28    1
29    1
dtype: int64

In [35]:
avg_polarity_scores = list(df_p_scores.mean())

In [36]:
ps = pd.DataFrame(avg_polarity_scores, index=ngram_list)
ps

Unnamed: 0,0
#,1.292694
',0.943288
';a,2.227960
';afirm,0.334053
';diss,2.203546
...,...
—um,1.010640
‘,0.775251
’,1.283760
€,0.679467


In [37]:
ps = ps[ps[0].apply(lambda x: x > 1.2 or x < 0.8)]
ps

Unnamed: 0,0
#,1.292694
';a,2.227960
';afirm,0.334053
';diss,2.203546
';diz,0.681499
...,...
—os,0.442707
—r,0.715117
‘,0.775251
’,1.283760


In [38]:
ps.to_csv('C:/Users/user/Desktop/globo/polarity_score_new.csv', encoding = 'utf-8')

In [39]:
ps[ps[0]<0.8]

Unnamed: 0,0
';afirm,0.334053
';diz,0.681499
';o,0.580788
'nã,0.141745
-2,0.585870
...,...
—no,0.747049
—os,0.442707
—r,0.715117
‘,0.775251


In [40]:
ps[ps[0]>1.2]

Unnamed: 0,0
#,1.292694
';a,2.227960
';diss,2.203546
*,1.695036
+,2.141892
...,...
—de,1.345460
—el,2.603247
—em,1.202985
—nã,2.440193
