In [1]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from tqdm import tqdm

import numpy as np
import pandas as pd
import csv

# NBC 수행

1. 코퍼스 가져오기

In [2]:
# Load the labelled n-gram corpus and drop every row that contains a missing value.
corpus_path = 'C:/Users/user/Desktop/globo/ngram_label.csv'
corpus = pd.read_csv(corpus_path).dropna()
corpus

Unnamed: 0,date,label,ngram
0,2018-06-01,-1,"nã,páti,descans,local,pass,noit,famíl,viag,lis..."
1,2018-06-01,-1,"o,banc,central,mantev,atuaçã,merc,câmbi,vend,1..."
2,2018-06-01,-1,"indic,aind,investig,trat,pagament,propin,minis..."
3,2018-06-01,-1,"a,empres,possu,hoj,14,pesso,—entr,vice-preside..."
4,2018-06-01,-1,"vei,bendin,banc,brasil,onde,cuid,áre,financeir..."
...,...,...,...
8524,2019-12-31,0,"ele,indic,viaj,pequim,dat,posterior,fas,dois,d..."
8525,2019-12-31,0,"em,cas,licit,empres,vencedor,oferec,uniã,maior..."
8526,2019-12-31,0,"trump,anunc,primeir,vez,plan,pact,comercial,in..."
8527,2019-12-31,0,"toffol,conced,limin,estar,plantã,durant,recess..."


In [3]:
# Tokenizer function
def my_tokenizer(x):
    """Split a comma-delimited n-gram string into its tokens.

    Parameters
    ----------
    x : str
        One document, stored as tokens joined by ",".

    Returns
    -------
    list of str
        The pieces between commas, in order. Nothing is stripped or
        filtered, so an empty input yields [""] and consecutive commas
        yield empty-string tokens.
    """
    tokens = x.split(sep=",")
    return tokens

2. 데이터 파이프라인

In [4]:
# Build the two stages first so they stay directly reachable after fitting,
# then chain them: token counts -> multinomial Naive Bayes.
vect = CountVectorizer(
    tokenizer=my_tokenizer,   # documents are already comma-joined tokens
    ngram_range=(1, 1),
    min_df=15,                # ignore tokens found in fewer than 15 documents
)
clf = MultinomialNB(alpha=0.001)
text_clf = Pipeline(steps=[('vect', vect), ('clf', clf)])

In [5]:
# Bagging: 30 rounds. Every round draws a fresh random 90/10 train/test split
# (seeded with the round index), refits the pipeline and records the result.
N_ROUNDS = 30
accuracy = np.zeros(N_ROUNDS)   # share of correct test predictions per round
posterior_list = []             # one stacked (names + per-class probabilities) array per round

for i in tqdm(range(N_ROUNDS)):
    X_train, X_test, y_train, y_test = train_test_split(
        corpus['ngram'], corpus['label'],
        random_state=i, train_size=0.9, shuffle=True)
    text_clf.fit(X_train, y_train)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back only on old releases that lack it.
    if hasattr(vect, 'get_feature_names_out'):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()

    # Row 0: this round's vocabulary. Rows 1..: P(token | class), one row per
    # class in clf.classes_ order. Stacking names with floats yields a
    # string-typed array, so readers convert the probabilities back with float().
    posterior_list.append(np.vstack([np.asarray(feature_names, dtype=str),
                                     np.exp(clf.feature_log_prob_)]))

    pred = text_clf.predict(X_test)
    # Store the accuracy itself (fraction of hits), not the raw number of
    # correct predictions, which depends on the test-set size.
    accuracy[i] = np.mean(pred == y_test)

100%|██████████| 30/30 [00:56<00:00,  1.89s/it]


In [6]:
# Class labels of the fitted classifier; the rows of clf.feature_log_prob_
# follow this order.
text_clf.named_steps['clf'].classes_

array([-1,  0,  1], dtype=int64)

In [7]:
# Quick look at what each round stored in posterior_list.
pd.DataFrame(data=posterior_list)

Unnamed: 0,0
0,"[[', 's, ., .a, .a;companh, .a;empres, .a;expe..."
1,"[[', 's, ., .a, .a;companh, .a;empres, .a;expe..."
2,"[[', 's, ., .a, .a;companh, .a;empres, .a;expe..."
3,"[[', 's, ., .a, .a;companh, .a;empres, .a;expe..."
4,"[[', 's, ., .a, .a;companh, .a;empres, .a;expe..."
5,"[[', 's, ., .a, .a;companh, .a;empres, .a;expe..."
6,"[[', 's, ., .a, .a;companh, .a;empres, .a;expe..."
7,"[[', 's, ., .a, .a;companh, .a;empres, .a;expe..."
8,"[[', 's, ., .a, .a;companh, .a;empres, .a;expe..."
9,"[[', 's, ., .a, .a;companh, .a;empres, .a;expe..."


In [8]:
# Stand-alone vectorizer with the same settings as the pipeline's 'vect' step.
cv = CountVectorizer(
    tokenizer=my_tokenizer,
    min_df=15,
    ngram_range=(1, 1),
)

In [9]:
# Learn the vocabulary from every document in the corpus (no train/test split here).
X = cv.fit_transform(raw_documents=corpus['ngram'])

In [10]:
# Vocabulary learned from the full corpus, kept as a plain list.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# (which returns an array, hence list()) and fall back on old releases.
if hasattr(cv, 'get_feature_names_out'):
    ngram_list = list(cv.get_feature_names_out())
else:
    ngram_list = cv.get_feature_names()
len(ngram_list)

2277

In [11]:
# One row per bagging round, one column per n-gram of the full-corpus vocabulary.
# Sized from posterior_list so it always matches the number of rounds actually run.
# Cells stay 0 for n-grams that a round's model did not contain.
polarity_scores = np.zeros((len(posterior_list), len(ngram_list)))

# Resolve n-gram -> column once; calling ngram_list.index() inside the loop
# rescans the list for every token. An unknown n-gram now raises KeyError
# (list.index raised ValueError).
column_of = {n_gram: col for col, n_gram in enumerate(ngram_list)}

for i, itr in tqdm(enumerate(posterior_list), total=len(posterior_list)):
    # itr[0] holds the round's n-grams; itr[1] and itr[3] hold P(token | class)
    # for the first and third class (stored as strings, hence float()).
    for idx, n_gram in enumerate(itr[0]):
        p_score = float(itr[3][idx]) / float(itr[1][idx])
        polarity_scores[i][column_of[n_gram]] = p_score

polarity_scores.shape

30it [00:02, 13.34it/s]


(30, 2277)

In [12]:
# A score of exactly 0 is treated as "no value" and turned into NaN (in place).
# One boolean-mask assignment replaces the element-by-element Python loop.
polarity_scores[polarity_scores == 0] = np.nan

In [13]:
# Fill each column's missing rounds with that column's mean over the rounds
# that do have a score.
df_p_scores = pd.DataFrame(polarity_scores)
column_means = df_p_scores.mean()
df_p_scores = df_p_scores.fillna(column_means)

In [14]:
# Remaining missing values per round (row); all zeros means the fill was complete.
df_p_scores.isnull().sum(axis=1)

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
dtype: int64

In [15]:
# Average every column over all rounds.
avg_polarity_scores = df_p_scores.mean(axis=0).tolist()

In [19]:
# Averaged polarity score per n-gram: index = n-gram, single column 0 = score.
ps = pd.DataFrame(data=avg_polarity_scores, index=ngram_list)
ps

Unnamed: 0,0
',2.989716
's,0.920036
.,0.929738
.a,2.146970
.a;companh,1.226742
...,...
últim;anos,0.773918
últim;mes,0.550130
únic,0.800181
–,1.103952


In [20]:
# Keep only n-grams whose averaged score lies outside the 0.8-1.2 band.
score = ps[0]
ps = ps[(score > 1.2) | (score < 0.8)]
ps

Unnamed: 0,0
',2.989716
.a,2.146970
.a;companh,1.226742
.a;expect,1.247851
.a;petrobr,1.779007
...,...
óle,0.685254
últim,0.753788
últim;anos,0.773918
últim;mes,0.550130


In [21]:
# Write the filtered scores (n-gram index + score column) to disk as UTF-8 CSV.
output_path = 'C:/Users/user/Desktop/globo/polarity_score_new.csv'
ps.to_csv(output_path, encoding='utf-8')

In [22]:
# n-grams whose score is below 0.8.
ps.loc[ps[0].lt(0.8)]

Unnamed: 0,0
.em,0.405271
.na;seman,0.392651
.nest;ano,0.291913
.no;acumul,0.676243
.no;cas,0.350394
...,...
índic;nacional,0.788495
óle,0.685254
últim,0.753788
últim;anos,0.773918


In [24]:
# n-grams whose score is above 1.2.
ps.loc[ps[0].gt(1.2)]

Unnamed: 0,0
',2.989716
.a,2.146970
.a;companh,1.226742
.a;expect,1.247851
.a;petrobr,1.779007
...,...
águ,1.662491
índic;nacional;cust,1.428137
íntegr,1.239145
íntegr;not,1.922887
