In [1]:
import pandas as pd
import time

In [2]:
import logging
import gensim
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import nltk.stem
import string
import re
import numpy as np
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
import joblib



In [3]:
# Translation table that deletes every ASCII punctuation character.
remove = str.maketrans('','',string.punctuation)
stemwords = nltk.stem.SnowballStemmer('english')
# Private copies used by token(), so reassigning `remove` in another cell cannot break it.
_PUNCT_TABLE = str.maketrans('','',string.punctuation)
_STOPWORD_CACHE = {}

def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, loading it only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']

def token(x):
    """Clean one piece of text and return its stemmed tokens.

    Steps, in order: lower-case, delete digits, delete ASCII punctuation,
    tokenize with ``word_tokenize``, drop English stop words, then stem each
    remaining token with the Snowball stemmer.

    Parameters
    ----------
    x : str
        Raw text.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.
    """
    x = x.lower()
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    x = re.sub(r'\d','',x)
    x = x.translate(_PUNCT_TABLE)
    cutwords = word_tokenize(x)
    # Set lookup against a cached stop-word list instead of re-reading the
    # corpus and scanning a list for every single token.
    stop_set = _english_stopwords()
    without_stopwords = [w for w in cutwords if w not in stop_set]
    cleaned_text = [stemwords.stem(w) for w in without_stopwords]
    return cleaned_text

In [4]:
def get_W2V(data,name = 'W2V',vector_size = 256,min_count = 10,epochs = 2):
    """Train a Word2Vec model on tokenized sentences and save it to disk.

    Parameters
    ----------
    data : iterable of list of str
        Tokenized sentences. It is iterated once to build the vocabulary and
        again for training, so it must be re-iterable (not a one-shot generator).
    name : str
        File stem; the model is written to ``name + '.model'``.
    vector_size : int
        Embedding dimensionality (default 256, the value previously hard-coded).
    min_count : int
        Words seen fewer times than this are left out of the vocabulary.
    epochs : int
        Number of training passes over ``data``.

    Returns
    -------
    Word2Vec
        The trained model.
    """
    model = Word2Vec(vector_size = vector_size,min_count = min_count)
    model.build_vocab(data)
    model.train(data,total_examples = model.corpus_count,epochs = epochs)
    model.save(name+'.model')
    return model

def get_sentence_vector(sentence_list,model):
    """Represent a tokenized sentence as one embedding vector.

    The vectors of all in-vocabulary tokens are summed and divided by the
    total number of tokens, so out-of-vocabulary tokens count in the
    denominator.

    Parameters
    ----------
    sentence_list : list of str
        Tokens of one sentence. Pass tokens, not a raw string: a string would
        be iterated character by character.
    model : Word2Vec
        Trained model; only ``model.wv`` is used.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``model.wv.vector_size``. For an empty token
        list every entry is NaN (the previous 0/0 result, now without the
        runtime warning), so such rows can still be filtered out downstream.
    """
    keyed_vectors = model.wv
    dim = keyed_vectors.vector_size
    n = len(sentence_list)
    if n == 0:
        return np.full(dim, np.nan)
    # Dict lookup is O(1); the old `in index_to_key` scanned the whole vocabulary list.
    vocabulary = keyed_vectors.key_to_index
    vector = np.zeros(dim)
    for word in sentence_list:
        if word in vocabulary:
            vector = vector + keyed_vectors[word]
    return vector / n

In [5]:
def listStore(listname,filename):
    """Save ``listname`` as a NumPy array in ``filename + '.npy'`` and print a confirmation."""
    target_path = filename + '.npy'
    np.save(target_path, np.array(listname))
    print('succeed!')

def readlist(path):
    """Load the ``.npy`` file at ``path`` and return its contents as nested Python lists."""
    return np.load(path).tolist()

In [6]:
def inputclean(X,Y):
    """Remove samples whose feature vector contains NaN or infinite values.

    ``X`` and ``Y`` are modified in place (matching positions are deleted
    from both), the number of removed samples is printed, and the same two
    objects are returned.
    """
    def _is_invalid(row):
        # Same test as before: any non-finite entry, or every entry NaN.
        return (not np.isfinite(row).all()) or bool(np.isnan(row).all())

    bad_positions = [pos for pos in range(len(X)) if _is_invalid(X[pos])]
    # Delete from the back so the earlier positions stay valid.
    for pos in bad_positions[::-1]:
        del X[pos]
        del Y[pos]
    print(f'{len(bad_positions)} sets of data have been dropped.')
    return X,Y

In [7]:

def svm_train(train_vecs,y_train,test_vecs,y_test,name = 'model1'):
    """Fit an RBF-kernel SVC, save it, and print its training time and test accuracy.

    Parameters
    ----------
    train_vecs, y_train
        Training feature vectors and labels passed to ``SVC.fit``.
    test_vecs, y_test
        Held-out vectors and labels passed to ``SVC.score``.
    name : str
        File stem; the fitted model is dumped to ``name + '.pkl'``.

    Returns
    -------
    SVC
        The fitted classifier.
    """
    clf=SVC(kernel='rbf',verbose=True)
    start = time.time()
    clf.fit(train_vecs,y_train)
    # Stop the clock before serialising so the reported duration covers
    # fitting only, as in knn_train.
    duration = time.time()-start
    joblib.dump(clf, name+'.pkl')
    print(f'duration: {duration}')
    print(clf.score(test_vecs,y_test))
    return clf

In [8]:
def knn_train(train_vecs,y_train,test_vecs,y_test,name = 'knn'):
    """Fit a 6-nearest-neighbours classifier and save it to ``name + '.pkl'``.

    ``test_vecs`` and ``y_test`` are accepted for signature parity with
    ``svm_train`` but are not used; no evaluation happens here.
    Prints the fit duration and a save confirmation, then returns the
    fitted classifier.
    """
    classifier = KNN(n_neighbors = 6)

    t0 = time.time()
    classifier.fit(train_vecs,y_train)
    elapsed = time.time() - t0
    print(f"training completed! duration:{elapsed}")

    joblib.dump(classifier, name + '.pkl')
    print("model saved!")

    return classifier

In [9]:
def predict(string,wvmodel,premodel):
    """Classify a single sentence and print it followed by ' toxic' or ' non-toxic'.

    The sentence is tokenized with ``token``, embedded with
    ``get_sentence_vector`` using ``wvmodel``, and passed to
    ``premodel.predict``; a predicted label equal to 1 means toxic.
    """
    sentence_vec = get_sentence_vector(token(string), wvmodel)
    label = premodel.predict([sentence_vec])[0]
    verdict = ' toxic' if int(label) == 1 else ' non-toxic'
    print(string, verdict)

In [12]:
# Load the raw training comments, tokenize them, and keep only the token
# lists and the `toxic` label. A cleaned copy is written to disk.
df_train = (
    pd.read_csv("train_data.csv")
    .assign(cleaned=lambda frame: frame['comment_text'].apply(token))
    [['cleaned','toxic']]
)
df_train.to_csv('train_cleaned.csv')
x_train, y_train = df_train['cleaned'], np.array(df_train['toxic'])

In [13]:
# Same preprocessing as the training split, applied to the test comments.
df_test = (
    pd.read_csv("test_data.csv")
    .assign(cleaned=lambda frame: frame['comment_text'].apply(token))
    [['cleaned','toxic']]
)
df_test.to_csv('test_cleaned.csv')
x_test, y_test = df_test['cleaned'], np.array(df_test['toxic'])

In [15]:
# Reuse the saved Word2Vec model when it exists. Otherwise train one on the
# tokenized training comments (get_W2V saves it as 'W2V.model'), so this cell
# also works on a fresh checkout.
try:
    model = Word2Vec.load('W2V.model')
except FileNotFoundError:
    model = get_W2V(x_train)

In [16]:
# Turn every tokenized comment into its averaged Word2Vec embedding.
trVec = list(map(lambda tokens: get_sentence_vector(tokens, model), x_train))  # training vectors
teVec = list(map(lambda tokens: get_sentence_vector(tokens, model), x_test))   # test vectors

  


In [17]:
# Cache the feature vectors on disk so later sessions can skip preprocessing.
for vectors, stem in ((trVec, 'trVec'), (teVec, 'teVec')):
    listStore(vectors, stem)

succeed!
succeed!


In [18]:
# Cache the label arrays alongside the feature vectors.
for labels, stem in ((y_train, 'ytr'), (y_test, 'yte')):
    listStore(labels, stem)

succeed!
succeed!


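## Start running from here! (Once the `.npy` files exist, the preprocessing cells above can be skipped; the import and function-definition cells are still required.)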

In [10]:
# Reload the cached features and labels, then drop samples whose vector
# contains NaN or infinite values.
X_tr, Y_tr = readlist('trVec.npy'), readlist('ytr.npy')
X_te, Y_te = readlist('teVec.npy'), readlist('yte.npy')
X_tr, Y_tr = inputclean(X_tr, Y_tr)
X_te, Y_te = inputclean(X_te, Y_te)

435 sets of data have been dropped.
7 sets of data have been dropped.


In [31]:
# Small leading subsets (10000 train / 300 test samples) for the SVM run.
x_tr, y_tr = X_tr[:10000], Y_tr[:10000]
x_te, y_te = X_te[:300], Y_te[:300]

In [32]:
# Train the KNN classifier on the full cleaned training set.
clf = knn_train(train_vecs=X_tr, y_train=Y_tr, test_vecs=X_te, y_test=Y_te)

training completed! duration:2.188566207885742
model saved!


In [42]:
# Train the SVM on the reduced subsets and print its test accuracy.
svm = svm_train(train_vecs=x_tr, y_train=y_tr, test_vecs=x_te, y_test=y_te)

[LibSVM]duration: 6.59570837020874
0.98


In [None]:
# Predict labels for the whole test set with the KNN classifier.
pre = clf.predict(X=X_te)

In [11]:
# Reload the saved KNN classifier and Word2Vec model, then classify one sentence.
# Note: from here on `model` refers to the KNN classifier, not the Word2Vec model.
model = joblib.load("knn.pkl")
wvmodel = Word2Vec.load('W2V.model')
test = 'you suck ,motherfucker'
# get_sentence_vector expects a list of tokens. Passing the raw string made it
# iterate over single characters, so tokenize first.
vec = get_sentence_vector(token(test),wvmodel)
predict(test,wvmodel,model)

you suck ,motherfucker  toxic


In [22]:
# Second sample sentence for the single-sentence classifier.
test = "sorry but you are still an asshole,you fuck"

In [23]:
# Classify the sample sentence with the reloaded KNN model.
predict(string=test, wvmodel=wvmodel, premodel=model)

sorry but you are still an asshole,you fuck  toxic


In [12]:
def getCM(model,X_te,Y_te,batch_size = 512):
    """Compute the binary confusion-matrix counts of ``model`` on a test set.

    Samples are predicted in batches rather than one ``predict`` call per
    sample, and a text progress bar with an ETA is redrawn after each batch.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict(rows)``.
    X_te
        Sliceable sequence of feature vectors.
    Y_te
        Sequence of true labels, indexed by position; ``0`` is the negative
        class and any other value counts as positive.
    batch_size : int
        Number of samples sent to ``model.predict`` at a time.

    Returns
    -------
    tuple of int
        ``(n00, n01, n10, n11)`` where ``nTP`` counts samples with true class
        ``T`` and predicted class ``P`` (so ``n01`` are false positives and
        ``n10`` are false negatives).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    total = len(X_te)
    bar_width = 30
    counts = [[0, 0], [0, 0]]  # counts[true_class][predicted_class]
    widest = 0

    def _show(text, final = False):
        # Pad to the widest line printed so far, so a shorter line fully
        # overwrites a longer one (the old output left stray characters).
        nonlocal widest
        widest = max(widest, len(text))
        print('\r' + text.ljust(widest), end = '\n' if final else '')

    start = time.time()
    for lo in range(0, total, batch_size):
        hi = min(lo + batch_size, total)
        predictions = model.predict(X_te[lo:hi])
        for offset, pre in enumerate(predictions):
            true = Y_te[lo + offset]
            counts[0 if true == 0 else 1][0 if pre == 0 else 1] += 1
        # ETA extrapolated from the average time per processed sample.
        eta = (time.time() - start) / hi * (total - hi)
        _show('■' * ((hi * bar_width) // total)
              + str(round(hi * 100 / total, 2)) + '%'
              + ' eta:' + str(round(eta, 1)) + 's')
    _show('■' * bar_width + '100%' + 'completed!', final = True)

    (n00, n01), (n10, n11) = counts
    return n00,n01,n10,n11

In [14]:
# Confusion-matrix counts of the reloaded KNN model on the cleaned test set.
(n00, n01, n10, n11) = getCM(model=model, X_te=X_te, Y_te=Y_te)

■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■100%completed!s


In [15]:
# Accuracy, precision and recall from the confusion-matrix counts. A ratio
# with a zero denominator (e.g. no positive predictions) is reported as NaN
# instead of raising ZeroDivisionError. Printed labels are left unchanged.
n = n00+n01+n10+n11
predicted_pos = n01+n11
actual_pos = n10+n11
accuracy = (n00+n11)/n if n else float('nan')
precision = n11/predicted_pos if predicted_pos else float('nan')
recall = n11/actual_pos if actual_pos else float('nan')
print(f'accuracy:{accuracy}')
print(f'presicion:{precision}')
print(f'recall:{recall}')

accuracy:0.8574660633484162
presicion:0.9513422818791947
recall:0.4491089108910891


In [21]:
# True negatives: true label 0, predicted 0.
n00

59915

In [29]:
# False negatives: true label non-zero, predicted 0.
n10

3457

In [30]:
# False positives: true label 0, predicted non-zero.
n01

729

In [31]:
# True positives: true label non-zero, predicted non-zero.
n11

2939

In [25]:
# True positives again.
# NOTE(review): the recorded output of this cell (0) differs from the value
# shown by the previous cell, and the execution counts are out of order, so
# it presumably comes from a different run - confirm or delete this cell.
n11

0

## ensemble

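EasyEnsemble (its default base learner is AdaBoost over decision trees; the cell below passes the saved KNN model as the base estimator instead)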

In [33]:
# EasyEnsemble with the saved KNN classifier as base estimator, 10 members.
# NOTE(review): newer imbalanced-learn releases appear to rename
# `base_estimator` to `estimator` - confirm against the installed version.
from imblearn.ensemble import EasyEnsembleClassifier as EEC
knnmodel = joblib.load('knn.pkl')
ee = EEC(
    base_estimator = knnmodel,
    n_estimators = 10,
    warm_start = True,
    random_state = 0,
).fit(X_tr, Y_tr)

In [35]:
# Confusion-matrix counts of the EasyEnsemble model.
(n00, n01, n10, n11) = getCM(model=ee, X_te=X_te, Y_te=Y_te)

■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■100%completed!ss


In [36]:
# Accuracy, precision and recall from the confusion-matrix counts. A ratio
# with a zero denominator is reported as NaN instead of raising
# ZeroDivisionError. Printed labels are left unchanged.
n = n00+n01+n10+n11
predicted_pos = n01+n11
actual_pos = n10+n11
accuracy = (n00+n11)/n if n else float('nan')
precision = n11/predicted_pos if predicted_pos else float('nan')
recall = n11/actual_pos if actual_pos else float('nan')
print(f'accuracy:{accuracy}')
print(f'presicion:{precision}')
print(f'recall:{recall}')

accuracy:0.8589415699390124
presicion:0.6677945247616118
recall:0.8598019801980198


In [37]:
# True negatives: true label 0, predicted 0.
n00

6561

In [38]:
# False positives: true label 0, predicted non-zero.
n01

1080

In [39]:
# False negatives: true label non-zero, predicted 0.
n10

354

In [40]:
# True positives: true label non-zero, predicted non-zero.
n11

2171

RUSBoost

In [12]:
# RUSBoost with library defaults and a fixed seed.
from imblearn.ensemble import RUSBoostClassifier as RUS
rus = RUS(random_state = 0).fit(X_tr, Y_tr)

In [16]:
# Confusion-matrix counts of the RUSBoost model.
(n00, n01, n10, n11) = getCM(model=rus, X_te=X_te, Y_te=Y_te)

■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■100%completed!


In [17]:
# Accuracy, precision and recall from the confusion-matrix counts. A ratio
# with a zero denominator is reported as NaN instead of raising
# ZeroDivisionError. Printed labels are left unchanged.
n = n00+n01+n10+n11
predicted_pos = n01+n11
actual_pos = n10+n11
accuracy = (n00+n11)/n if n else float('nan')
precision = n11/predicted_pos if predicted_pos else float('nan')
recall = n11/actual_pos if actual_pos else float('nan')
print(f'accuracy:{accuracy}')
print(f'presicion:{precision}')
print(f'recall:{recall}')

accuracy:0.8719260279362581
presicion:0.6993805021193349
recall:0.8495049504950495


BalancedBagging

In [18]:
# Balanced bagging with library defaults. No random_state is set, so results
# can vary between runs.
from imblearn.ensemble import BalancedBaggingClassifier as BBC
bbc = BBC(warm_start = True).fit(X_tr, Y_tr)

In [19]:
# Confusion-matrix counts of the balanced-bagging model.
(n00, n01, n10, n11) = getCM(model=bbc, X_te=X_te, Y_te=Y_te)

■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■100%completed!


In [20]:
# Accuracy, precision and recall from the confusion-matrix counts. A ratio
# with a zero denominator is reported as NaN instead of raising
# ZeroDivisionError. Printed labels are left unchanged.
n = n00+n01+n10+n11
predicted_pos = n01+n11
actual_pos = n10+n11
accuracy = (n00+n11)/n if n else float('nan')
precision = n11/predicted_pos if predicted_pos else float('nan')
recall = n11/actual_pos if actual_pos else float('nan')
print(f'accuracy:{accuracy}')
print(f'presicion:{precision}')
print(f'recall:{recall}')

accuracy:0.8819594727523117
presicion:0.7694184627897519
recall:0.7493069306930693


BalancedRandomForest

In [25]:
# Balanced random forest using the entropy split criterion. No random_state
# is set, so results can vary between runs.
from imblearn.ensemble import BalancedRandomForestClassifier as BRF
brf = BRF(criterion = 'entropy').fit(X_tr, Y_tr)

In [26]:
# Confusion-matrix counts of the balanced random forest.
(n00, n01, n10, n11) = getCM(model=brf, X_te=X_te, Y_te=Y_te)

■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■100%completed!


In [27]:
# Accuracy, precision and recall from the confusion-matrix counts. A ratio
# with a zero denominator is reported as NaN instead of raising
# ZeroDivisionError. Printed labels are left unchanged.
n = n00+n01+n10+n11
predicted_pos = n01+n11
actual_pos = n10+n11
accuracy = (n00+n11)/n if n else float('nan')
precision = n11/predicted_pos if predicted_pos else float('nan')
recall = n11/actual_pos if actual_pos else float('nan')
print(f'accuracy:{accuracy}')
print(f'presicion:{precision}')
print(f'recall:{recall}')

accuracy:0.8791068266771591
presicion:0.7108002602472349
recall:0.8653465346534653


In [39]:
# Embed one hand-written sentence for a spot check of the classifier.
x =  'a pair of jewhating weiner nazi schmucks'
# Tokenize first: passing the raw string made get_sentence_vector iterate over
# single characters, so the vector did not represent the sentence's words.
temp = get_sentence_vector(token(x),wvmodel)

In [40]:
# Label predicted by the balanced random forest for the sentence vector above.
brf.predict(X=[temp])

array([0])