In [None]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Conv1D ,GlobalMaxPooling1D
from tensorflow.keras.layers import Dense,Dropout,Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
warnings.filterwarnings("ignore")

In [None]:
# Plot model performance (ROC curve with accuracy / AUC annotations)
def performance(y_true , predict , color = "g" , ann = True):
    """Draw the ROC curve for a binary classifier's predicted scores.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    predict : array-like supporting slicing and ``>`` (e.g. a NumPy array)
        Predicted scores. Scores above 0.5 are treated as the positive
        class when accuracy is computed.
    color : str, default "g"
        Matplotlib colour used for the ROC line.
    ann : bool, default True
        When True, write the accuracy and AUC onto the figure.

    Returns
    -------
    None
        The function only draws on a newly created matplotlib figure.
    """
    acc = accuracy_score(y_true , predict[:] > 0.5)
    auc = roc_auc_score(y_true , predict[:])
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr , tpr , _ = roc_curve(y_true , predict[:])

    plt.figure()
    plt.plot(fpr , tpr , color=color)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    if ann:
        # Positions are in data coordinates; ROC axes both span [0, 1].
        plt.annotate("Acc: %0.2f" % acc , (0.2 , 0.7) , size=14)
        plt.annotate("AUC: %0.2f" % auc , (0.2 , 0.6) , size=14)

# 读取数据

In [None]:
# Read both CSV splits with identical parser settings (training first).
df_train, df_validation = (
    pd.read_csv(csv_path, sep=',', header=0, encoding='utf-8')
    for csv_path in ("training.csv", "validation.csv")
)

# Text stays a pandas Series; labels are passed through tf.squeeze, which
# turns each label column into a TensorFlow tensor.
train_X, train_Y = df_train['content'], tf.squeeze(df_train['label'])
validate_X, validate_Y = df_validation['content'], tf.squeeze(df_validation['label'])

# 预处理

In [None]:
# Display the shapes of the text and label objects for both splits.
tuple(part.shape for part in (train_X, train_Y, validate_X, validate_Y))

In [None]:
import jieba
import string
def clean_CN(corpus, stopwords_path=r"D:\大创项目\LDA\stopwords\CNstopwords.txt"):
    """Tokenise Chinese documents with jieba and filter the tokens.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents. An entry that is not a string (for example the NaN
        pandas produces for an empty CSV cell) yields an empty token list,
        so the result stays aligned one-to-one with the input.
    stopwords_path : str, optional
        UTF-8 text file holding one stop word per line. Defaults to the
        location that was previously hard-coded, so existing calls keep
        working; pass another path to run on a different machine.

    Returns
    -------
    list of list of str
        One token list per input document, containing only tokens that are
        not stop words and for which ``str.isalpha()`` is true.
    """
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        stop = {line.strip() for line in f}

    clean_corpus = []
    for doc in corpus:
        if not isinstance(doc, str):
            # Missing / non-text entry: keep a placeholder to preserve alignment.
            clean_corpus.append([])
            continue
        words = jieba.lcut(doc)
        # str.isalpha() is True for CJK characters and letters, so this drops
        # digits, punctuation, whitespace and mixed tokens such as "abc123".
        clean_corpus.append([w for w in words if w not in stop and w.isalpha()])
    return clean_corpus


In [None]:
# Tokenise and filter both splits; the names are rebound to lists of token lists.
train_X, validate_X = clean_CN(train_X), clean_CN(validate_X)

# 特征提取 

In [None]:
from gensim import corpora
# Pool the tokenised training and validation documents into one corpus
# (a new list; train_X itself is left untouched), then report the sizes.
texts = [*train_X, *validate_X]
print(len(texts), len(train_X), len(validate_X), sep="\n")

In [None]:
# Build the token -> integer-id vocabulary from the pooled token lists in `texts`.
dictionary = corpora.Dictionary(texts)
# Prune the vocabulary: drop tokens that occur in fewer than 10 documents or
# in more than 50% of all documents.
dictionary.filter_extremes(no_below=10, no_above=0.5)
# Reassign ids so that they are contiguous after the pruning above.
dictionary.compactify() 
# Printing the dictionary shows a short summary of its contents.
print(dictionary)

In [None]:
# Convert every token list into gensim bag-of-words form: a list of
# (token_id, count) pairs. Tokens missing from `dictionary` are dropped.
train_X = list(map(dictionary.doc2bow, train_X))
validate_X = list(map(dictionary.doc2bow, validate_X))


In [None]:
def expand_bow(bow_docs):
    """Expand bag-of-words documents into flat lists of token ids.

    Parameters
    ----------
    bow_docs : iterable of list of (token_id, count) pairs
        Documents in the form produced by ``dictionary.doc2bow``.

    Returns
    -------
    list of list of int
        For each document, every ``token_id`` repeated ``count`` times, in
        the order the pairs appear in the bag-of-words list.
    """
    return [
        [token_id for token_id, count in doc for _ in range(count)]
        for doc in bow_docs
    ]


# NOTE(review): a bag-of-words representation does not keep the original word
# order, so these id sequences are grouped by token rather than in reading
# order -- confirm that this is intended for the Conv1D model further down.
new_train_X = expand_bow(train_X)
print(len(new_train_X))
print(new_train_X[10])

new_validate_X = expand_bow(validate_X)
print(len(new_validate_X))
print(new_validate_X[10])

In [None]:
from tensorflow.keras.preprocessing import sequence

# Bring every id sequence to exactly 15 entries (pad_sequences defaults:
# zeros are added at the front, over-long sequences are cut at the front).
# The network's declared input length has to match this value.
train_x, validate_x = (
    sequence.pad_sequences(id_seqs, maxlen=15)
    for id_seqs in (new_train_X, new_validate_X)
)

In [None]:
# Display the shapes of the padded training / validation id matrices.
tuple(padded.shape for padded in (train_x, validate_x))

# 构建网络

In [None]:
# ---- Hyper-parameters -------------------------------------------------------
# Vocabulary size for the embedding table, derived from the fitted dictionary
# instead of a hard-coded number (doc2bow ids lie in [0, len(dictionary))).
# NOTE(review): pad_sequences pads with 0, which is also the id of the first
# vocabulary token -- confirm whether padding should get a reserved id.
max_features = len(dictionary)
embedding_dims = 25
# Input length is taken from the padded data so the model always matches what
# is passed to fit(). Three MaxPooling1D stages (each halves the length) need
# at least 8 time steps.
maxlen = train_x.shape[1]
filters = 250
kernel_size = 3
hidden_dims = 250
batch_size = 128
epochs = 100


def conv_pool_block():
    """Return fresh layers for one Conv1D (relu, 'same' padding) + MaxPooling1D stage."""
    return [
        # relu is applied inside the convolution, so no separate Activation
        # layer is needed afterwards (relu applied twice equals relu once).
        keras.layers.Conv1D(filters,
                            kernel_size,
                            padding='same',
                            activation='relu',
                            strides=1),
        keras.layers.MaxPooling1D(),
    ]


model = keras.Sequential(
    [
        keras.layers.Embedding(max_features,
                               embedding_dims,
                               input_length=maxlen),
        keras.layers.Dropout(0.2),

        *conv_pool_block(),
        *conv_pool_block(),
        keras.layers.Dropout(0.2),
        *conv_pool_block(),

        keras.layers.Flatten(),
        keras.layers.Dense(hidden_dims),
        keras.layers.Dropout(0.2),
        keras.layers.Activation('relu'),
        # Single sigmoid unit: the output is one probability per document.
        keras.layers.Dense(1),
        keras.layers.Activation('sigmoid'),
    ]
)

opt = keras.optimizers.Adam(learning_rate=0.01)
# A single sigmoid output trained on one label per sample needs binary
# cross-entropy. Categorical cross-entropy normalises over the class axis,
# which has size 1 here, so it cannot provide a useful training signal.
model.compile(loss='binary_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])
model.summary()

model.fit(train_x, train_Y,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(validate_x, validate_Y))


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 25)            124075    
_________________________________________________________________
dropout (Dropout)            (None, 20, 25)            0         
_________________________________________________________________
conv1d (Conv1D)              (None, 20, 250)           19000     
_________________________________________________________________
activation (Activation)      (None, 20, 250)           0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 10, 250)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 10, 250)           187750    
_________________________________________________________________
activation_1 (Activation)    (None, 10, 250)           0