In [1]:
import tensorflow as tf
from tensorflow import keras
import jieba
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from tensorflow.keras.layers import Embedding, LSTM, Dense
import sklearn
from tensorflow.keras.callbacks import Callback
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
from collections import Counter
import os

print(tf.__version__)

2.1.0


In [2]:
data_path= "./train_dev_data/"

os.listdir(data_path)

['answer_example.csv',
 'cn_dev.csv',
 'cn_train.csv',
 'en_dev.csv',
 'en_train.csv',
 '评价指标说明.md']

In [3]:
data_frame=pd.read_csv(data_path+'cn_train.csv',index_col='ID')
data_frame.head()

Unnamed: 0_level_0,Dialogue_id,Utterance_id,Speaker,Sentence,Label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,卖油条小刘,我说,0
1,0,1,保姆小张,干啥子嘛？,0
2,0,2,卖油条小刘,你看你往星空看月朦胧，鸟朦胧,1
3,0,3,卖油条小刘,咱是不是歇一下这双，疲惫的双腿？,0
4,0,4,卖油条小刘,快把我累死了,0


In [4]:
dialogue_size=data_frame['Dialogue_id'].max()+1
print(dialogue_size)

348


In [5]:
sub_data_frame_list=[]
for i in range(dialogue_size):
    sub_data_frame_list.append(data_frame[data_frame['Dialogue_id']==i])
sub_data_frame_list[0]

Unnamed: 0_level_0,Dialogue_id,Utterance_id,Speaker,Sentence,Label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,卖油条小刘,我说,0
1,0,1,保姆小张,干啥子嘛？,0
2,0,2,卖油条小刘,你看你往星空看月朦胧，鸟朦胧,1
3,0,3,卖油条小刘,咱是不是歇一下这双，疲惫的双腿？,0
4,0,4,卖油条小刘,快把我累死了,0
5,0,5,卖油条小刘,我说亲爱的大姐你贵姓啊？,1
6,0,6,保姆小张,我免贵姓张我叫张凤姑,0
7,0,7,卖油条小刘,凤姑,0
8,0,8,保姆小张,天天买你的油条还没有问过师傅，你贵姓啊？,0
9,0,9,卖油条小刘,我免贵，我姓刘，我叫刘建军,0


In [6]:
def stop_words(path):
    with open(path,encoding='utf-8') as f:
        return [l.strip() for l in f]

In [7]:
words_3d_list=[]
for sub_data_frame in sub_data_frame_list:
    words_2d_list=[]
    for sentence in sub_data_frame['Sentence']:
        # words_2d_list.append([x for x in jieba.cut(sentence) if x not in stop_words('./stop_words.txt')])
        words_2d_list.append([x for x in jieba.cut(sentence)])
    words_3d_list.append(words_2d_list)
print(len(words_3d_list)==dialogue_size)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\97110\AppData\Local\Temp\jieba.cache
Loading model cost 0.715 seconds.
Prefix dict has been built successfully.


True


In [8]:
print(words_3d_list[0][3])

['咱', '是不是', '歇', '一下', '这', '双', '，', '疲惫', '的', '双腿', '？']


In [9]:
# context_size:上下文窗口大小。
context_size=3

new_words_3d_list=[]
for i,words_2d_list in enumerate(words_3d_list):
    new_words_2d_list=[]
    for j in range(len(words_2d_list)):
        words=[]
        left = j-context_size if (j-context_size)>=0 else 0
        right= j+context_size if (j+context_size)<len(words_2d_list) else len(words_2d_list)-1
        for context_step in range(left,right+1):
            words.extend(words_2d_list[context_step])
        new_words_2d_list.append(words)
    new_words_3d_list.append(new_words_2d_list)
print(len(new_words_3d_list)==dialogue_size)
print(new_words_3d_list[0][3])

True
['我', '说', '干', '啥子', '嘛', '？', '你', '看', '你', '往', '星空', '看', '月', '朦胧', '，', '鸟', '朦胧', '咱', '是不是', '歇', '一下', '这', '双', '，', '疲惫', '的', '双腿', '？', '快', '把', '我', '累死', '了', '我', '说', '亲爱', '的', '大姐', '你', '贵姓', '啊', '？', '我免', '贵姓', '张', '我', '叫', '张凤姑']


In [10]:
length_list=[]
for new_words_2d_list in new_words_3d_list:
    for words in new_words_2d_list:
        length_list.append(len(words))

print(sorted(length_list)[len(length_list) // 2])
print(min(length_list)) 
print(max(length_list))

65
5
286


In [11]:
max_length=256

In [12]:
word_set=set()
for new_words_2d_list in new_words_3d_list:
    for words in new_words_2d_list:
        word_set=word_set.union(set(words))
vcab_size=len(word_set)
print(vcab_size)

13827


In [13]:
# 索引从1开始
word_index = 1
word2index = {}
for word in word_set:
    word2index[word] = word_index
    word_index += 1

In [14]:
generate_data = np.zeros((len(data_frame["Sentence"]), max_length))
i=-1
for new_words_2d_list in new_words_3d_list:
    for words in new_words_2d_list:
        i+=1
        for j, word in enumerate(words):
            if j >= max_length:
                continue
            generate_data[i,j] = word2index.get(word, 0)
print(generate_data[0])
print(generate_data[-1])

[ 7171.  7597.   528.  9269. 10628.  4587.  2836.  2105.  2836.  8820.
  4674.  2105.  9359.  3950. 11805.  9407.  3950.  6388.  1294.  1500.
   363.  9030.  7072. 11805.  1012.  3324.  7999.  4587.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
     0

In [15]:
train_dataset=generate_data
train_lables = np.array(data_frame["Label"])
print(train_dataset.shape)
print(train_lables.shape)

(12677, 256)
(12677,)


In [16]:
count_result = Counter(train_lables)
print("幽默：", count_result[1], " , 占比：", count_result[1] / len(train_lables))
print("非幽默：", count_result[0], " , 占比：", count_result[0] / len(train_lables))

幽默： 3646  , 占比： 0.2876074781099629
非幽默： 9031  , 占比： 0.7123925218900371


In [17]:
class Validation_Metrics(Callback):

    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(
            validate_data_x))).round()
        val_targ = validate_data_y
        _val_f1 = f1_score(val_targ, val_predict, pos_label=1)
        _val_recall = recall_score(val_targ, val_predict, pos_label=1)
        _val_precision = precision_score(val_targ, val_predict, pos_label=1)
        _val_acc = accuracy_score(val_targ, val_predict)

        print('\nf1:', _val_f1, ', recall:', _val_recall, ', precision:', _val_precision, ', accuracy:', _val_acc)

        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        return

In [None]:
model = keras.Sequential()
model.add(Embedding(vcab_size + 1, 256, input_length=max_length, mask_zero=True))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

# test_data_df, _, _, _ = prapare_without_tokenize(train_dev_data_path + "cn_dev.csv")
# test_generate_data = generate_data(test_data_df, max_length)

validation_metrics = Validation_Metrics()
train_data_x, validate_data_x, train_data_y, validate_data_y = train_test_split(train_dataset, train_lables,
                                                                                    test_size=0.2, random_state=24)
history = model.fit(train_data_x, train_data_y, epochs=20, batch_size=256, callbacks=[validation_metrics],
                    validation_data=(validate_data_x, validate_data_y))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 256, 256)          3539968   
_________________________________________________________________
lstm (LSTM)                  (None, 256, 256)          525312    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 4,270,721
Trainable params: 4,270,721
Non-trainable params: 0
_________________________________________________________________
Train on 10141 samples, validate on 2536 samples
Epoch 1/20