In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import codecs
import jieba
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,f1_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedShuffleSplit

random.seed(14)

In [2]:
#读文件
data = pd.read_csv('/Users/mac/Desktop/2022春课件/时序/商品清单（new）.csv')
#标签映射为整数
listType = data['一级分类'].unique()
com_map = dict.fromkeys(listType)
for i in range(len(listType)):
    com_map[listType[i]] = i
data['一级分类_整数'] = data['一级分类'].map(com_map)

listType = data['二级分类'].unique()
com_map = dict.fromkeys(listType)
for i in range(len(listType)):
    com_map[listType[i]] = i
data['二级分类_整数'] = data['二级分类'].map(com_map)

listType = data['三级分类'].unique()
com_map = dict.fromkeys(listType)
for i in range(len(listType)):
    com_map[listType[i]] = i
data['三级分类_整数'] = data['三级分类'].map(com_map)
#去停用词
stopkey = [w.strip() for w in codecs.open('/Users/mac/Downloads/呆萌的停用词表.txt', 'r').readlines()]
data['segment'] = data['商品名称'].apply(lambda x:jieba.lcut(x))
for i in range(len(data)):
    words = data['segment'][i].copy()
    for x in words:
        if x in stopkey:
            data['segment'][i].remove(x)
data.head()

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/_t/zp2z4_4x4gg78j4gh86h6z5m0000gn/T/jieba.cache
Loading model cost 0.847 seconds.
Prefix dict has been built successfully.


Unnamed: 0,商品名称,一级分类,二级分类,三级分类,一级分类_整数,二级分类_整数,三级分类_整数,segment
0,**N蓝妹啤酒易拉罐3,酒类,啤酒,啤酒,0,0,0,"[N, 蓝妹, 啤酒, 易拉罐]"
1,农夫水溶C100青皮桔445ml,饮料,果蔬汁,果蔬饮料,1,1,1,"[农夫, 水溶, C100, 青皮, 桔, 445ml]"
2,N七匹狼（软灰）,烟类,香烟,软盒香烟,2,2,2,"[N, 七匹狼, 软灰]"
3,N七匹狼（软红）,烟类,香烟,软盒香烟,2,2,2,"[N, 七匹狼, 软红]"
4,农夫山泉水550ml,饮料,水,矿泉水,1,3,3,"[农夫山泉, 水, 550ml]"


In [3]:
# split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=123)
# for train_index, test_index in split.split(data, data['三级分类_整数']):
#     train_set = data[data.index.isin(train_index)]
#     test_set = data[data.index.isin(test_index)]


from tensorflow import keras
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(data['segment'])
vocab=tokenizer.word_index
X = data['segment']
maxlen = 30
Y = data['三级分类_整数']
kinds = len(Y.unique())
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.7, random_state = 123)
X_train_word_ids=tokenizer.texts_to_sequences(X_train)
X_test_word_ids = tokenizer.texts_to_sequences(X_test)
#将超过固定值的部分截掉，不足的在最前面用0填充
X_train_padded_seqs=keras.preprocessing.sequence.pad_sequences(X_train_word_ids, maxlen = maxlen)
X_test_padded_seqs=keras.preprocessing.sequence.pad_sequences(X_test_word_ids,  maxlen = maxlen)
#将标签转换为one-hot编码
one_hot_labels = keras.utils.to_categorical(Y_train, num_classes=kinds)

#### 1、focal_loss损失函数

In [4]:
import tensorflow as tf
def focal_loss(y_true, y_pred):
    gamma = 2.0
    alpha = 0.25
    pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
    pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
    return -keras.backend.sum(alpha * keras.backend.pow(1. - pt_1, gamma) * keras.backend.log(pt_1))-keras.backend.sum((1-alpha) * keras.backend.pow( pt_0, gamma) * keras.backend.log(1. - pt_0))

In [5]:
main_input = keras.layers.Input(shape=(maxlen,), dtype='float64')
# 词嵌入（使用预训练的词向量）
embedder = keras.layers.Embedding(len(vocab) + 1, 300, input_length=maxlen)
embed = embedder(main_input)
# 词窗大小分别为3,4,5
cnn1 = keras.layers.Conv1D(128, 2, padding='valid', strides=1, activation='relu')(embed)
cnn1 = keras.layers.MaxPooling1D(pool_size=29)(cnn1)
cnn2 = keras.layers.Conv1D(128, 3, padding='valid', strides=1, activation='relu')(embed)
cnn2 = keras.layers.MaxPooling1D(pool_size=28)(cnn2)
cnn3 = keras.layers.Conv1D(64, 4, padding='valid', strides=1, activation='relu')(embed)
cnn3 = keras.layers.MaxPooling1D(pool_size=27)(cnn3)
# 合并三个模型的输出向量
cnn = keras.layers.concatenate([cnn1, cnn2, cnn3], axis=-1)
flat = keras.layers.Flatten()(cnn)
drop = keras.layers.Dropout(0.3)(flat)
main_output = keras.layers.Dense(kinds, activation='softmax')(drop)
modelCNN_loss = keras.models.Model(inputs=main_input, outputs=main_output)
modelCNN_loss.compile(loss=[focal_loss], optimizer='adam', metrics=['accuracy'], lr=0.0001)
modelCNN_loss.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 30)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 30, 300)      3864600     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 29, 128)      76928       embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 28, 128)      115328      embedding[0][0]                  
______________________________________________________________________________________________

In [6]:
modelCNN_loss.fit(X_train_padded_seqs, one_hot_labels,  epochs=10, batch_size=200, validation_split = 0.2)

Train on 8383 samples, validate on 2096 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x153c5d950>

In [7]:
result = modelCNN_loss.predict(X_test_padded_seqs)  # 预测样本属于每个类别的概率
Y_predict = np.argmax(result, axis=1)  # 获得最大概率对应的标签
print('准确率', accuracy_score(Y_test, Y_predict))
print('平均f1-score:', f1_score(Y_test, Y_predict, average='weighted'))
print(classification_report(Y_predict,Y_test))

准确率 0.776046304541407
平均f1-score: 0.7777820684188397
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        52
           1       0.80      0.74      0.77        80
           2       0.79      0.79      0.79        24
           3       0.78      1.00      0.88        18
           4       0.93      0.94      0.94       158
           5       0.86      0.85      0.85        85
           6       0.81      0.77      0.79       140
           7       0.84      0.67      0.74        39
           8       0.29      0.64      0.40        11
           9       0.93      0.83      0.88       119
          10       0.88      0.90      0.89        31
          11       0.81      0.72      0.76       141
          12       0.74      0.86      0.79        29
          13       0.84      0.86      0.85        50
          14       0.75      0.75      0.75         4
          15       0.54      0.73      0.62        26
          16       0.89     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### 2、BorderlineSMOTE

In [8]:
#重复过少的样本，使其不少于6个，以便进行后续SMOTE
while(min(data['三级分类_整数'].value_counts())<10):
    data = data.append(data[data['三级分类_整数'].isin(data['三级分类_整数'].value_counts()[data['三级分类_整数'].value_counts()<10].index)])
X = data['segment']
Y = data['三级分类_整数']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.7, random_state = 123)
X_train_word_ids=tokenizer.texts_to_sequences(X_train)
X_test_word_ids = tokenizer.texts_to_sequences(X_test)
#将超过固定值的部分截掉，不足的在最前面用0填充
X_train_padded_seqs=keras.preprocessing.sequence.pad_sequences(X_train_word_ids, maxlen = maxlen)
X_test_padded_seqs=keras.preprocessing.sequence.pad_sequences(X_test_word_ids,  maxlen = maxlen)
#将标签转换为one-hot编码
one_hot_labels = keras.utils.to_categorical(Y_train, num_classes=kinds)

Y_dict = dict(Y_train.value_counts())
for key in Y_dict:
    if(Y_dict[key]<50):
        Y_dict[key] = 50
X_resample, Y_resample = SMOTE(random_state = 123, k_neighbors=2, sampling_strategy = Y_dict).fit_resample(X_train_padded_seqs, one_hot_labels)
X_train_padded_seqs.shape, one_hot_labels.shape, X_resample.shape, Y_resample.shape

((10567, 30), (10567, 133), (12586, 30), (12586, 133))

In [9]:
main_input = keras.layers.Input(shape=(maxlen,), dtype='float64')
# 词嵌入（使用预训练的词向量）
embedder = keras.layers.Embedding(len(vocab) + 1, 300, input_length=maxlen)
embed = embedder(main_input)
# 词窗大小分别为3,4,5
cnn1 = keras.layers.Conv1D(128, 2, padding='valid', strides=1, activation='relu')(embed)
cnn1 = keras.layers.MaxPooling1D(pool_size=29)(cnn1)
cnn2 = keras.layers.Conv1D(128, 3, padding='valid', strides=1, activation='relu')(embed)
cnn2 = keras.layers.MaxPooling1D(pool_size=28)(cnn2)
cnn3 = keras.layers.Conv1D(128, 4, padding='valid', strides=1, activation='relu')(embed)
cnn3 = keras.layers.MaxPooling1D(pool_size=27)(cnn3)
# 合并三个模型的输出向量
cnn = keras.layers.concatenate([cnn1, cnn2, cnn3], axis=-1)
flat = keras.layers.Flatten()(cnn)
drop = keras.layers.Dropout(0.3)(flat)
main_output = keras.layers.Dense(kinds, activation='softmax')(drop)
modelCNN_balence = keras.models.Model(inputs=main_input, outputs=main_output)
modelCNN_balence.compile(loss=[focal_loss], optimizer='adam', metrics=['accuracy'], lr=0.0001)
modelCNN_balence.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 30)]         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 30, 300)      3864600     input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 29, 128)      76928       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 28, 128)      115328      embedding_1[0][0]                
____________________________________________________________________________________________

In [10]:
modelCNN_balence.fit(X_resample, Y_resample,  epochs=10, batch_size=200, validation_split = 0.2)

Train on 10068 samples, validate on 2518 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x15a77b310>

In [11]:
result = modelCNN_balence.predict(X_test_padded_seqs)  # 预测样本属于每个类别的概率
Y_predict = np.argmax(result, axis=1)  # 获得最大概率对应的标签
print('测试准确率', accuracy_score(Y_test, Y_predict))
print('平均f1-score:', f1_score(Y_test, Y_predict, average='weighted'))
print(classification_report(Y_predict,Y_test))

测试准确率 0.8
平均f1-score: 0.7991523026600292
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        56
           1       0.80      0.80      0.80        79
           2       0.77      0.95      0.85        21
           3       0.82      1.00      0.90        28
           4       0.97      0.90      0.94       173
           5       0.89      0.90      0.89        89
           6       0.82      0.87      0.84       121
           7       0.81      0.69      0.75        32
           8       0.32      0.88      0.47         8
           9       0.90      0.85      0.88       107
          10       0.91      0.94      0.92        32
          11       0.81      0.70      0.75       149
          12       0.91      0.85      0.88        34
          13       0.87      0.85      0.86        61
          14       0.80      0.80      0.80         5
          15       0.76      0.70      0.73        37
          16       0.96      0.91      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


##### 寻找最优的上采样数

In [None]:
for i in [60, 65, 70]:
    Y_dict = dict(Y_train.value_counts())
    for key in Y_dict:
        if(Y_dict[key]<i):
            Y_dict[key] = i
    X_resample, Y_resample = SMOTE(random_state = 123, k_neighbors=2, sampling_strategy = Y_dict).fit_resample(X_train_padded_seqs, one_hot_labels)
    main_input = keras.layers.Input(shape=(maxlen,), dtype='float64')
    # 词嵌入（使用预训练的词向量）
    embedder = keras.layers.Embedding(len(vocab) + 1, 300, input_length=maxlen)
    embed = embedder(main_input)
    # 词窗大小分别为3,4,5
    cnn1 = keras.layers.Conv1D(128, 2, padding='valid', strides=1, activation='relu')(embed)
    cnn1 = keras.layers.MaxPooling1D(pool_size=29)(cnn1)
    cnn2 = keras.layers.Conv1D(128, 3, padding='valid', strides=1, activation='relu')(embed)
    cnn2 = keras.layers.MaxPooling1D(pool_size=28)(cnn2)
    cnn3 = keras.layers.Conv1D(128, 4, padding='valid', strides=1, activation='relu')(embed)
    cnn3 = keras.layers.MaxPooling1D(pool_size=27)(cnn3)
    # 合并三个模型的输出向量
    cnn = keras.layers.concatenate([cnn1, cnn2, cnn3], axis=-1)
    flat = keras.layers.Flatten()(cnn)
    drop = keras.layers.Dropout(0.3)(flat)
    main_output = keras.layers.Dense(kinds, activation='softmax')(drop)
    modelCNN_balence2 = keras.models.Model(inputs=main_input, outputs=main_output)
    modelCNN_balence2.compile(loss=[focal_loss], optimizer='adam', metrics=['accuracy'], lr=0.0001)
    modelCNN_balence2.fit(X_resample, Y_resample,  epochs=10, batch_size=200, validation_split = 0.2)
    result = modelCNN_balence2.predict(X_test_padded_seqs)  # 预测样本属于每个类别的概率
    Y_predict = np.argmax(result, axis=1)  # 获得最大概率对应的标签
    print('i = ',i)
    print('测试准确率', accuracy_score(Y_test, Y_predict))
    print('平均f1-score:', f1_score(Y_test, Y_predict, average='weighted'))
#60最好

###### 60最好

In [17]:
Y_dict = dict(Y_train.value_counts())
for key in Y_dict:
    if(Y_dict[key]<60):
        Y_dict[key] = 60
X_resample, Y_resample = SMOTE(random_state = 123, k_neighbors=2, sampling_strategy = Y_dict).fit_resample(X_train_padded_seqs, one_hot_labels)
X_train_padded_seqs.shape, one_hot_labels.shape, X_resample.shape, Y_resample.shape

((10567, 30), (10567, 133), (13330, 30), (13330, 133))

In [18]:
main_input = keras.layers.Input(shape=(maxlen,), dtype='float64')
# 词嵌入（使用预训练的词向量）
embedder = keras.layers.Embedding(len(vocab) + 1, 300, input_length=maxlen)
embed = embedder(main_input)
# 词窗大小分别为3,4,5
cnn1 = keras.layers.Conv1D(128, 2, padding='valid', strides=1, activation='relu')(embed)
cnn1 = keras.layers.MaxPooling1D(pool_size=29)(cnn1)
cnn2 = keras.layers.Conv1D(128, 3, padding='valid', strides=1, activation='relu')(embed)
cnn2 = keras.layers.MaxPooling1D(pool_size=28)(cnn2)
cnn3 = keras.layers.Conv1D(128, 4, padding='valid', strides=1, activation='relu')(embed)
cnn3 = keras.layers.MaxPooling1D(pool_size=27)(cnn3)
# 合并三个模型的输出向量
cnn = keras.layers.concatenate([cnn1, cnn2, cnn3], axis=-1)
flat = keras.layers.Flatten()(cnn)
drop = keras.layers.Dropout(0.3)(flat)
main_output = keras.layers.Dense(kinds, activation='softmax')(drop)
modelCNN_balence = keras.models.Model(inputs=main_input, outputs=main_output)
modelCNN_balence.compile(loss=[focal_loss], optimizer='adam', metrics=['accuracy'], lr=0.0001)
modelCNN_balence.summary()

Model: "model_20"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_21 (InputLayer)           [(None, 30)]         0                                            
__________________________________________________________________________________________________
embedding_20 (Embedding)        (None, 30, 300)      3864600     input_21[0][0]                   
__________________________________________________________________________________________________
conv1d_60 (Conv1D)              (None, 29, 128)      76928       embedding_20[0][0]               
__________________________________________________________________________________________________
conv1d_61 (Conv1D)              (None, 28, 128)      115328      embedding_20[0][0]               
___________________________________________________________________________________________

In [19]:
modelCNN_balence.fit(X_resample, Y_resample,  epochs=10, batch_size=200, validation_split = 0.2)

Train on 10664 samples, validate on 2666 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1d3fcfed0>

In [20]:
result = modelCNN_balence.predict(X_test_padded_seqs)  # 预测样本属于每个类别的概率
Y_predict = np.argmax(result, axis=1)  # 获得最大概率对应的标签
print('测试准确率', accuracy_score(Y_test, Y_predict))
print('平均f1-score:', f1_score(Y_test, Y_predict, average='weighted'))
print(classification_report(Y_predict,Y_test))

测试准确率 0.8026490066225166
平均f1-score: 0.8016795808863912
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        57
           1       0.85      0.80      0.82        84
           2       0.81      0.95      0.88        22
           3       0.88      1.00      0.94        30
           4       0.97      0.92      0.95       170
           5       0.91      0.92      0.92        89
           6       0.85      0.80      0.82       137
           7       0.78      0.70      0.74        30
           8       0.36      0.80      0.50        10
           9       0.93      0.87      0.90       108
          10       0.91      0.94      0.92        32
          11       0.77      0.78      0.77       127
          12       0.81      0.79      0.80        33
          13       0.88      0.87      0.88        61
          14       0.80      0.29      0.42        14
          15       0.74      0.81      0.77        31
          16       0.96  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
