In [1]:
from keras.layers import Dropout, Dense,Input,Embedding,Flatten, AveragePooling2D, Conv2D,Reshape, Add
from keras.models import Sequential,Model
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import os

MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/диплом'

def loadData_Tokenizer(X_train, X_test,MAX_NB_WORDS=75000,MAX_SEQUENCE_LENGTH=1000):
    np.random.seed(7)
    text = np.concatenate((X_train, X_test), axis=0)
    text = np.array(text)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    indices = np.arange(text.shape[0])
    # np.random.shuffle(indices)
    text = text[indices]
    print(text.shape)
    X_train = text[0:len(X_train), ]
    X_test = text[len(X_train):, ]
    embeddings_index = {}
    f = open(os.path.join(DATA_DIR, 'glove.6B.100d.txt'), encoding="utf8") ## GloVe file which could be download https://nlp.stanford.edu/projects/glove/
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except:
            pass
        embeddings_index[word] = coefs
    f.close()
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index,embeddings_index)



def Build_Model_CNN_Text(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=1000, EMBEDDING_DIM=100, dropout=0.5):

    """
        def buildModel_CNN(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
        word_index in word index ,
        embeddings_index is embeddings index, look at data_helper.py
        nClasses is number of classes,
        MAX_SEQUENCE_LENGTH is maximum lenght of text sequences,
        EMBEDDING_DIM is an int value for dimention of word embedding look at data_helper.py
    """

    model = Sequential()
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[i]) !=len(embedding_vector):
                print("could not broadcast input array from shape",str(len(embedding_matrix[i])),
                                 "into shape",str(len(embedding_vector))," Please make sure your"
                                 " EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
                exit(1)

            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)

    # applying a more complex convolutional approach
    convs = []
    filter_sizes = []
    layer = 5
    print("Filter  ",layer)
    for fl in range(0,layer):
        filter_sizes.append((fl+2,fl+2))

    node = 128
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    emb = Reshape((1000,10, 10), input_shape=(1000,100))(embedded_sequences)

    for fsz in filter_sizes:
        l_conv = Conv2D(node, padding="same", kernel_size=fsz, activation='relu')(emb)
        l_pool = AveragePooling2D(pool_size=(5,1), padding="same")(l_conv)
        #l_pool = Dropout(0.25)(l_pool)
        convs.append(l_pool)

    l_merge = Add()(convs)
    l_cov1 = Conv2D(node, (5,5), padding="same", activation='relu')(l_merge)
    l_cov1 = AveragePooling2D(pool_size=(5,2), padding="same")(l_cov1)
    l_cov2 = Conv2D(node, (5,5), padding="same", activation='relu')(l_cov1)
    l_pool2 = AveragePooling2D(pool_size=(5,2), padding="same")(l_cov2)
    l_cov2 = Dropout(dropout)(l_pool2)
    l_flat = Flatten()(l_cov2)
    l_dense = Dense(128, activation='relu')(l_flat)
    l_dense = Dropout(dropout)(l_dense)

    preds = Dense(nclasses, activation='softmax')(l_dense)
    model = Model(sequence_input, preds)

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])



    return model

In [2]:
from __future__ import print_function
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re
from bs4 import BeautifulSoup
import sys
import os

os.environ['KERAS_BACKEND'] = 'theano'

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Add
from keras.models import Model

MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/диплом'

def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

data_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/диплом/export.csv', sep=',')
print(data_train.shape)

(2716, 7)


In [3]:
data = pd.DataFrame()
data['content'] = data_train['Содержание сообщения']
data['label'] = data_train['Оригинал сообщения']
data.dropna() 
data.head()

Unnamed: 0,content,label
0,"Действительно,кто эту провозгласил примадонной...",0
1,"Схем много,спасибо,но по всем связать пенсии н...",0
2,"И найдутся же дураки,кто это чмо купит.",0
3,"Из бисера делаю эти подснежники,а хотелось бы ...",0
4,"Да пристрелить уже этого педика ,из ума выжил ...",0


In [4]:
texts = []
labels = []

for idx in range(data.content.shape[0]):
    try:
        if len(data.label[idx]) == 1:
            text = BeautifulSoup(data.content[idx])
            # print(data.label[idx])
            texts.append(clean_str(text.get_text()))
            labels.append(data.label[idx])
    except:
        print(idx)
    

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

379


  text = BeautifulSoup(data.content[idx])


2626
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2705
Found 13739 unique tokens.




In [5]:
data_pad = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print(('Shape of data tensor:', data_pad.shape))
print(('Shape of label tensor:', labels.shape))

indices = np.arange(data_pad.shape[0])
np.random.shuffle(indices)
data_pad = data_pad[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data_pad.shape[0])

x_train = data_pad[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data_pad[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Number of positive and negative reviews in traing and validation set ')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

('Shape of data tensor:', (2694, 1000))
('Shape of label tensor:', (2694, 6))
Number of positive and negative reviews in traing and validation set 
[1943.   27.  165.   10.    3.    8.]
[472.   3.  54.   2.   1.   6.]


In [6]:
X_train_Glove, X_test_Glove, word_index, embeddings_index = loadData_Tokenizer(x_train, x_val)


model_CNN = Build_Model_CNN_Text(word_index,embeddings_index, 6)
# print(len(X_train_Glove))

model_CNN.summary()

model_CNN.fit(X_train_Glove, y_train,
                              validation_data=(X_test_Glove, y_val),
                              epochs=1000,
                              batch_size=128,
                              verbose=2)

predicted = model_CNN.predict(X_test_Glove)

predicted = np.argmax(predicted, axis=1)


print(metrics.classification_report(y_val, predicted))

AttributeError: ignored