In [12]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, LSTM, GRU, Dropout, Reshape, Flatten, concatenate, Input, MaxPooling1D, Conv1D,GlobalAveragePooling1D, GlobalMaxPooling1D, Embedding
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
import numpy as np
from sklearn.model_selection import train_test_split

In [13]:
data = pd.read_csv('train.csv')
data.dropna(inplace = True)

In [14]:
tokenizer = RegexpTokenizer(r'\w+')
data['tokens'] = data['TEXT'].apply(lambda x: tokenizer.tokenize(x))

pos = []
neg = []
for l in data.truth:
    if l == 0:
        pos.append(0)
        neg.append(1)
    elif l == 1:
        pos.append(1)
        neg.append(0)
        
data['Pos']= pos
data['Neg']= neg

In [15]:
data = data[['TEXT', 'tokens', 'truth', 'Pos', 'Neg']]
data.head()

Unnamed: 0,TEXT,tokens,truth,Pos,Neg
0,there pizza place couple door couple kid worki...,"[there, pizza, place, couple, door, couple, ki...",1,1,0
1,pour hot water pitcher americano shot pouring ...,"[pour, hot, water, pitcher, americano, shot, p...",1,1,0
2,ombre pink drink cool lime base think lot cust...,"[ombre, pink, drink, cool, lime, base, think, ...",1,1,0
3,tall americano double sleeve cup,"[tall, americano, double, sleeve, cup]",1,1,0
4,print blank receipt paper write closing list c...,"[print, blank, receipt, paper, write, closing,...",1,1,0


In [16]:
data_train, data_test = train_test_split(data, 
                                         test_size=0.10, 
                                         random_state=42)

In [17]:
all_training_words = [word for tokens in data_train["tokens"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in data_train["tokens"]]
TRAINING_VOCAB = sorted(list(set(all_training_words)))

print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

1885388 words total, with a vocabulary size of 105829
Max sentence length is 5768


In [18]:
all_test_words = [word for tokens in data_test["tokens"] for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in data_test["tokens"]]
TEST_VOCAB = sorted(list(set(all_test_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

207510 words total, with a vocabulary size of 27817
Max sentence length is 1622


In [31]:
import gensim

model = gensim.models.Word2Vec(sentences = data['tokens'], size=EMBEDDING_DIM, window=5, workers=4, min_count=1)
filename = 'gensim_word2vec_model.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [8]:
word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [19]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    embeddings = clean_comments['tokens'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                                generate_missing=generate_missing))
    return list(embeddings)

In [20]:
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)

In [21]:
MAX_SEQUENCE_LENGTH = 200
EMBEDDING_DIM = 300

In [22]:
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["TEXT"].tolist())
training_sequences = tokenizer.texts_to_sequences(data_train["TEXT"].tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 105829 unique tokens.


In [23]:
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

(105830, 300)


In [24]:
test_sequences = tokenizer.texts_to_sequences(data_test["TEXT"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [31]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    convs = []
    filter_sizes = [2,3,4,5,6]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=200, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)


    l_merge = concatenate(convs, axis=1)

    x = Dropout(0.1)(l_merge)  
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    
    
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [32]:
label_names = ['Pos', 'Neg']
y_train = data_train[label_names].values
x_train = train_cnn_data
y_tr = y_train

In [33]:
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, len(list(label_names)))

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     31749000    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 199, 200)     120200      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 198, 200)     180200      embedding_1[0][0]                
____________________________________________________________________________________________

In [35]:
num_epochs = 50
batch_size = 32

hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)

Train on 102238 samples, validate on 11360 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [36]:
model.save('CNN.h5')

In [37]:
from keras.models import load_model

hehe = load_model('CNN.h5')

In [38]:
from sklearn.metrics import classification_report

predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)

y_pred_bool = np.argmax(predictions, axis=1)

print(classification_report(data_test['truth'], y_pred_bool))

              precision    recall  f1-score   support

           0       0.58      0.01      0.02     11938
           1       0.05      0.88      0.09       685

    accuracy                           0.06     12623
   macro avg       0.31      0.44      0.06     12623
weighted avg       0.55      0.06      0.02     12623



In [42]:
temp = [1 if x==0 else 0 for x in y_pred_bool]
print(classification_report(data_test['truth'], temp))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     11938
           1       0.42      0.12      0.19       685

    accuracy                           0.94     12623
   macro avg       0.69      0.56      0.58     12623
weighted avg       0.92      0.94      0.93     12623



In [112]:
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["TEXT"].tolist())
temprun = tokenizer.texts_to_sequences(['WHAT THE FUCK BROTHAAAA'])
temprun = pad_sequences(temprun, maxlen=200)

predictions = model.predict(temprun, batch_size=1024, verbose=1)
y_pred = np.argmax(predictions, axis=1)
y_pred



array([1])

In [119]:
temp = pd.DataFrame(y_pred_bool)
data_test['truth'].value_counts()
#temp[0].value_counts()

1    12484
0      139
Name: 0, dtype: int64