In [1]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, LSTM, GRU, Dropout, Reshape, Flatten, concatenate, Input, MaxPooling1D, Conv1D,GlobalAveragePooling1D, GlobalMaxPooling1D, Embedding
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
import numpy as np
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# Read the training CSV and discard every row that contains a missing value.
data = pd.read_csv('train.csv').dropna()

In [4]:
# Tokenize each TEXT entry into the substrings matched by the \w+ pattern.
tokenizer = RegexpTokenizer(r'\w+')
data['tokens'] = data['TEXT'].apply(tokenizer.tokenize)

# Build the two indicator columns from the binary `truth` label:
#   Pos == 1 exactly where truth == 1, Neg == 1 exactly where truth == 0.
# The previous per-row loop silently skipped any label other than 0/1, which
# left the lists shorter than the frame and surfaced later as an opaque
# length-mismatch error; validate up front so the failure names the culprit.
unexpected_labels = set(data['truth'].unique()) - {0, 1}
if unexpected_labels:
    raise ValueError("'truth' must contain only 0/1, found: %r" % (unexpected_labels,))

# Vectorized comparison instead of a Python loop; `pos`/`neg` stay plain lists
# of ints, as before.
pos = (data['truth'] == 1).astype(int).tolist()
neg = (data['truth'] == 0).astype(int).tolist()

data['Pos'] = pos
data['Neg'] = neg

In [5]:
# Keep only the listed columns (in this order), then preview the first rows.
kept_columns = ['TEXT', 'tokens', 'truth', 'Pos', 'Neg']
data = data[kept_columns]
data.head()

Unnamed: 0,TEXT,tokens,truth,Pos,Neg
0,hahaha wanted say comment made laugh,"[hahaha, wanted, say, comment, made, laugh]",0,0,1
1,feel ya,"[feel, ya]",0,0,1
2,kind superior power trajectory definitely laun...,"[kind, superior, power, trajectory, definitely...",0,0,1
3,delete script tampermonkey option puushvqwgafe...,"[delete, script, tampermonkey, option, puushvq...",0,0,1
4,end sequel trilogy,"[end, sequel, trilogy]",0,0,1


In [6]:
# Bind the whole frame as the training set (no copy is made).
# Note: the next cell rebinds data_train via train_test_split, superseding this.
data_train = data

In [6]:
# Split `data` into data_train / data_test, sending a 0.001 fraction of the
# rows to the test side; random_state=42 fixes the shuffle.
data_train, data_test = train_test_split(data, 
                                         test_size=0.001, 
                                         random_state=42)

In [7]:
# Walk the training token lists once, collecting the flat word stream and the
# per-row token counts side by side.
all_training_words = []
training_sentence_lengths = []
for tokens in data_train["tokens"]:
    all_training_words.extend(tokens)
    training_sentence_lengths.append(len(tokens))

# Distinct training words in sorted order.
TRAINING_VOCAB = sorted(set(all_training_words))

print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

2014356 words total, with a vocabulary size of 99367
Max sentence length is 3534


In [8]:
# Same statistics as for the training split, computed over the held-out rows.
all_test_words = []
test_sentence_lengths = []
for tokens in data_test["tokens"]:
    all_test_words.extend(tokens)
    test_sentence_lengths.append(len(tokens))

# Distinct test words in sorted order.
TEST_VOCAB = sorted(set(all_test_words))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

2174 words total, with a vocabulary size of 1301
Max sentence length is 220


In [31]:
import gensim

# EMBEDDING_DIM is otherwise first assigned in a later cell, so running the
# notebook top to bottom raised NameError here. Declare it in this cell with
# the same value (300) that the later constants cell assigns.
EMBEDDING_DIM = 300

# Train a Word2Vec model on every token list in `data` (min_count=1 keeps
# words that occur only once) and export the vectors in the text word2vec format.
model = gensim.models.Word2Vec(sentences=data['tokens'], size=EMBEDDING_DIM, window=5, workers=4, min_count=1)
filename = 'gensim_word2vec_model.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [8]:
# Load pretrained vectors from a binary word2vec-format file in the working
# directory; `word2vec` is then used for `word in word2vec` / `word2vec[word]` lookups.
word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [9]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Average the embedding vectors of the tokens in one document.

    Args:
        tokens_list: sequence of tokens to embed.
        vector: mapping that supports ``token in vector`` and ``vector[token]``.
        generate_missing: when True, a token absent from ``vector`` contributes
            a fresh ``np.random.rand(k)`` vector; otherwise it contributes zeros.
        k: length of the substitute vectors (and of the result for empty input).

    Returns:
        The element-wise mean over all tokens, or ``np.zeros(k)`` when
        ``tokens_list`` is empty.
    """
    if len(tokens_list) < 1:
        return np.zeros(k)

    # Pick the substitute generator once; both take the vector length.
    make_fallback = np.random.rand if generate_missing else np.zeros

    rows = []
    for token in tokens_list:
        rows.append(vector[token] if token in vector else make_fallback(k))

    return np.divide(np.sum(rows, axis=0), len(rows))

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    """Compute one averaged embedding per row of ``clean_comments``.

    Args:
        vectors: embedding lookup forwarded to ``get_average_word2vec``.
        clean_comments: frame with a ``'tokens'`` column of token sequences.
        generate_missing: forwarded to ``get_average_word2vec``.

    Returns:
        A list with one averaged vector per row, in row order.
    """
    return [
        get_average_word2vec(tokens, vectors, generate_missing=generate_missing)
        for tokens in clean_comments['tokens']
    ]

In [10]:
# One averaged word2vec vector per training row; with generate_missing=True,
# tokens absent from word2vec get a random vector instead of zeros.
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)

In [11]:
# Length every index sequence is padded/truncated to before entering the CNN.
# NOTE(review): the training-set maximum printed earlier is 3534 tokens, so
# 10000 pads far beyond it — confirm this is intended.
MAX_SEQUENCE_LENGTH = 10000
# Width of each word vector; used for the embedding matrix built below.
EMBEDDING_DIM = 300

In [12]:
# Keras' Tokenizer only emits word indices strictly below `num_words` (index 0
# is reserved), so passing the bare vocabulary size silently dropped the
# rarest word from every sequence. Add 1 so the full training vocabulary is kept.
train_texts = data_train["TEXT"].tolist()  # built once, reused for fit and transform
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB) + 1, lower=True, char_level=False)
tokenizer.fit_on_texts(train_texts)
training_sequences = tokenizer.texts_to_sequences(train_texts)

# word -> integer index mapping learned from the training texts.
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 99367 unique tokens.


In [13]:
# Bring every training sequence to exactly MAX_SEQUENCE_LENGTH entries.
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Row i holds the vector for the word whose tokenizer index is i; the extra
# row accounts for index 0, which no word in train_word_index is written to.
train_embedding_weights = np.zeros((len(train_word_index) + 1, EMBEDDING_DIM))
for word, index in train_word_index.items():
    if word in word2vec:
        train_embedding_weights[index, :] = word2vec[word]
    else:
        # Words missing from word2vec get a random vector.
        train_embedding_weights[index, :] = np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

(99368, 300)


In [24]:
# Encode the held-out texts with the current `tokenizer` and pad them to the
# same length as the training sequences.
test_sequences = tokenizer.texts_to_sequences(data_test["TEXT"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [14]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    """Build, compile and summarize a multi-kernel 1D CNN over word embeddings.

    Args:
        embeddings: initial weight matrix for the embedding layer.
        max_sequence_length: length of each input index sequence.
        num_words: number of rows of the embedding layer.
        embedding_dim: width of each embedding vector.
        labels_index: number of units in the final sigmoid layer.

    Returns:
        The compiled model (binary cross-entropy loss, Adam optimizer,
        'acc' metric). Its summary is printed as a side effect.
    """
    # The embedding is initialised from `embeddings` and kept frozen.
    frozen_embedding = Embedding(
        num_words,
        embedding_dim,
        weights=[embeddings],
        input_length=max_sequence_length,
        trainable=False,
    )

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = frozen_embedding(sequence_input)

    def conv_branch(kernel_size):
        """Convolve the embedded sequence with one kernel width, then global-max-pool."""
        feature_maps = Conv1D(filters=200, kernel_size=kernel_size, activation='relu')(embedded_sequences)
        return GlobalMaxPooling1D()(feature_maps)

    # One parallel branch per kernel width, concatenated along the feature axis.
    pooled_branches = [conv_branch(width) for width in (2, 3, 4, 5, 6)]
    merged = concatenate(pooled_branches, axis=1)

    hidden = Dropout(0.1)(merged)
    hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(0.2)(hidden)
    preds = Dense(labels_index, activation='sigmoid')(hidden)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()
    return model

In [15]:
# Targets are the two indicator columns: column 0 is 'Pos', column 1 is 'Neg'.
label_names = ['Pos', 'Neg']
y_train = data_train[label_names].values
# Inputs are the padded index sequences built earlier.
x_train = train_cnn_data
# y_tr is just another name for y_train (same array object).
y_tr = y_train

In [16]:
# Instantiate the CNN; the embedding gets one row per tokenizer index plus
# index 0, and the output layer gets one unit per label column.
model = ConvNet(
    embeddings=train_embedding_weights,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    num_words=len(train_word_index) + 1,
    embedding_dim=EMBEDDING_DIM,
    labels_index=len(label_names),
)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 10000)        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 10000, 300)   29810400    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 9999, 200)    120200      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 9998, 200)    180200      embedding_1[0][0]                
____________________________________________________________________________________________

In [None]:
# Training hyperparameters.
num_epochs = 40
batch_size = 32

# Train the model, passing validation_split=0.1 so part of the supplied arrays
# is used for validation; `hist` holds the object returned by fit().
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)

In [None]:
# Write the current `model` to 'CNN-new.h5' in the working directory.
model.save('CNN-new.h5')

In [42]:
from sklearn.metrics import classification_report

# Scores for the held-out rows: one row per sample, one column per label unit.
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)

# `y_pred_bool` was never assigned in the notebook (NameError on a fresh
# kernel) and `predictions` went unused. Derive the winning output unit per
# sample, the same way the single-sentence cell does.
y_pred_bool = np.argmax(predictions, axis=1)

# Output units are ordered ['Pos', 'Neg'] and Pos == 1 corresponds to
# truth == 1, so unit 0 maps to truth 1 and unit 1 maps to truth 0.
temp = [1 if x==0 else 0 for x in y_pred_bool]
print(classification_report(data_test['truth'], temp))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     11938
           1       0.42      0.12      0.19       685

    accuracy                           0.94     12623
   macro avg       0.69      0.56      0.58     12623
weighted avg       0.92      0.94      0.93     12623



In [33]:
# Encode one ad-hoc sentence with the tokenizer already fitted on the training
# texts (refitting a second, identical tokenizer here was redundant), and pad
# to MAX_SEQUENCE_LENGTH — the input length the model was built with. The
# previous maxlen=200 produced a shape the model's Input layer does not accept.
temprun = tokenizer.texts_to_sequences(['Darkness my old fren, ive come to talk to u agen'])
temprun = pad_sequences(temprun, maxlen=MAX_SEQUENCE_LENGTH)

predictions = model.predict(temprun, batch_size=1024, verbose=1)
# Index of the highest-scoring output unit (0 -> 'Pos', 1 -> 'Neg').
y_pred = np.argmax(predictions, axis=1)
y_pred



array([1])

In [119]:
# Wrap y_pred_bool in a single-column DataFrame (column 0) for inspection.
# NOTE(review): y_pred_bool is not assigned in any visible cell — presumably
# the predicted class indices; confirm where it comes from.
temp = pd.DataFrame(y_pred_bool)
# Count how many held-out rows carry each ground-truth label.
data_test['truth'].value_counts()
# Alternative kept from the original: counts of the values in column 0 of temp.
# NOTE(review): the stored output is labelled "Name: 0", which looks like it
# came from this commented-out line rather than the live expression — confirm.
#temp[0].value_counts()

1    12484
0      139
Name: 0, dtype: int64