In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [0]:

# Read the dataset CSV from the mounted Drive folder into a DataFrame.
clean_mr_texts = pd.read_csv(
    'drive/My Drive/ML_miniproject_4/MR(2000).csv',
)


In [0]:
# Preview the first ten rows whose target equals 1.
clean_mr_texts.loc[clean_mr_texts["target"] == 1].head(10)

In [0]:
# Class balance: how many rows carry each target value.
clean_mr_texts["target"].value_counts()

In [0]:
# Discard rows that contain missing values and renumber the index from 0,
# then report the resulting column dtypes / non-null counts.
clean_mr_texts = clean_mr_texts.dropna().reset_index(drop=True)
clean_mr_texts.info()

In [0]:
# Model input is the `text` column; the label is the `target` column.
x, y = clean_mr_texts["text"], clean_mr_texts["target"]

In [0]:
from sklearn.model_selection import train_test_split
SEED = 2000

# Step 1: hold back 2% of the rows as a combined validation+test pool.
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(
    x,
    y,
    test_size=.02,
    random_state=SEED,
)

# Step 2: cut that pool in half -> one validation set, one test set.
x_validation, x_test, y_validation, y_test = train_test_split(
    x_validation_and_test,
    y_validation_and_test,
    test_size=.5,
    random_state=SEED,
)

In [0]:
def _describe_split(name, texts, labels):
    """Build the one-line size / class-balance summary for one data split.

    name   -- split name placed at the start of the sentence ("Train", ...).
    texts  -- the split's feature Series.
    labels -- the label Series aligned with ``texts`` (0 = negative, 1 = positive).

    Returns the formatted summary string. Raises ZeroDivisionError when the
    split is empty, exactly as the original inline expressions did.
    """
    total = len(texts)
    negative_pct = (len(texts[labels == 0]) / (total * 1.)) * 100
    positive_pct = (len(texts[labels == 1]) / (total * 1.)) * 100
    return "{0} set has total {1} entries with {2:.2f}% negative, {3:.2f}% positive".format(
        name, total, negative_pct, positive_pct)


# One summary line per split, in the same order and wording as before.
for _split_name, _split_x, _split_y in (("Train", x_train, y_train),
                                        ("Validation", x_validation, y_validation),
                                        ("Test", x_test, y_test)):
    print(_describe_split(_split_name, _split_x, _split_y))

In [0]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils

In [0]:
def labelize_ug(comments, label):
    """Wrap every text in ``comments`` as a ``TaggedDocument``.

    The words of each document are the whitespace-split tokens of the text.
    Its single tag is ``label`` followed by ``_`` and the text's index label
    (for example ``all_17``).

    comments -- pandas Series of strings; ``comments.index`` supplies the ids.
    label    -- string prefix shared by every tag.

    Returns a list of ``TaggedDocument`` objects in the order of ``comments``.
    """
    return [
        TaggedDocument(text.split(), [label + '_%s' % idx])
        for idx, text in zip(comments.index, comments)
    ]

In [0]:
# Stack the three splits back into one Series and tag every text, so the
# word2vec models below see the complete corpus.
all_x = pd.concat((x_train, x_validation, x_test))
all_x_w2v = labelize_ug(comments=all_x, label='all')

In [0]:
cores = multiprocessing.cpu_count()

# CBOW variant (sg=0). alpha and min_alpha start equal because the training
# cell steps the learning rate by hand.
model_ug_cbow = Word2Vec(
    sg=0,
    size=100,
    negative=5,
    window=2,
    min_count=2,
    workers=cores,
    alpha=0.065,
    min_alpha=0.065,
)
model_ug_cbow.build_vocab([doc.words for doc in tqdm(all_x_w2v)])

In [0]:
%%time
# Twenty single-epoch passes over a freshly shuffled corpus; after each pass
# the learning rate is lowered by 0.002 and min_alpha is pinned to the same value.
for epoch in range(20):
    shuffled_sentences = utils.shuffle([doc.words for doc in tqdm(all_x_w2v)])
    model_ug_cbow.train(shuffled_sentences, total_examples=len(all_x_w2v), epochs=1)
    model_ug_cbow.alpha = model_ug_cbow.alpha - 0.002
    model_ug_cbow.min_alpha = model_ug_cbow.alpha

In [0]:
# Skip-gram variant (sg=1) with otherwise the same hyper-parameters as the
# CBOW model.
model_ug_sg = Word2Vec(
    sg=1,
    size=100,
    negative=5,
    window=2,
    min_count=2,
    workers=cores,
    alpha=0.065,
    min_alpha=0.065,
)
model_ug_sg.build_vocab([doc.words for doc in tqdm(all_x_w2v)])

In [0]:
%%time
# Twenty single-epoch passes over a freshly shuffled corpus; after each pass
# the learning rate is lowered by 0.002 and min_alpha is pinned to the same value.
for epoch in range(20):
    shuffled_sentences = utils.shuffle([doc.words for doc in tqdm(all_x_w2v)])
    model_ug_sg.train(shuffled_sentences, total_examples=len(all_x_w2v), epochs=1)
    model_ug_sg.alpha = model_ug_sg.alpha - 0.002
    model_ug_sg.min_alpha = model_ug_sg.alpha

In [0]:
# Persist both trained models to the Drive project folder.
for _w2v_model, _w2v_path in (
    (model_ug_cbow, 'drive/My Drive/ML_miniproject_4/w2v_model_ug_cbow.word2vec'),
    (model_ug_sg, 'drive/My Drive/ML_miniproject_4/w2v_model_ug_sg.word2vec'),
):
    _w2v_model.save(_w2v_path)

In [0]:
from gensim.models import KeyedVectors
from gensim.models.word2vec import Word2Vec

# These files were written with Word2Vec.save(), i.e. they contain full
# Word2Vec models (the cells below access `.wv` on them). Reload them with the
# matching Word2Vec.load() instead of going through KeyedVectors.load(), which
# is meant for files holding only the vectors.
model_ug_cbow = Word2Vec.load('drive/My Drive/ML_miniproject_4/w2v_model_ug_cbow.word2vec')
model_ug_sg = Word2Vec.load('drive/My Drive/ML_miniproject_4/w2v_model_ug_sg.word2vec')

In [17]:
# Number of words in the CBOW model's vocabulary.
len(model_ug_cbow.wv.vocab)

27139

In [18]:
# For every word in the CBOW vocabulary, join its CBOW vector and its
# skip-gram vector end to end into one combined vector.
embeddings_index = {
    word: np.append(model_ug_cbow.wv[word], model_ug_sg.wv[word])
    for word in model_ug_cbow.wv.vocab.keys()
}
print('Found %s word vectors.' % len(embeddings_index))

Found 27139 word vectors.


In [19]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fit the word -> integer-id mapping on the training texts only, then encode
# those same texts as lists of ids.
tokenizer = Tokenizer(
    num_words=100000,
)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

Using TensorFlow backend.


In [20]:
# Size of the word -> index mapping the tokenizer built from x_train.
len(tokenizer.word_index)

46062

In [0]:
# Show the first five raw training texts. The loop variable is deliberately
# not called `x`: that name holds the full feature Series used by the
# train/test split cell, and overwriting it would break a re-run of that cell.
for sample_text in x_train[:5]:
    print(sample_text)

In [0]:
# Integer-encoded form of the first five training texts.
sequences[:5]

In [0]:
# Whitespace-token count of every training text. A comprehension variable is
# used so the notebook-level `x` (the feature Series) is not overwritten.
length = [len(text.split()) for text in x_train]

In [24]:
# Longest training text, measured in whitespace-separated tokens.
max(length)

1380

In [76]:
# Bring every encoded training text to a fixed width of 1380 entries (the
# max(length) value displayed by the previous cell) so they form one 2-D array.
x_train_seq = pad_sequences(sequences, maxlen=1380)
print('Shape of data tensor:', x_train_seq.shape)

Shape of data tensor: (1960, 1380)


In [0]:
# First five rows of the padded training array.
x_train_seq[:5]

In [0]:
# Encode the validation texts with the tokenizer fitted on x_train and pad
# them to the same 1380-entry width as the training array.
sequences_val = tokenizer.texts_to_sequences(x_validation)
x_val_seq = pad_sequences(sequences_val, maxlen=1380)

In [0]:
num_words = 100000

# Row i holds the combined word2vec vector of the word the tokenizer mapped to
# id i. Rows stay all-zero for ids without a word2vec vector.
embedding_matrix = np.zeros((num_words, 200))
for token, token_id in tokenizer.word_index.items():
    if token_id < num_words:
        vector = embeddings_index.get(token)
        if vector is not None:
            embedding_matrix[token_id] = vector

In [0]:
# Sanity check: the matrix row for the word 'my' must equal its word2vec
# vector. The row is looked up through the tokenizer instead of assuming 'my'
# was assigned id 5, which depends on word frequencies in the training texts.
np.array_equal(embedding_matrix[tokenizer.word_index['my']], embeddings_index.get('my'))

In [0]:
seed = 7

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.utils.vis_utils import plot_model
from keras.layers import Conv1D, GlobalMaxPooling1D

In [0]:
# Dense baseline 1: word2vec-initialised embedding layer kept frozen
# (trainable=False), flattened and fed to a 256-unit hidden layer.
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=1380, trainable=False)
model_ptw2v = Sequential([
    e,
    Flatten(),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_ptw2v.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_ptw2v.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model_ptw2v.summary()
plot_model(model_ptw2v, show_shapes=True, to_file='ptw2v_1.png')

In [0]:
from keras.utils.vis_utils import plot_model
# Dense baseline 2: same architecture, but the embedding layer is given no
# pre-trained weights and is learned from scratch.
e = Embedding(100000, 200, input_length=1380)
model_ptw2v = Sequential([
    e,
    Flatten(),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_ptw2v.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_ptw2v.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model_ptw2v.summary()
plot_model(model_ptw2v, show_shapes=True, to_file='ptw2v_2.png')

In [0]:
# Dense baseline 3: word2vec-initialised embedding layer that is fine-tuned
# during training (trainable=True).
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=1380, trainable=True)
model_ptw2v = Sequential([
    e,
    Flatten(),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_ptw2v.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_ptw2v.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model_ptw2v.summary()
plot_model(model_ptw2v, show_shapes=True, to_file='ptw2v_3.png')

## **Convolutional Neural Network**

In [0]:
from keras.layers import Conv1D, GlobalMaxPooling1D

In [0]:
# Shape probe (no training): embedding followed by one width-2 convolution,
# built only to inspect the layer output shapes in the summary.
e = Embedding(100000, 200, input_length=1380)
structure_test = Sequential([
    e,
    Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1),
])
structure_test.summary()

In [0]:
# Shape probe (no training): the same embedding + width-2 convolution, now
# followed by global max pooling, to inspect the pooled output shape.
e = Embedding(100000, 200, input_length=1380)
structure_test = Sequential([
    e,
    Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
])
structure_test.summary()

In [0]:
# CNN 1: frozen word2vec embeddings -> width-2 convolution -> global max
# pooling -> 256-unit hidden layer -> sigmoid output.
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=1380, trainable=False)
model_cnn_01 = Sequential([
    e,
    Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_cnn_01.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn_01.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model_cnn_01.summary()
plot_model(model_cnn_01, show_shapes=True, to_file='model_cnn_01.png')

In [0]:
# CNN 2: same convolutional architecture, with the embedding layer learned
# from scratch (no pre-trained weights supplied).
e = Embedding(100000, 200, input_length=1380)
model_cnn_02 = Sequential([
    e,
    Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_cnn_02.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn_02.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model_cnn_02.summary()
plot_model(model_cnn_02, show_shapes=True, to_file='model_cnn_02.png')

In [0]:
# CNN 3: same convolutional architecture, with word2vec-initialised
# embeddings that are fine-tuned during training (trainable=True).
e = Embedding(100000, 200, weights=[embedding_matrix], input_length=1380, trainable=True)
model_cnn_03 = Sequential([
    e,
    Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_cnn_03.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn_03.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model_cnn_03.summary()
plot_model(model_cnn_03, show_shapes=True, to_file='model_cnn_03.png')

In [0]:
from keras.layers import Input, Dense, concatenate, Activation
from keras.models import Model
from keras.utils.vis_utils import plot_model

# Multi-branch CNN: one shared, fine-tuned embedding feeds three parallel
# convolutions of width 3, 4 and 5. Each is global-max-pooled and the pooled
# vectors are concatenated before the classifier head.
# Branch names match the kernel sizes actually used; they were previously
# labelled bigram/trigram/fourgram while using widths 3/4/5.
# NOTE(review): if widths 2/3/4 were intended, change kernel_size instead.
MR_input = Input(shape=(1380,), dtype='int32')

review_encoder = Embedding(100000, 200, weights=[embedding_matrix], input_length=1380, trainable=True)(MR_input)

trigram_branch = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(review_encoder)
trigram_branch = GlobalMaxPooling1D()(trigram_branch)

fourgram_branch = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(review_encoder)
fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)

fivegram_branch = Conv1D(filters=100, kernel_size=5, padding='valid', activation='relu', strides=1)(review_encoder)
fivegram_branch = GlobalMaxPooling1D()(fivegram_branch)

merged = concatenate([trigram_branch, fourgram_branch, fivegram_branch], axis=1)

# Classifier head: hidden layer, dropout, then a single sigmoid unit.
merged = Dense(256, activation='relu')(merged)
merged = Dropout(0.5)(merged)
merged = Dense(1)(merged)
output = Activation('sigmoid')(merged)
model = Model(inputs=[MR_input], outputs=[output])
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])


In [0]:
from keras.callbacks import ModelCheckpoint

# Checkpoint file names embed the epoch number and the validation accuracy.
# With save_best_only=True and mode='max', a file is written only when
# val_acc improves.
# NOTE(review): some Keras releases name this metric 'val_accuracy' — confirm
# against the installed version.
filepath = "CNN_best_weights.{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)

model.fit(
    x_train_seq,
    y_train,
    batch_size=50,
    epochs=5,
    validation_data=(x_val_seq, y_validation),
    callbacks=[checkpoint],
)
model.summary()
plot_model(model, show_shapes=True, to_file='model_final.png')

In [0]:
import glob
import os

from keras.models import load_model

# The checkpoint file name contains the epoch and val_acc of the run that
# produced it, so a hard-coded name only matches one particular training run.
# Checkpoints are written only when val_acc improves (save_best_only=True), so
# the most recently written file is the best model of the latest run.
checkpoint_paths = glob.glob('CNN_best_weights.*.hdf5')
if not checkpoint_paths:
    raise FileNotFoundError(
        "No 'CNN_best_weights.*.hdf5' checkpoint found; run the training cell first.")
best_checkpoint_path = max(checkpoint_paths, key=os.path.getmtime)

loaded_CNN_model = load_model(best_checkpoint_path)
loaded_CNN_model.evaluate(x=x_val_seq, y=y_validation)

Final Model Evaluation with Test Set

In [0]:
# Encode the test texts with the tokenizer fitted on x_train and pad them to
# the same 1380-entry width used for the training and validation arrays.
sequences_test = tokenizer.texts_to_sequences(x_test)
x_test_seq = pad_sequences(sequences_test, maxlen=1380)

In [0]:
# Evaluate the reloaded checkpoint model on the held-out test split.
loaded_CNN_model.evaluate(x=x_test_seq, y=y_test)