# CNN implementation for Tweets Classification

As this notebook uses TensorFlow and needs a lot of computation power, it was run on Google Colab and we advise you to do the same.

You can find the GloVe embeddings here: https://nlp.stanford.edu/projects/glove/

## Imports

In [None]:
import numpy as np

import tensorflow as tf
device_name = tf.test.gpu_device_name()
gpu_found = (device_name == '/device:GPU:0')
if not gpu_found:
    # A missing GPU is only reported; the notebook keeps running.
    print('No GPU Found')
    
import re

from sklearn.model_selection import train_test_split             
from keras.preprocessing.text import Tokenizer                    
from keras.preprocessing.sequence import pad_sequences
from keras import layers
from keras.layers import Embedding
from keras.models import Sequential
from keras.models import Model
from keras.callbacks import ModelCheckpoint

## Helpers

In [None]:
def clean_kim_yoon(filename, unique):
    """Detach English contraction suffixes in a text file, one tweet per line.

    The cleaning is similar to Kim Yoon's
    (see https://github.com/yoonkim/CNN_sentence). Each known suffix is
    separated from its word by a space, e.g. "don't" -> "do n't" and
    "it's" -> "it 's". The result is written to a new file whose name is
    ``filename`` with ".txt" replaced by "_clean_kim_yoon.txt".

    Args:
        filename: path of the file to clean; it must contain ".txt".
        unique: if truthy, duplicate lines are removed. The first occurrence
            of each line is kept and the file order is preserved.

    Raises:
        ValueError: if ``filename`` does not contain ".txt". The output path
            would then equal the input path and the input would be truncated.
    """
    new_filename = filename.replace(".txt", "_clean_kim_yoon.txt")
    if new_filename == filename:
        raise ValueError(
            "filename must contain '.txt', otherwise the input file would be overwritten: "
            + repr(filename)
        )

    with open(filename, "r") as prev_f:
        tweets = prev_f.readlines()

    if unique:
        # dict.fromkeys() removes duplicates but, unlike set(), keeps the
        # original line order, so the output file is reproducible.
        tweets = list(dict.fromkeys(tweets))

    # (pattern, replacement) pairs, applied in this order to every line.
    contractions = (
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r"\'m", " 'm"),
    )

    with open(new_filename, "w") as new_f:
        for t in tweets:
            for pattern, replacement in contractions:
                t = re.sub(pattern, replacement, t)
            new_f.write(t)
    
def tweets_txt(file_name):
    """Read a text file and return its lines as a numpy array of tweets.

    Leading and trailing whitespace (including the newline) is stripped from
    every line.

    Args:
        file_name: path of a text file containing one tweet per line.

    Returns:
        A 1-D numpy array with one stripped string per line of the file.
    """
    # The context manager closes the file even if reading fails part-way.
    with open(file_name, "r") as f:
        return np.array([line.strip() for line in f])
    
def remove_not_in_words_list(tweets, words_list):
    """Remove from tweets all tokens that are not in words_list.

    Tokens are obtained by splitting each tweet on single spaces. The kept
    tokens are joined back with single spaces, in their original order.

    Args:
        tweets: iterable of tweet strings.
        words_list: the allowed words, as a numpy array or any sequence of
            strings.

    Returns:
        A list with one filtered string per input tweet. A tweet with no
        allowed token becomes the empty string.
    """
    # Build the lookup set once: membership is then a hash lookup per token
    # instead of a comparison against the whole vocabulary.
    vocabulary = set(np.asarray(words_list).ravel().tolist())
    return [
        " ".join(w for w in t.split(" ") if w in vocabulary)
        for t in tweets
    ]

def max_tweet_length(tweets):
    """Return the largest ``len()`` found among the given tweets.

    ``len()`` is applied to each element as it is. For tweets stored as
    strings this is the number of characters, not the number of tokens; for
    tweets stored as token lists it is the number of tokens.

    Args:
        tweets: iterable of sized objects (strings or sequences).

    Returns:
        The maximum length, or 0 when ``tweets`` is empty.
    """
    return max((len(t) for t in tweets), default=0)

## Cleaning

In [None]:
# (file to clean, drop duplicate lines?). The test file keeps every line.
for raw_path, drop_duplicates in (
    ("Datasets/twitter-datasets/train_neg_full.txt", True),
    ("Datasets/twitter-datasets/train_pos_full.txt", True),
    ("Datasets/twitter-datasets/test_data.txt", False),
):
    clean_kim_yoon(raw_path, drop_duplicates)

## Load embedding

In [None]:
# Number of components expected for every vector in this GloVe file.
EMBEDDING_DIM = 200

words = []
embeddings = []

# Iterate over the file object instead of calling readlines(), so the whole
# embedding file is never held in memory as a list of lines. The context
# manager closes the file when the loop ends or fails.
with open("glove_from_stanford/glove.twitter.27B.200d.txt", "r") as f:
    for i, l in enumerate(f):
        li = l.split()
        w = li[0]
        vec = np.array([float(e) for e in li[1:]])
        if i % 10000 == 0:
            print("done: ", i)
        if vec.shape[0] == EMBEDDING_DIM:
            words.append(w)
            embeddings.append(vec)
        else:
            # Lines that do not split into a word plus EMBEDDING_DIM numbers
            # are reported and left out of both arrays.
            print(w, " was not the right shape. The shape was: ", vec.shape)

# Row k of embedding_stacked is the vector of words[k].
embedding_stacked = np.stack(embeddings, axis=0)
words = np.array(words)

np.save("embedding_stanford.npy", embedding_stacked)
np.save("words_stanford.npy", words)

## Reduce embedding

Before running this code, you should apply build_vocab.sh & cut_vocab.sh to the cleaned txt files that contain the tweets. The next cell expects the resulting vocabulary in `vocab_cut_clean_kim_yoon.txt`.

In [None]:
# One vocabulary word per line. Only the line terminator is removed, so the
# last word stays intact even if the file does not end with a newline.
with open("vocab_cut_clean_kim_yoon.txt", 'r') as f:
    words = np.array([l.rstrip("\n") for l in f])
np.save("words_full_list_clean_kim_yoon.npy", words)

In [None]:
# GloVe words and their vectors, as saved in the "Load embedding" section.
word_list_stanford = np.load("words_stanford.npy")
embedding_stanford = np.load("embedding_stanford.npy")

# Vocabulary extracted from the cleaned tweet files.
words_list_full_dataset = np.load("words_full_list_clean_kim_yoon.npy")

In [None]:
# Boolean mask over the GloVe words: True where the word also occurs in the
# tweet vocabulary.
words_needed = np.isin(
    word_list_stanford,
    words_list_full_dataset,
)

In [None]:
# Positions of the True entries of the mask, i.e. the GloVe rows to keep.
(word_wanted_indices,) = np.nonzero(words_needed)

In [None]:
# Same index array for both selections, so row k of the reduced embedding
# still belongs to word k of the reduced word list.
reduced_words_kim_yoon = word_list_stanford[word_wanted_indices]
reduced_embedding_kim_yoon = embedding_stanford[word_wanted_indices]

In [None]:
# Persist the reduced embedding and its word list.
for _npy_path, _array in (
    ("reduced_embedding_kim_yoon.npy", reduced_embedding_kim_yoon),
    ("reduced_words_kim_yoon.npy", reduced_words_kim_yoon),
):
    np.save(_npy_path, _array)

## Reduce tweets depending on new embedding

In [None]:
# Cleaned tweet files written by clean_kim_yoon(): positive, negative, test.
tweets_pos, tweets_neg, tweets_test = (
    tweets_txt(clean_path)
    for clean_path in (
        "Datasets/twitter-datasets/train_pos_full_clean_kim_yoon.txt",
        "Datasets/twitter-datasets/train_neg_full_clean_kim_yoon.txt",
        "Datasets/twitter-datasets/test_data_clean_kim_yoon.txt",
    )
)

In [None]:
# Keep only the tokens that have a row in the reduced embedding ...
tweets_pos_reduced = remove_not_in_words_list(tweets_pos, reduced_words_kim_yoon)
tweets_neg_reduced = remove_not_in_words_list(tweets_neg, reduced_words_kim_yoon)
tweets_test_reduced = remove_not_in_words_list(tweets_test, reduced_words_kim_yoon)

# ... and store each reduced set next to the original data.
for _npy_path, _reduced in (
    ("Datasets/twitter-datasets/reduced_full_tweets_pos.npy", tweets_pos_reduced),
    ("Datasets/twitter-datasets/reduced_full_tweets_neg.npy", tweets_neg_reduced),
    ("Datasets/twitter-datasets/reduced_tweets_test.npy", tweets_test_reduced),
):
    np.save(_npy_path, _reduced)

## Prepare training and validation set

### Preparation for every method except method 2

In [None]:
tweets_pos = np.load("Datasets/twitter-datasets/reduced_full_tweets_pos.npy")
tweets_neg = np.load("Datasets/twitter-datasets/reduced_full_tweets_neg.npy")

# max_tweet_length() applies len() to each tweet string, so maxlen is an upper
# bound on the number of tokens of any tweet.
max_l1 = max_tweet_length(tweets_pos)
max_l2 = max_tweet_length(tweets_neg)

maxlen = max(max_l1, max_l2)

# Label 1 = positive tweet, label 0 = negative tweet.
tweets = np.concatenate((tweets_pos, tweets_neg))
y = np.concatenate((np.ones((tweets_pos.shape[0])), np.zeros((tweets_neg.shape[0]))))

tweets_train, tweets_test, y_train, y_test = train_test_split(
                                                tweets, y,
                                                test_size=0.25,
                                                random_state=42)

tokenizer = Tokenizer(num_words=reduced_words_kim_yoon.shape[0])
# The model cells load reduced_embedding_kim_yoon unchanged into their
# Embedding layer, and row k of that matrix is the vector of
# reduced_words_kim_yoon[k]. Token ids must therefore be those row numbers.
# The ids that fit_on_texts() would assign follow word frequency in the
# training tweets, which would pair each word with another word's vector.
# Id 0 is the value pad_sequences() pads with, so the word stored in row 0
# gets no id and is dropped like any other unknown token.
tokenizer.word_index = {
    word: row
    for row, word in enumerate(reduced_words_kim_yoon.tolist())
    if row > 0
}
tokenizer.index_word = {row: word for word, row in tokenizer.word_index.items()}

X_train_full = tokenizer.texts_to_sequences(tweets_train)
X_test_full = tokenizer.texts_to_sequences(tweets_test)

X_train_full = pad_sequences(X_train_full, padding='post', maxlen=maxlen)
X_test_full = pad_sequences(X_test_full, padding='post', maxlen=maxlen)

### Preparation for method 2

In [None]:
tweets_pos = np.load("Datasets/twitter-datasets/reduced_full_tweets_pos.npy")
tweets_neg = np.load("Datasets/twitter-datasets/reduced_full_tweets_neg.npy")

# One-hot labels: column 1 marks a positive tweet, column 0 a negative one.
tweets = np.concatenate((tweets_pos, tweets_neg))
positive_labels = [[0, 1] for _ in tweets_pos]
negative_labels = [[1, 0] for _ in tweets_neg]
y_2 = np.concatenate([positive_labels, negative_labels], 0)

tweets_train, tweets_test, y_train, y_test = train_test_split(
                                                tweets, y_2,
                                                test_size=0.25,
                                                random_state=42)

tokenizer = Tokenizer(num_words=reduced_words_kim_yoon.shape[0])
# The model cells load reduced_embedding_kim_yoon unchanged into their
# Embedding layer, and row k of that matrix is the vector of
# reduced_words_kim_yoon[k]. Token ids must therefore be those row numbers.
# The ids that fit_on_texts() would assign follow word frequency in the
# training tweets, which would pair each word with another word's vector.
# Id 0 is the value pad_sequences() pads with, so the word stored in row 0
# gets no id and is dropped like any other unknown token.
tokenizer.word_index = {
    word: row
    for row, word in enumerate(reduced_words_kim_yoon.tolist())
    if row > 0
}
tokenizer.index_word = {row: word for word, row in tokenizer.word_index.items()}

X_train_full = tokenizer.texts_to_sequences(tweets_train)
X_test_full = tokenizer.texts_to_sequences(tweets_test)

# max_tweet_length() applies len() to each tweet string, so maxlen is an upper
# bound on the number of tokens of any tweet.
max_l1 = max_tweet_length(tweets_pos)
max_l2 = max_tweet_length(tweets_neg)

maxlen = max(max_l1, max_l2)

X_train_full = pad_sequences(X_train_full, padding='post', maxlen=maxlen)
X_test_full = pad_sequences(X_test_full, padding='post', maxlen=maxlen)

## Method 1
Adapted from : https://medium.com/saarthi-ai/sentence-classification-using-convolutional-neural-networks-ddad72c7048c

In [None]:
# Single convolution over the embedded tweet, global max pooling, a small
# dense layer with dropout, and a sigmoid output for the binary label.
model = Sequential([
    layers.Embedding(
        reduced_embedding_kim_yoon.shape[0],
        reduced_embedding_kim_yoon.shape[1],
        weights=[reduced_embedding_kim_yoon],
        input_length=maxlen,
    ),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(10, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# The held-out 25% split serves as validation data during training.
history = model.fit(
    X_train_full,
    y_train,
    epochs=10,
    validation_data=(X_test_full, y_test),
)

### Prediction and submission for Model 1

In [None]:
submission_tweets = np.load("Datasets/twitter-datasets/reduced_tweets_test.npy")
X_submission = pad_sequences(
    tokenizer.texts_to_sequences(submission_tweets),
    padding='post',
    maxlen=maxlen,
)

# Sequential.predict_classes() no longer exists in recent TensorFlow/Keras
# releases. For a single sigmoid output it thresholded the probability at 0.5,
# which is reproduced here explicitly.
y = (model.predict(X_submission) > 0.5).astype("int32").ravel()

# make csv: ids start at 1, class 0 is written as -1 and class 1 as 1
with open("submission.csv", "w") as f:
    f.write("Id,Prediction\n")
    for tweet_id, predicted_class in enumerate(y, start=1):
        label = 1 if predicted_class == 1 else -1
        f.write(str(tweet_id) + "," + str(label) + "\n")

## Method 2
Adapted from : https://github.com/bhaveshoswal/CNN-text-classification-keras/blob/master/data_helpers.py

In [None]:
embedding_dim = reduced_embedding_kim_yoon.shape[1]
filter_sizes = [3,4,5]
num_filters = 512
drop = 0.5

inputs = layers.Input(shape=(maxlen,))
embedding = layers.Embedding(
    reduced_embedding_kim_yoon.shape[0],
    embedding_dim,
    weights=[reduced_embedding_kim_yoon],
    input_length=maxlen,
)(inputs)
# Add a trailing channel axis so the embedded tweet can be fed to Conv2D.
reshape = layers.Reshape((maxlen,embedding_dim,1))(embedding)

# One convolution branch per filter size; every kernel spans the full
# embedding width.
conv_0, conv_1, conv_2 = [
    layers.Conv2D(
        num_filters,
        kernel_size=(size, embedding_dim),
        padding='valid',
        kernel_initializer='normal',
        activation='relu',
    )(reshape)
    for size in filter_sizes
]

# The pool height equals the height of the matching convolution output, so
# each branch is reduced to a single position.
maxpool_0, maxpool_1, maxpool_2 = [
    layers.MaxPool2D(
        pool_size=(maxlen - size + 1, 1),
        strides=(1,1),
        padding='valid',
    )(conv)
    for size, conv in zip(filter_sizes, (conv_0, conv_1, conv_2))
]

concatenated_tensor = layers.Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = layers.Flatten()(concatenated_tensor)
dropout = layers.Dropout(drop)(flatten)
# Two softmax units, matching the one-hot labels of the method 2 preparation.
output = layers.Dense(2, activation='softmax')(dropout)

model = Model(inputs=inputs, outputs=output)

# NOTE(review): the metric is logged as 'val_acc' or 'val_accuracy' depending
# on the Keras version - confirm the name used here matches the installed one.
checkpoint = ModelCheckpoint(
    'weights.{epoch:02d}-{val_acc:.4f}.hdf5',
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='auto',
)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
history = model.fit(
    X_train_full,
    y_train,
    epochs=10,
    callbacks=[checkpoint],
    validation_data=(X_test_full, y_test),
)

### Prediction and submission for model 2

In [None]:
submission_tweets = np.load("Datasets/twitter-datasets/reduced_tweets_test.npy")
X_submission = pad_sequences(
    tokenizer.texts_to_sequences(submission_tweets),
    padding='post',
    maxlen=maxlen,
)

# The method 2 model is a functional Model, which has no predict_classes().
# The predicted class is the index of the largest softmax output: index 1 is
# the positive column of the one-hot labels, index 0 the negative one.
y = np.argmax(model.predict(X_submission), axis=1)

# make csv: ids start at 1, class 0 is written as -1 and class 1 as 1
with open("submission.csv", "w") as f:
    f.write("Id,Prediction\n")
    for tweet_id, predicted_class in enumerate(y, start=1):
        label = 1 if predicted_class == 1 else -1
        f.write(str(tweet_id) + "," + str(label) + "\n")

## Method 3
Adapted from : https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

### Model without dropouts
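Best one: this is the variant that gave the best results. Note that every model cell in this section rebinds `model`, so the cell under "Compile and run the model of method 3" trains whichever variant was executed last.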

In [None]:
# Three Conv1D stages (the first two followed by max pooling), then a
# 128-unit dense layer and a sigmoid output. No dropout.
model = Sequential([
    layers.Embedding(
        reduced_embedding_kim_yoon.shape[0],
        reduced_embedding_kim_yoon.shape[1],
        weights=[reduced_embedding_kim_yoon],
        input_length=maxlen,
    ),
    layers.Conv1D(128, 5, activation='relu'),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Model with one dropout layer

In [None]:
dropout = 0.1

# Same stack as the model without dropout, plus one Dropout layer after the
# dense layer.
model = Sequential([
    layers.Embedding(
        reduced_embedding_kim_yoon.shape[0],
        reduced_embedding_kim_yoon.shape[1],
        weights=[reduced_embedding_kim_yoon],
        input_length=maxlen,
    ),
    layers.Conv1D(128, 5, activation='relu'),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(dropout),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Model with two dropout layers

In [None]:
dropout = 0.1

# Dropout after the second convolution and after the dense layer.
model = Sequential([
    layers.Embedding(
        reduced_embedding_kim_yoon.shape[0],
        reduced_embedding_kim_yoon.shape[1],
        weights=[reduced_embedding_kim_yoon],
        input_length=maxlen,
    ),
    layers.Conv1D(128, 5, activation='relu'),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.Dropout(dropout),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(dropout),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Model with 3 dropout layers

In [None]:
dropout = 0.1

# Dropout after the second convolution, after the third convolution and
# after the dense layer.
model = Sequential([
    layers.Embedding(
        reduced_embedding_kim_yoon.shape[0],
        reduced_embedding_kim_yoon.shape[1],
        weights=[reduced_embedding_kim_yoon],
        input_length=maxlen,
    ),
    layers.Conv1D(128, 5, activation='relu'),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.Dropout(dropout),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.Dropout(dropout),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(dropout),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Model with a dense layer of 200

In [None]:
# Same stack as the model without dropout, with a 200-unit dense layer
# instead of 128 units.
model = Sequential([
    layers.Embedding(
        reduced_embedding_kim_yoon.shape[0],
        reduced_embedding_kim_yoon.shape[1],
        weights=[reduced_embedding_kim_yoon],
        input_length=maxlen,
    ),
    layers.Conv1D(128, 5, activation='relu'),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(200, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Model with filters=256 in the first Conv1D layer

In [None]:
# Same stack as the model without dropout, with 256 filters in the first
# convolution instead of 128.
model = Sequential([
    layers.Embedding(
        reduced_embedding_kim_yoon.shape[0],
        reduced_embedding_kim_yoon.shape[1],
        weights=[reduced_embedding_kim_yoon],
        input_length=maxlen,
    ),
    layers.Conv1D(256, 5, activation='relu'),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Model with filters=256 in the first Conv1D layer and 1000 in the first dense layer

In [None]:
# 256 filters in the first convolution and a 1000-unit dense layer.
model = Sequential([
    layers.Embedding(
        reduced_embedding_kim_yoon.shape[0],
        reduced_embedding_kim_yoon.shape[1],
        weights=[reduced_embedding_kim_yoon],
        input_length=maxlen,
    ),
    layers.Conv1D(256, 5, activation='relu'),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(1000, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Model with filters=512 in the first Conv1D layer and 1000 in the first dense layer

In [None]:
# 512 filters in the first convolution and a 1000-unit dense layer.
model = Sequential([
    layers.Embedding(
        reduced_embedding_kim_yoon.shape[0],
        reduced_embedding_kim_yoon.shape[1],
        weights=[reduced_embedding_kim_yoon],
        input_length=maxlen,
    ),
    layers.Conv1D(512, 5, activation='relu'),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(1000, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Compile and run the model of method 3

In [None]:
# Trains whichever `model` is currently bound, i.e. the last variant cell
# that was executed. With save_best_only=True a weights file is written only
# when the monitored validation metric improves.
checkpoint = ModelCheckpoint(
    'weights.{epoch:02d}-{val_acc:.4f}.hdf5',
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='auto',
)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

history = model.fit(
    X_train_full,
    y_train,
    epochs=10,
    callbacks=[checkpoint],
    verbose=2,
    validation_data=(X_test_full, y_test),
)

### Prediction and submission for model 3

In [None]:
submission_tweets = np.load("Datasets/twitter-datasets/reduced_tweets_test.npy")
X_submission = pad_sequences(
    tokenizer.texts_to_sequences(submission_tweets),
    padding='post',
    maxlen=maxlen,
)

# Sequential.predict_classes() no longer exists in recent TensorFlow/Keras
# releases. For a single sigmoid output it thresholded the probability at 0.5,
# which is reproduced here explicitly.
y = (model.predict(X_submission) > 0.5).astype("int32").ravel()

# make csv: ids start at 1, class 0 is written as -1 and class 1 as 1
with open("submission.csv", "w") as f:
    f.write("Id,Prediction\n")
    for tweet_id, predicted_class in enumerate(y, start=1):
        label = 1 if predicted_class == 1 else -1
        f.write(str(tweet_id) + "," + str(label) + "\n")