In [1]:
import numpy as np
import pandas as pd
import string
import re
import json
import spacy
sp = spacy.load('en_core_web_sm')
import tensorflow as tf
import keras.layers as layers
from sklearn.model_selection import train_test_split
from keras.models import Model
from gensim.models import Word2Vec
from gensim.models import FastText
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input,Embedding,Dense,Flatten
from sklearn.metrics import accuracy_score,classification_report

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the SemEval Task A tweets from a CSV path relative to the notebook.
# NOTE(review): error_bad_lines=False makes pandas skip malformed rows instead of
# raising, so the frame may hold fewer rows than the file. The flag is deprecated in
# newer pandas releases (on_bad_lines='skip' replaces it) - confirm against the
# pandas version this notebook is pinned to.
data = pd.read_csv('./Data/semeval_taskA_corrected.csv', error_bad_lines=False)

In [3]:
data.head()

Unnamed: 0,Tweet index,Label,Tweet text
0,1,1,Sweet United Nations video. Just in time for C...
1,2,1,@mrdahl87 We are rumored to have talked to Erv...
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...
3,4,0,3 episodes left I'm dying over here
4,5,1,I can't breathe! was chosen as the most notabl...


In [4]:
def remove_puncts(data):
    """Delete every character that is neither a word character nor whitespace.

    Args:
        data: Text to clean.

    Returns:
        ``data`` without the matching characters. Underscores are word
        characters for this pattern and are therefore kept.
    """
    return re.sub(r'[^\w\s]', '', data)

In [5]:
def remove_nums(data):
    """Delete every ASCII digit (0-9) from ``data``.

    Args:
        data: Text to clean.

    Returns:
        ``data`` with the digits removed; surrounding whitespace is left
        exactly as it was.
    """
    return re.sub(r'[0-9]', '', data)

In [6]:
def clean_data(data):
    """Return a copy of ``data`` whose 'Tweet text' column has been normalised.

    The text is lower-cased, then passed through ``remove_nums`` and
    ``remove_puncts`` in that order.

    The input frame is left untouched: the work is done on a copy, so
    re-running the calling cell always starts from the same raw data.
    Missing tweets are passed through unchanged instead of raising inside
    the regex helpers.

    Args:
        data: DataFrame with a 'Tweet text' column of strings.

    Returns:
        A new DataFrame with the cleaned 'Tweet text' column; all other
        columns are copied as-is.
    """
    cleaned = data.copy()
    cleaned['Tweet text'] = (
        cleaned['Tweet text']
        .str.lower()
        # Series.map works per value; na_action='ignore' skips missing entries.
        .map(remove_nums, na_action='ignore')
        .map(remove_puncts, na_action='ignore')
    )
    return cleaned

In [7]:
data = clean_data(data)

In [8]:
data.head()

Unnamed: 0,Tweet index,Label,Tweet text
0,1,1,sweet united nations video just in time for ch...
1,2,1,mrdahl we are rumored to have talked to ervs a...
2,3,1,hey there nice to see you minnesotand winter w...
3,4,0,episodes left im dying over here
4,5,1,i cant breathe was chosen as the most notable ...


In [9]:
def remove_stop_words(data):
    """Drop the tokens of ``data`` that appear in the spaCy stop-word set.

    Splitting is done on a single space character, so runs of spaces produce
    empty tokens; those are kept unless the empty string is itself in the
    stop-word set.

    Args:
        data: Space-separated text.

    Returns:
        The surviving tokens joined back together with single spaces.
    """
    stop_set = sp.Defaults.stop_words
    return " ".join(token for token in data.split(" ") if token not in stop_set)

In [10]:
def lemmatize(data):
    """Replace each token of ``data`` with its lemma from the ``sp`` pipeline.

    Args:
        data: Text to run through the pipeline.

    Returns:
        The lemmas concatenated into one string. Every lemma is preceded by a
        single space, so a non-empty result starts with a space.
    """
    return "".join(" " + str(token.lemma_) for token in sp(data))

In [11]:
# Drop stop words first, then lemmatize what is left.
# Series.map applies each helper once per tweet; DataFrame.apply(axis=1) would
# build a full row Series for every record just to read one column.
data['Tweet text'] = data['Tweet text'].map(remove_stop_words)
data['Tweet text'] = data['Tweet text'].map(lemmatize)

In [12]:
def remove_PRON(data):
    """Strip every literal ``-PRON-`` marker from ``data``.

    Args:
        data: Text that may contain ``-PRON-`` markers.

    Returns:
        ``data`` with each marker removed; the whitespace around a marker is
        left in place.
    """
    return re.sub(r'-PRON-', '', data)

In [13]:
# Remove the -PRON- markers from every tweet; Series.map avoids the per-row
# Series construction of DataFrame.apply(axis=1).
data['Tweet text'] = data['Tweet text'].map(remove_PRON)

In [17]:
data.head()

Unnamed: 0,Tweet index,Label,Tweet text
0,1,1,sweet united nations video time christmas ima...
1,2,1,mrdahl rumor talk ervs agent angel ask ed esc...
2,3,1,hey nice minnesotand winter weather
3,4,0,episode leave be dying
4,5,1,can not breathe choose notable quote year ann...


In [18]:
data.shape

(3834, 3)

In [19]:
# Hold out 20% of the tweets for validation. random_state pins the shuffle so a
# Restart & Run All reproduces the same split, and with it the same vocabularies
# and metrics downstream.
X_train, X_test, y_train, y_test = train_test_split(
    data['Tweet text'], data['Label'], test_size=0.2, random_state=42
)

In [20]:
X_test.shape

(767,)

In [21]:
X_train.shape

(3067,)

In [22]:
X_train[0:2]

1004     lloydgallagher  be mate yea good pretty shud ...
1028     photoset babaybubblez aquabreeze disgustingho...
Name: Tweet text, dtype: object

In [23]:
def prepare_data_for_word_vectors(X_train):
    """Tokenise sentences and encode them with a first-seen integer vocabulary.

    Args:
        X_train: Iterable of strings; each is split on whitespace.

    Returns:
        A 3-tuple ``(tokenised, encoded, vocab)``:
          * ``tokenised`` - list of token lists, one per input sentence;
          * ``encoded``   - the same sentences with each token replaced by its id;
          * ``vocab``     - dict mapping token -> id. Ids start at 1 and are
            handed out in order of first appearance.
    """
    tokenised = [sentence.split() for sentence in X_train]

    vocab = {}
    for tokens in tokenised:
        for token in tokens:
            if token not in vocab:
                # Next free id: ids are dense and 1-based.
                vocab[token] = len(vocab) + 1

    encoded = [[vocab[token] for token in tokens] for tokens in tokenised]
    return tokenised, encoded, vocab

In [24]:
# Tokenise each split and build a word -> index vocabulary for it.
# NOTE(review): the test split gets its own, independent vocabulary here, so the
# integer ids in `sentences_test` do not line up with `word_ix` - confirm that
# nothing downstream mixes the two.
sentences_as_words,sentences,word_ix = prepare_data_for_word_vectors(X_train)
sentences_as_words_test, sentences_test, word_ix_test = prepare_data_for_word_vectors(X_test)

In [25]:
print(len(sentences_as_words), len(sentences_as_words_test))

3067 767


In [27]:
def building_word_vector_model(sentences):
    """Fit a Word2Vec and a FastText model on the same tokenised sentences.

    Both models use 100-dimensional vectors, a context window of 5 and 4
    worker threads; every other hyperparameter is left at the library default.
    Progress messages are printed before and after the Word2Vec fit and before
    the FastText fit.

    NOTE(review): ``size`` looks like the pre-4.0 gensim keyword (later
    releases call it ``vector_size``) - confirm against the pinned version.

    Args:
        sentences: Tokenised sentences (list of token lists) to train on.

    Returns:
        Tuple ``(word2vec_model, fasttext_model)``.
    """
    shared_params = dict(size=100, workers=4, window=5)

    print("Training a word2vec model")
    w2v = Word2Vec(sentences=sentences, **shared_params)
    print("Training complete")

    print("Training a Gensim FastText model")
    fasttext = FastText(sentences=sentences, **shared_params)

    return w2v, fasttext

In [28]:
# Fit one Word2Vec + FastText pair on the training tokens and a second,
# independent pair on the test tokens.
# NOTE(review): separately trained embedding models are not aligned with each
# other, so vectors from test_w2v / test_fast are presumably not comparable with
# those from trainw2v / trainfast - consider embedding the test split with the
# train models instead.
trainw2v, trainfast = building_word_vector_model(sentences_as_words)
test_w2v,test_fast = building_word_vector_model(sentences_as_words_test)

Training a word2vec model
Training complete
Training a Gensim FastText model
Training a word2vec model
Training complete
Training a Gensim FastText model


In [29]:
# model_wv, model_fasttext = building_word_vector_model(sentences, y_train)

In [30]:
print(trainw2v)

Word2Vec(vocab=773, size=100, alpha=0.025)


In [31]:
print(trainfast)

FastText(vocab=773, size=100, alpha=0.025)


In [32]:
print(test_w2v, test_fast)

Word2Vec(vocab=164, size=100, alpha=0.025) FastText(vocab=164, size=100, alpha=0.025)


In [36]:
def create_vectors_train(sentence_as_words, model=None, max_len=65):
    """Turn tokenised sentences into a padded array of Word2Vec vectors.

    Fixes over the previous version:
      * the ``sentence_as_words`` argument is actually used (the old loop read
        the global ``sentences_as_words`` and ignored its parameter);
      * sentences longer than ``max_len`` are truncated, so the result is
        always a regular 3-D array;
      * only ``KeyError`` (unknown word) triggers the fallback, instead of a
        bare ``except`` hiding every failure.

    Args:
        sentence_as_words: Sequence of token lists.
        model: Embedding model exposing ``.wv.get_vector`` and
            ``.wv.vector_size``. Defaults to the global ``trainw2v``.
        max_len: Number of token slots per sentence; shorter sentences are
            zero-padded at the end, longer ones are cut.

    Returns:
        float64 array of shape ``(len(sentence_as_words), max_len, vector_size)``.

    Raises:
        KeyError: if a word is unknown and the fallback word 'be' is unknown too.
    """
    if model is None:
        model = trainw2v
    keyed = model.wv

    # Start from zeros so the padding slots need no further work.
    batch = np.zeros((len(sentence_as_words), max_len, keyed.vector_size))
    for row, tokens in enumerate(sentence_as_words):
        for col, token in enumerate(tokens[:max_len]):
            try:
                batch[row, col] = keyed.get_vector(token)
            except KeyError:
                # Out-of-vocabulary word: reuse the vector of 'be', as before.
                batch[row, col] = keyed.get_vector('be')
    return batch

In [51]:
def create_vectors_test_fasttext(st, model=None, max_len=65):
    """Turn tokenised test sentences into a padded array of FastText vectors.

    By default the sentences are embedded with ``trainfast``, the model fitted
    on the training split. A FastText model trained separately on the test
    split produces vectors in an unrelated space, so a classifier trained on
    ``trainfast`` features cannot be evaluated meaningfully on them.

    Other fixes: sentences longer than ``max_len`` are truncated so the result
    is always a regular 3-D array, and only ``KeyError`` triggers the fallback
    instead of a bare ``except``.

    Args:
        st: Sequence of token lists.
        model: Embedding model exposing ``.wv.get_vector`` and
            ``.wv.vector_size``. Defaults to the global ``trainfast``.
        max_len: Number of token slots per sentence; shorter sentences are
            zero-padded at the end, longer ones are cut.

    Returns:
        float64 array of shape ``(len(st), max_len, vector_size)``.

    Raises:
        KeyError: if a word cannot be embedded and neither can the fallback 'be'.
    """
    if model is None:
        model = trainfast
    keyed = model.wv

    # Start from zeros so the padding slots need no further work.
    batch = np.zeros((len(st), max_len, keyed.vector_size))
    for row, tokens in enumerate(st):
        for col, token in enumerate(tokens[:max_len]):
            try:
                batch[row, col] = keyed.get_vector(token)
            except KeyError:
                # Word could not be embedded: reuse the vector of 'be', as before.
                batch[row, col] = keyed.get_vector('be')
    return batch

In [52]:
def create_vectors_train_fasttext(sentence_as_words, model=None, max_len=65):
    """Turn tokenised sentences into a padded array of FastText vectors.

    Fixes over the previous version:
      * the ``sentence_as_words`` argument is actually used (the old loop read
        the global ``sentences_as_words`` and ignored its parameter);
      * sentences longer than ``max_len`` are truncated, so the result is
        always a regular 3-D array;
      * only ``KeyError`` triggers the fallback, instead of a bare ``except``
        hiding every failure.

    Args:
        sentence_as_words: Sequence of token lists.
        model: Embedding model exposing ``.wv.get_vector`` and
            ``.wv.vector_size``. Defaults to the global ``trainfast``.
        max_len: Number of token slots per sentence; shorter sentences are
            zero-padded at the end, longer ones are cut.

    Returns:
        float64 array of shape ``(len(sentence_as_words), max_len, vector_size)``.

    Raises:
        KeyError: if a word cannot be embedded and neither can the fallback 'be'.
    """
    if model is None:
        model = trainfast
    keyed = model.wv

    # Start from zeros so the padding slots need no further work.
    batch = np.zeros((len(sentence_as_words), max_len, keyed.vector_size))
    for row, tokens in enumerate(sentence_as_words):
        for col, token in enumerate(tokens[:max_len]):
            try:
                batch[row, col] = keyed.get_vector(token)
            except KeyError:
                # Word could not be embedded: reuse the vector of 'be', as before.
                batch[row, col] = keyed.get_vector('be')
    return batch

In [None]:
def create_vectors_test(st, model=None, max_len=65):
    """Turn tokenised test sentences into a padded array of Word2Vec vectors.

    By default the sentences are embedded with ``trainw2v``, the model fitted
    on the training split. A Word2Vec model trained separately on the test
    split produces vectors in an unrelated space, so a classifier trained on
    ``trainw2v`` features cannot be evaluated meaningfully on them.

    Other fixes: sentences longer than ``max_len`` are truncated so the result
    is always a regular 3-D array, and only ``KeyError`` (unknown word)
    triggers the fallback instead of a bare ``except``.

    Args:
        st: Sequence of token lists.
        model: Embedding model exposing ``.wv.get_vector`` and
            ``.wv.vector_size``. Defaults to the global ``trainw2v``.
        max_len: Number of token slots per sentence; shorter sentences are
            zero-padded at the end, longer ones are cut.

    Returns:
        float64 array of shape ``(len(st), max_len, vector_size)``.

    Raises:
        KeyError: if a word is unknown and the fallback word 'be' is unknown too.
    """
    if model is None:
        model = trainw2v
    keyed = model.wv

    # Start from zeros so the padding slots need no further work.
    batch = np.zeros((len(st), max_len, keyed.vector_size))
    for row, tokens in enumerate(st):
        for col, token in enumerate(tokens[:max_len]):
            try:
                batch[row, col] = keyed.get_vector(token)
            except KeyError:
                # Out-of-vocabulary word: reuse the vector of 'be', as before.
                batch[row, col] = keyed.get_vector('be')
    return batch

In [41]:
# Word2Vec features for both splits.
# NOTE(review): this rebinds X_train / X_test from the text Series returned by
# train_test_split to vector arrays, so re-running an earlier cell that expects the
# text would need the split cell re-run first.
X_train = create_vectors_train(sentences_as_words)
X_test = create_vectors_test(sentences_as_words_test)

In [53]:
# FastText features for both splits.
# NOTE(review): x_train / x_test differ from the Word2Vec arrays X_train / X_test
# only by letter case - easy to mix up when wiring a model.
x_train = create_vectors_train_fasttext(sentences_as_words)
x_test = create_vectors_test_fasttext(sentences_as_words_test)

In [54]:
x_train.shape

(3067, 65, 100)

In [42]:
X_train.shape

(3067, 65, 100)

In [43]:
X_test.shape

(767, 65, 100)

In [44]:
def classification_model(X_train):
    """Build an uncompiled binary classifier over padded word-vector sequences.

    Architecture: a Dense(128, relu) layer applied to the input sequence,
    flattened, then a single sigmoid output unit.

    The input shape is taken from ``X_train`` (everything after the sample
    axis) rather than being hard-coded, so the argument is no longer ignored.
    If ``X_train`` does not expose a 3-D ``shape``, the previous fixed shape
    ``(65, 100)`` is used.

    Args:
        X_train: Feature array shaped ``(n_samples, seq_len, embedding_dim)``.

    Returns:
        A ``keras.models.Model`` mapping one padded sequence to one
        probability; the caller is expected to compile it.
    """
    sample_shape = tuple(getattr(X_train, 'shape', ())[1:])
    if len(sample_shape) != 2:
        # No usable shape information: keep the historical input shape.
        sample_shape = (65, 100)

    input_seq = Input(shape=sample_shape)
    hidden = Dense(128, activation="relu")(input_seq)
    flattened = Flatten()(hidden)
    preds = Dense(1, activation="sigmoid")(flattened)

    return Model(input_seq, preds)

In [45]:
# Build the classifier for the Word2Vec features and compile it with binary
# cross-entropy loss, the Adam optimiser and accuracy as the reported metric.
model = classification_model(X_train)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [46]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 65, 100)           0         
_________________________________________________________________
dense_1 (Dense)              (None, 65, 128)           12928     
_________________________________________________________________
flatten_1 (Flatten)          (None, 8320)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 8321      
Total params: 21,249
Trainable params: 21,249
Non-trainable params: 0
_________________________________________________________________


In [48]:
# Train on the Word2Vec features for 50 epochs in batches of 32.
# NOTE(review): the held-out split is passed as validation_data, so it is seen
# during every epoch - there is no untouched test set for a final score.
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test)) # word2vec

Train on 3067 samples, validate on 767 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1899daf60>

In [56]:
# Fresh, untrained network for the FastText features. It is bound to its own
# name only, so the Word2Vec-trained `model` from the earlier cell is not
# overwritten and both classifiers stay available for comparison.
model_fast = classification_model(x_train)
model_fast.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [58]:
model_fast.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 65, 100)           0         
_________________________________________________________________
dense_5 (Dense)              (None, 65, 128)           12928     
_________________________________________________________________
flatten_3 (Flatten)          (None, 8320)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 8321      
Total params: 21,249
Trainable params: 21,249
Non-trainable params: 0
_________________________________________________________________


In [59]:
# Train on the FastText features with the same schedule as the Word2Vec run
# (50 epochs, batch size 32), validating on the held-out split.
model_fast.fit(x_train, y_train, epochs=50, batch_size=32, validation_data=(x_test, y_test)) # FastText

Train on 3067 samples, validate on 767 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x18d3a09e8>