In [161]:
import numpy as np
import pandas as pd
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from sklearn.model_selection import train_test_split
from keras.regularizers import l2
np.random.seed(1)

In [162]:
import nltk
from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance, reused by the text-cleaning function `fun` further down.
wordnet_lemmatizer = WordNetLemmatizer()

In [163]:
def read_glove_vecs(glove_file):                           #function to read glove vectors
    """Load word vectors from a whitespace-delimited text file.

    Every non-blank line is read as a token followed by its vector components.

    Args:
        glove_file: Path of the vector file to read.

    Returns:
        A tuple ``(words_to_index, index_to_words, word_to_vec_map)``:
        ``words_to_index`` maps each word to a 1-based index assigned in
        sorted word order (0 is never assigned), ``index_to_words`` is the
        inverse mapping, and ``word_to_vec_map`` maps each word to a
        ``np.float64`` array of its components.
    """
    word_to_vec_map = {}
    # Read as UTF-8 explicitly: the platform default encoding can fail on
    # non-ASCII tokens.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # A blank (e.g. trailing) line has no token to index.
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    words_to_index = {}
    index_to_words = {}
    # Start at 1 so that index 0 stays unassigned.
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map

In [164]:
# Keep the CSV contents untouched in raw_data1 and work on a copy.
raw_data1 = pd.read_csv("train.csv")
train = raw_data1.copy()
# include='all' summarises the text columns as well as the numeric ones.
train.describe(include='all')

Unnamed: 0,id,keyword,location,text,target
count,7613.0,7552,5080,7613,7613.0
unique,,221,3341,7503,
top,,fatalities,USA,11-Year-Old Boy Charged With Manslaughter of T...,
freq,,45,104,10,
mean,5441.934848,,,,0.42966
std,3137.11609,,,,0.49506
min,1.0,,,,0.0
25%,2734.0,,,,0.0
50%,5408.0,,,,0.0
75%,8146.0,,,,1.0


In [165]:
# Look at the row at position 7612 (the last one, given the 7613 rows counted above).
train.iloc[7612]

id                                                      10873
keyword                                                   NaN
location                                                  NaN
text        The Latest: More Homes Razed by Northern Calif...
target                                                      1
Name: 7612, dtype: object

In [166]:
# Same pattern as for the training file: raw_data2 stays untouched, test is the working copy.
raw_data2 = pd.read_csv('test.csv')
test = raw_data2.copy()
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [167]:
# Stack train and test so the text preprocessing below is applied to both in one pass.
# NOTE(review): no ignore_index is passed, so row labels presumably repeat across the
# two parts; the later split back into train/test is done by position (iloc).
data = pd.concat([train,test])

In [168]:
# Lower-case every text entry before tokenising.
data['text'] = data['text'].str.lower()

In [169]:
# Split on whitespace: each 'text' entry becomes a list of tokens.
data['text'] = data['text'].str.split()
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,"[our, deeds, are, the, reason, of, this, #eart...",1.0
1,4,,,"[forest, fire, near, la, ronge, sask., canada]",1.0
2,5,,,"[all, residents, asked, to, 'shelter, in, plac...",1.0
3,6,,,"[13,000, people, receive, #wildfires, evacuati...",1.0
4,7,,,"[just, got, sent, this, photo, from, ruby, #al...",1.0


In [171]:
def fun(sentence_words):                               #removing punctuations
    """Strip punctuation characters from each token, then lemmatize it as a verb.

    Args:
        sentence_words: Iterable of string tokens.

    Returns:
        A list with one entry per input token. A token made up only of
        punctuation is passed to the lemmatizer as an empty string.
    """
    # The previous literal ended in `''"""`, which Python reads as a string
    # followed by an empty string, so the double quote was never stripped.
    # It is escaped explicitly here. A set makes the membership test O(1).
    punctuations = set("?:!.,;#[({]})/@$%^&*'\"")
    s = []
    for word in sentence_words:
        y = "".join(ch for ch in word if ch not in punctuations)
        s.append(wordnet_lemmatizer.lemmatize(y, pos='v'))              #lemmatize the cleaned token
    return s
data['text'] = data['text'].apply(fun)

In [172]:
# Only the tokenised text is fed to the model, so location and keyword are dropped.
data = data.drop(['location', 'keyword'], axis=1)

In [173]:
# Split the combined frame back into its two sources. The training rows were
# stacked first by pd.concat([train, test]), so they occupy the first
# len(raw_data1) positions; deriving the boundary from the loaded file avoids
# a hard-coded row count that breaks as soon as train.csv changes.
n_train = len(raw_data1)
train = data.iloc[:n_train]
test = data.iloc[n_train:]
# The test rows carry no labels, so their target column is dropped.
test = test.drop(['target'], axis=1)
data_X = train.drop(['target','id'],axis=1)
data_Y = train['target']

In [174]:
# Hold out 10% of the labelled rows for evaluation.
# NOTE(review): no random_state is passed, so the split presumably follows NumPy's
# global RNG (seeded in the import cell) — confirm, or pass random_state explicitly.
X_train, X_test, y_train, y_test = train_test_split(data_X, data_Y, test_size=0.1)

In [175]:
# Reduce the splits to NumPy arrays: the features keep only the 'text' column
# (token lists), the labels are converted as they are.
X_train, X_test = X_train['text'].to_numpy(), X_test['text'].to_numpy()
y_train, y_test = y_train.to_numpy(), y_test.to_numpy()

In [176]:
# Spot-check one preprocessed training sample.
X_train[8]

['rt',
 'roadid',
 'thank',
 'to',
 'alex',
 'for',
 'his',
 'story',
 'amp',
 'to',
 'all',
 'first',
 'responders',
 'for',
 'be',
 'there',
 'when',
 'we',
 'need',
 'you',
 '\x89û_',
 'httptcohikdc1fm2f']

In [178]:
# Build the vocabulary index maps and the word -> vector map from the GloVe file.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('glove.6B.50d.txt')

In [179]:
# Token count of the longest entry across train and test; used below as the
# fixed sequence length of the model input and of the index matrices.
maxlen = len(max(data['text'], key=len))

In [206]:
def sentences_to_indices(X, word_to_index, max_len):
    """Turn tokenised sentences into a zero-padded matrix of word indices.

    Args:
        X: Sequence of sentences, each a list of string tokens. It is walked
            positionally, so a NumPy object array, a list, or a pandas Series
            with any index labels all work.
        word_to_index: dict mapping a lower-case word to its integer index.
        max_len: Number of columns of the result. Longer sentences are
            truncated to their first ``max_len`` tokens.

    Returns:
        ``np.ndarray`` of shape ``(len(X), max_len)`` created with
        ``np.zeros`` (float dtype). Position ``[i, j]`` holds the index of the
        j-th token of sentence i; it stays 0 for tokens missing from
        ``word_to_index`` and for padding past the end of the sentence.
    """
    m = len(X)
    X_indices = np.zeros((m, max_len))

    for i, sentence in enumerate(X):
        # Truncate first so a sentence longer than max_len cannot index past
        # the last column.
        sentence_words = [w.lower() for w in sentence[:max_len]]
        for j, w in enumerate(sentence_words):
            if w in word_to_index:
                X_indices[i, j] = word_to_index[w]

    return X_indices

In [207]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):      #Create keras embedding layer
    """Build a non-trainable Keras ``Embedding`` layer loaded with the given vectors.

    Args:
        word_to_vec_map: dict mapping each word to its 1-D vector.
        word_to_index: dict mapping each word to its row in the embedding
            matrix. Every word in it must also be a key of ``word_to_vec_map``.

    Returns:
        An ``Embedding`` layer with ``len(word_to_index) + 1`` rows whose
        weights are set to the pretrained vectors; rows that no word maps to
        stay all-zero.

    Raises:
        ValueError: If ``word_to_vec_map`` is empty, since the embedding
            dimension cannot be inferred.
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty; cannot infer the embedding dimension")

    vocab_len = len(word_to_index) + 1                  # one extra row: the indices built by read_glove_vecs start at 1
    # Take the dimensionality from any vector instead of relying on one
    # specific word being present in the vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    emb_matrix = np.zeros((vocab_len, emb_dim))
    for word, idx in word_to_index.items():
        emb_matrix[idx, :] = word_to_vec_map[word]

    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
    # The layer is built before set_weights is called on it.
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [226]:
def twitter(input_shape, word_to_vec_map, word_to_index):       #Function creating twitter model's graph
    """Assemble the classifier graph: pretrained embeddings, two LSTM layers, sigmoid output.

    Args:
        input_shape: Shape of one input sample, passed to ``Input``.
        word_to_vec_map: dict word -> vector, forwarded to ``pretrained_embedding_layer``.
        word_to_index: dict word -> index, forwarded to ``pretrained_embedding_layer``.

    Returns:
        An uncompiled ``Model`` mapping int32 index sequences to a single
        sigmoid activation per sample.
    """
    # Graph input: integer word indices.
    sentence_indices = Input(input_shape, dtype='int32')

    # Look the indices up in the pretrained embedding layer.
    glove_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    hidden = glove_layer(sentence_indices)

    # Two stacked 128-unit LSTMs, each followed by 50% dropout. The first one
    # returns the full sequence for the second to consume; the second returns
    # only its final state. The L2 strengths differ between the two layers.
    for keep_sequence, l2_strength in ((True, 0.00005), (False, 0.0005)):
        hidden = LSTM(units=128, return_sequences=keep_sequence,
                      kernel_regularizer=l2(l2_strength))(hidden)
        hidden = Dropout(rate=0.5)(hidden)

    # Output layer: one unit followed by a sigmoid.
    logits = Dense(1)(hidden)
    probability = Activation(activation='sigmoid')(logits)

    return Model(inputs = sentence_indices, outputs = probability)

In [227]:
# Build the model for sequences of maxlen indices and print its layer summary.
model = twitter((maxlen,), word_to_vec_map, word_to_index)
model.summary()

Model: "model_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_20 (InputLayer)        [(None, 31)]              0         
_________________________________________________________________
embedding_19 (Embedding)     (None, 31, 50)            20000050  
_________________________________________________________________
lstm_36 (LSTM)               (None, 31, 128)           91648     
_________________________________________________________________
dropout_36 (Dropout)         (None, 31, 128)           0         
_________________________________________________________________
lstm_37 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dropout_37 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 129

In [228]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked as the metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [229]:
# Convert the training token lists into a zero-padded index matrix of width maxlen.
X_train_indices = sentences_to_indices(X_train, word_to_index, maxlen)

In [230]:
# Train for 30 epochs in batches of 256 on the training split only (no validation data is passed).
model.fit(X_train_indices, y_train, epochs = 30, batch_size = 256, shuffle=True)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x144ad0c50>

In [225]:
# Evaluate on the 10% hold-out split (X_test / y_test), not on the unlabelled test.csv rows.
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = maxlen)
loss, acc = model.evaluate(X_test_indices, y_test)
print()
print("Test accuracy = ", acc)


Test accuracy =  0.7979002594947815


In [145]:
# Index matrix for the unlabelled test.csv rows.
# NOTE: this rebinds X_test_indices, which previously held the hold-out split.
X_test_indices = sentences_to_indices(test['text'], word_to_index, max_len = maxlen)

In [146]:
# Sigmoid outputs of the model for every test row.
pred = model.predict(X_test_indices)

In [147]:
# Round each output to the nearest integer and store it as an int label.
pred = np.rint(pred).astype(int)

In [148]:
# Attach the predicted labels to the test frame as its 'target' column.
test['target'] = pred

In [149]:
# Drop the token lists; what remains of the test frame is 'id' and 'target'.
test1 = test.drop(['text'], axis=1)

In [154]:
# Number of test rows predicted as class 1.
np.sum(test1['target'])

1081

In [155]:
# Write the submission from test1 (id + target). `test` still carries the
# tokenised 'text' column, which does not belong in the submission file.
test1.to_csv('submission3.csv', index=False)