# Kaggle - Disaster Twitter - NLP Classification

```
Date:  2021-07-26
Source: https://www.kaggle.com/fanglidayan/4-nlp-disaster-tweets/output
Model: 
- tensorflow.keras 

Features:
- text
- hashtag
- keyword
- word embedding

Libraries:
- tensorflow.keras
- nltk
- spaCy
- tweet-preprocessor
- re
- numpy
- pandas

Takeaways:
- new feature engineering based on the hashtag
- word embedding using spaCy en_core_web_lg pre-trained package
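- the word vectors are stored in a numpy array of shape (7613, 38, 300): 23 text + 13 hashtag + 2 keyword token slots, each a 300-d vector
- the model stacks three GRU layers (each followed by dropout and batch normalization) ahead of a softmax output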
``` 


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# List every file under the current working directory (the walk starts at '.'),
# so the CSV inputs and any cached .npy / submission artifacts are visible up front.

import os
for dirname, _, filenames in os.walk('.'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

.\Disaster_Twitter_Yan.ipynb
.\preprocess_test.csv
.\preprocess_train.csv
.\sample_submission.csv
.\store_test.npy
.\store_train.npy
.\submission_nlp_tweets.csv
.\test.csv
.\train.csv
.\.ipynb_checkpoints\Disaster_Twitter_Yan-checkpoint.ipynb


In [3]:
# Load the competition data; the 'id' column becomes the DataFrame index.
train=pd.read_csv('train.csv',index_col = 'id')
test=pd.read_csv('test.csv',index_col = 'id')

In [329]:
def isNaN(string):
    """Return True when `string` is not equal to itself.

    NaN is the usual value that fails a self-equality check, so this flags
    missing entries (e.g. empty `keyword` cells loaded by pandas) while
    returning False for ordinary strings.
    """
    return string != string
isNaN('ablaze')

False

### lower case

In [330]:
# Normalise the tweet text to lower case in both splits.
train['text'] = train['text'].map(str.lower)
test['text'] = test['text'].map(str.lower)

In [331]:
train['keyword'].apply(lambda x: isNaN(x))

id
1        True
4        True
5        True
6        True
7        True
         ... 
10869    True
10870    True
10871    True
10872    True
10873    True
Name: keyword, Length: 7613, dtype: bool

In [332]:
def lower_keywords(keywords):
    """Lower-case a keyword, passing NaN (missing) values through untouched.

    A value that is not equal to itself is NaN, which has no `.lower()`
    method, so it is returned as-is.
    """
    if keywords != keywords:
        return keywords
    return keywords.lower()

In [333]:
# Apply the NaN-safe lower-casing helper to the keyword column of both splits.
train['keyword'] = train['keyword'].apply(lower_keywords)
test['keyword'] = test['keyword'].apply(lower_keywords)

In [334]:
train.loc[48:50]

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
48,ablaze,Birmingham,@bbcmtd wholesale markets ablaze http://t.co/l...,1
49,ablaze,Est. September 2012 - Bristol,we always try to bring the heavy. #metal #rt h...,0
50,ablaze,AFRICA,#africanbaze: breaking news:nigeria flag set a...,1


### extract hashtag and create new feature column

In [335]:
import re
# Pull the word following each '#' into a new list-valued 'hashtag' column.
# The text was lower-cased earlier, so the captured tags are lower case too.
hashtag_pattern = re.compile(r'#(\w+)')
train['hashtag'] = train['text'].apply(hashtag_pattern.findall)
test['hashtag'] = test['text'].apply(hashtag_pattern.findall)
# train['keyword'] = train['keyword'].apply(lambda x: re.findall(r'#(\w+)', x) if not isNaN(x) else [])
# test['keyword'] = test['keyword'].apply(lambda x: re.findall(r'#(\w+)', x) if not isNaN(x) else [])


In [336]:
train.loc[48:50]

Unnamed: 0_level_0,keyword,location,text,target,hashtag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
48,ablaze,Birmingham,@bbcmtd wholesale markets ablaze http://t.co/l...,1,[]
49,ablaze,Est. September 2012 - Bristol,we always try to bring the heavy. #metal #rt h...,0,"[metal, rt]"
50,ablaze,AFRICA,#africanbaze: breaking news:nigeria flag set a...,1,[africanbaze]


###  remove digits (01234), urls (http://...), mentions (@...) and hashtags (#...)

In [220]:
!pip install tweet-preprocessor
import preprocessor



In [221]:
help(preprocessor)

Help on package preprocessor:

NAME
    preprocessor

PACKAGE CONTENTS
    api
    defines
    enum
    parse
    preprocess
    utils

DATA
    __all__ = [<function clean>, <function tokenize>, <function parse>, <f...

FILE
    c:\users\sherry.gow\anaconda3\lib\site-packages\preprocessor\__init__.py




In [337]:
# Run tweet-preprocessor's clean() over every tweet (per the section heading:
# digits, URLs, mentions and hashtags are removed).
train['text'] = train['text'].apply(preprocessor.clean)
test['text'] = test['text'].apply(preprocessor.clean)

In [338]:
train.loc[0:2]['keyword'].apply(lambda x: preprocessor.clean(x) if not isNaN(x) else [])

id
1    []
Name: keyword, dtype: object

In [339]:
def clear_list(lista):
    """Clean every string in `lista` with `preprocessor.clean`.

    Args:
        lista: iterable of strings (e.g. the hashtags extracted from one tweet).

    Returns:
        A new list holding the cleaned strings in their original order; the
        input is not modified. If cleaning fails (for instance `lista` is not
        iterable or holds a non-string), the offending input is printed for
        inspection and an empty list is returned, so downstream list
        processing never receives None.
    """
    try:
        return [preprocessor.clean(ele) for ele in lista]
    except Exception:
        # Best-effort: report the input that failed but keep the pipeline
        # alive. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        print(lista)
        return []

In [340]:
# Clean each tweet's hashtag list; a missing (None) entry becomes an empty list.
train['hashtag'] = train['hashtag'].apply(lambda tags: [] if tags is None else clear_list(tags))
test['hashtag'] = test['hashtag'].apply(lambda tags: [] if tags is None else clear_list(tags))


In [341]:
train.loc[196:200]

Unnamed: 0_level_0,keyword,location,text,target,hashtag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
196,airplane%20accident,"19.600858, -99.047821",experts in france begin examining airplane deb...,1,[news]
197,airplane%20accident,Pennsylvania,strict liability in the context of an airplane...,1,[]
198,airplane%20accident,"Salt Lake City, Utah",your lifetime odds of dying from an airplane a...,0,[]
199,airplane%20accident,"Palo Alto, CA",experts in france begin examining airplane deb...,1,[]


In [131]:
clear_list(['here is a #hashtag', 'https://google.com is the web ui', 'this is special %435 how'])

['here is a', 'is the web ui', 'this is special %435 how']

### expand contractions (change they'll to they will, etc.)

In [342]:
def decontracted(phrase):
    """Expand common English contractions in `phrase` and return the result.

    The rules are applied in order. The two irregular forms come first so the
    generic "n't" rule does not turn them into "wo not" / "ca not". Note that
    "'s" is always expanded to " is", possessives included.
    """
    substitutions = (
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in substitutions:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

# Expand contractions in the tweet text of both splits.
train['text'] = train['text'].apply(decontracted)
test['text'] = test['text'].apply(decontracted)

### remove punctuation

In [343]:
def remove_punc(lista):
    """Strip punctuation from every string of `lista`, in place.

    Characters that are neither word characters nor whitespace are deleted;
    underscores (which count as word characters) are then turned into spaces.
    The same list object is returned.
    """
    for pos, raw in enumerate(lista):
        without_punct = re.sub(r'[^\w\s]', '', raw)
        # '_' survives the pattern above because \w matches it
        lista[pos] = re.sub('_', ' ', without_punct)
    return lista


In [344]:
# Strip punctuation from the tweet text of both splits. The first pattern
# keeps underscores (they are word characters), so they are replaced with a
# space in a second pass.
for frame in (train, test):
    frame['text'] = frame['text'].apply(
        lambda tweet: re.sub('_', ' ', re.sub(r'[^\w\s]', '', tweet))
    )



In [345]:
train[48:60]

Unnamed: 0_level_0,keyword,location,text,target,hashtag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
68,ablaze,Live On Webcam,check these out,0,[nsfw]
71,ablaze,England.,first night with retainers in it is quite weir...,0,[]
73,ablaze,"Sheffield Township, Ohio",deputies man shot before brighton home set ablaze,1,[]
74,ablaze,India,man wife get six years jail for setting ablaze...,1,[]
76,ablaze,Barbados,santa cruz head of the st elizabeth police sup...,0,[]
77,ablaze,Anaheim,police arsonist deliberately set black church ...,1,[]
78,ablaze,Abuja,noches elbestia happy to see my teammates and...,0,[]
79,ablaze,USA,trampling on turkmen flag later set it ablaze ...,1,"[kurds, diyala]"
80,ablaze,South Africa,truck ablaze r21 voortrekker ave outside or t...,1,[]
81,ablaze,"Sao Paulo, Brazil",set our hearts ablaze and every city was a gif...,0,[]


In [346]:
# Strip punctuation from every hashtag of both splits.
train['hashtag'] = train['hashtag'].apply(remove_punc)
test['hashtag'] = test['hashtag'].apply(remove_punc)


### remove digits from keywords

In [357]:
# Keywords in the raw data are URL-encoded, e.g. 'airplane%20accident'.
# Replace the encoded space ('%20') and any other digit with a blank so that
# tokenisation yields clean words. Stripping digits alone would leave a stray
# '%' behind, which would then be tokenised as its own "word".
# Missing (NaN) keywords become ''.
keyword_noise = re.compile(r'%20|\d')
train['keyword'] = train['keyword'].apply(lambda x: keyword_noise.sub(' ', x) if not isNaN(x) else '')
test['keyword'] = test['keyword'].apply(lambda x: keyword_noise.sub(' ', x) if not isNaN(x) else '')


In [358]:
train.loc[196:200]

Unnamed: 0_level_0,keyword,location,text,target,hashtag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
196,airplane accident,"19.600858, -99.047821",experts in france begin examining airplane deb...,1,[news]
197,airplane accident,Pennsylvania,strict liability in the context of an airplane...,1,[]
198,airplane accident,"Salt Lake City, Utah",your lifetime odds of dying from an airplane a...,0,[]
199,airplane accident,"Palo Alto, CA",experts in france begin examining airplane deb...,1,[]


In [356]:
re.sub(r'[^\D]', ' ', 'air20balx')

'air  balx'

### tokenize

In [69]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sherry.gow\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [359]:
# Split each tweet into a list of word tokens with NLTK.
train['text'] = train['text'].apply(nltk.word_tokenize)
test['text'] = test['text'].apply(nltk.word_tokenize)

In [360]:
# Split each keyword string into a list of word tokens with NLTK.
train['keyword'] = train['keyword'].apply(nltk.word_tokenize)
test['keyword'] = test['keyword'].apply(nltk.word_tokenize)

In [361]:
train.loc[196:200]

Unnamed: 0_level_0,keyword,location,text,target,hashtag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
196,"[airplane, accident]","19.600858, -99.047821","[experts, in, france, begin, examining, airpla...",1,[news]
197,"[airplane, accident]",Pennsylvania,"[strict, liability, in, the, context, of, an, ...",1,[]
198,"[airplane, accident]","Salt Lake City, Utah","[your, lifetime, odds, of, dying, from, an, ai...",0,[]
199,"[airplane, accident]","Palo Alto, CA","[experts, in, france, begin, examining, airpla...",1,[]


###  remove stopwords

In [365]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sherry.gow\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [366]:
# Start from NLTK's English stop-word list and append a few extras.
stop_words = stopwords.words('english')
stop_words.extend([
    'u',  # 'i love u' is semantically the same as 'i love you'
    # spelled-out numbers: want to remove numbers
    'one', 'two', 'three', 'four', 'five',
    'six', 'seven', 'eight', 'nine', 'ten',
])

In [367]:
def remove_stop_words(lista):
    """Return a new list with every token found in `stop_words` dropped."""
    kept = []
    for token in lista:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [368]:
# The three list-valued feature columns; stop words are removed from each one
# in both splits. A missing (None) entry becomes an empty list.
lst = ['text', 'hashtag', 'keyword']
for column in lst:
    for frame in (train, test):
        frame[column] = frame[column].apply(
            lambda tokens: [] if tokens is None else remove_stop_words(tokens)
        )
    



Unnamed: 0_level_0,keyword,location,text,hashtag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,[],,"[happened, terrible, car, crash]",[]
2,[],,"[heard, different, cities, stay, safe, everyone]",[earthquake]
3,[],,"[forest, fire, spot, pond, geese, fleeing, acr...",[]
9,[],,"[apocalypse, lighting]","[spokane, wildfires]"
11,[],,"[typhoon, soudelor, kills, china, taiwan]",[]
...,...,...,...,...
10861,[],,"[earthquake, safety, los, angeles, safety, fas...",[]
10865,[],,"[storm, ri, worse, last, hurricane, cityamp3ot...",[]
10868,[],,"[green, line, derailment, chicago]",[]
10874,[],,"[meg, issues, hazardous, weather, outlook, hwo]",[]


In [369]:
train.loc[196:200]

Unnamed: 0_level_0,keyword,location,text,target,hashtag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
196,"[airplane, accident]","19.600858, -99.047821","[experts, france, begin, examining, airplane, ...",1,[news]
197,"[airplane, accident]",Pennsylvania,"[strict, liability, context, airplane, acciden...",1,[]
198,"[airplane, accident]","Salt Lake City, Utah","[lifetime, odds, dying, airplane, accident]",0,[]
199,"[airplane, accident]","Palo Alto, CA","[experts, france, begin, examining, airplane, ...",1,[]


### lemmatization (note: 'us' is lemmatized to 'u')

In [97]:
from nltk import WordNetLemmatizer 
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sherry.gow\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [370]:
def lemmatize_list(lista):
    """Return a new list with each token of `lista` replaced by its WordNet lemma.

    Args:
        lista: iterable of word tokens (strings).

    Returns:
        List of lemmatized tokens, in the same order as the input.
    """
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(i) for i in lista]

In [371]:
# Lemmatize every feature column of BOTH splits. The test set must go through
# exactly the same preprocessing as the training set, otherwise the model is
# scored on differently-normalised tokens than it was trained on.
# A missing (None) entry becomes an empty list.
for l in lst:
    train[l] = train[l].apply(lambda x: lemmatize_list(x) if x is not None else [])
    test[l] = test[l].apply(lambda x: lemmatize_list(x) if x is not None else [])

### save preprocess file

In [372]:
# Persist the preprocessed frames.
# NOTE(review): index=False drops the index from the saved files; at load time
# 'id' was made the index, so the ids are not written. The token-list columns
# will also be written as text. Confirm this is acceptable before relying on
# these files.
train.to_csv('preprocess_train.csv', index=False)
test.to_csv('preprocess_test.csv', index=False)

### Find max text, hashtag and keyword length

In [373]:
# Longest token list per feature column, in the order given by `lst`
# (text, hashtag, keyword).
max_len_train = [train[column].apply(len).max() for column in lst]
max_len_test = [test[column].apply(len).max() for column in lst]
    

In [374]:
max_len_train

[23, 13, 2]

### install and understand word embedding

In [137]:
# !pip3 install spacy
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.3.1/en_core_web_lg-2.3.1.tar.gz (782.7 MB)
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py): started
  Building wheel for en-core-web-lg (setup.py): still running...
  Building wheel for en-core-web-lg (setup.py): finished with status 'done'
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.3.1-py3-none-any.whl size=782936124 sha256=b218d0332c3a45346f68bccce960c21dceee305e0e120b9d95d212b74541ff53
  Stored in directory: c:\users\sherry.gow\appdata\local\pip\cache\wheels\8b\bb\bb\bdc918f4b37d930a1be9ed876e7b2c2ee518a34803d78a248e
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.3.1
[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')


In [142]:
import en_core_web_lg
nlp = en_core_web_lg.load()

In [150]:
doc = nlp("mom dad")


(300,)

In [154]:
doc.vector

array([-6.28115013e-02,  3.05620015e-01, -3.15519989e-01,  3.11255008e-01,
        8.75004977e-02, -4.12320010e-02,  3.44820023e-01, -8.56235027e-01,
        3.60069990e-01,  2.51300001e+00, -4.11954999e-01,  3.99410009e-01,
        8.68709981e-02, -1.40648499e-01, -4.97725010e-01,  1.14992499e-01,
        7.12064952e-02,  2.59263009e-01,  1.75907493e-01,  4.31775004e-01,
       -1.37494996e-01, -6.74854994e-01,  3.62884998e-02, -3.31864990e-02,
        6.85489997e-02,  1.65477484e-01, -3.04780006e-01, -2.54519999e-01,
       -2.12589994e-01, -5.08745015e-01, -1.91990003e-01,  9.50000249e-04,
        3.19710016e-01,  5.07709011e-02,  5.62629998e-01,  9.37424973e-02,
        3.72875005e-01, -1.54286996e-01,  1.79989990e-02,  2.53154989e-02,
       -1.46919996e-01, -3.45575005e-01,  9.03474987e-02,  4.14749980e-02,
       -1.22122504e-01,  2.38005012e-01, -1.83737487e-01, -5.23784995e-01,
        6.53060019e-01,  2.85200030e-02,  9.54950005e-02, -2.43465006e-01,
        3.75999957e-02, -

In [148]:
doc[0].similarity(doc[1])

0.8566172

### use word embedding to create training set

In [383]:
# Shape of the embedding tensor, (m, n, r):
#   m - number of training tweets (7613)
#   n - sum of the maximum text, hashtag and keyword lengths (23 + 13 + 2 = 38)
#   r - width of one word vector (300)
m, r = train.shape[0], 300
n = sum(max_len_train)
store_train = np.zeros((m, n, r))


In [389]:
# Move 'id' back into a regular column so rows can be addressed by position 0..m-1.
train, test = train.reset_index(), test.reset_index()

In [393]:
# Fill the (m, n, r) tensor with one spaCy vector per token. Each tweet's row
# is laid out as [text slots | hashtag slots | keyword slots]. The section
# offsets are derived from max_len_train rather than hard-coded, so they stay
# correct if preprocessing changes the maximum lengths.
hashtag_offset = max_len_train[0]
keyword_offset = max_len_train[0] + max_len_train[1]
for i in range(m): # m
    if i % 100 == 99:
        print(i)  # progress marker every 100 tweets
    for j, token in enumerate(train['text'][i]): # e.g. ['love','peace','compassion','wisdom']
        store_train[i,j,:]=nlp(token)[0].vector
    for j, token in enumerate(train['hashtag'][i]):
        tag_doc = nlp(token)
        # An empty hashtag string ('') produces a Doc with no tokens, so there
        # is no tag_doc[0]; fall back to the Doc-level vector in that case.
        store_train[i,hashtag_offset+j,:]=tag_doc[0].vector if len(tag_doc) else tag_doc.vector
    for j, token in enumerate(train['keyword'][i]):
        store_train[i,keyword_offset+j,:]=nlp(token)[0].vector

99
199
299
399
499
599
699
799
899
999
1099
1199
1299
1399
1499
1599
1699
1799
1899
1999
2099
2199
2299
2399
2499
2599
2699
2799
2899
2999
3099
3199
3299
3399
3499
3599
3699
3799
3899
3999
4099
4199
4299
4399
4499
4599
4699
4799
4899
4999
5099
5199
5299
5399
5499
5599
5699
5799
5899
5999
6099
6199
6299
6399
6499
6599
6699
6799
6899
6999
7099
7199
7299
7399
7499
7599


In [394]:
# Cache the embedded training tensor on disk (it is reloaded with np.load before the labels are built).
np.save('store_train.npy', store_train)

In [400]:
m

7613

In [401]:
# Embed the test tweets into a tensor with the same [text | hashtag | keyword]
# layout as the training tensor. Section widths and offsets come from
# max_len_train (the model's input layout), not from hard-coded numbers.
m=test.shape[0]
store_test=np.zeros((m,n,r))
text_width, hashtag_width, keyword_width = max_len_train
hashtag_offset = text_width
keyword_offset = text_width + hashtag_width
for i in range(m): # m
    if i % 100 == 99:
        print(i)  # progress marker every 100 tweets
    # Each list is cut to its section width, so a test tweet longer than
    # anything seen in training cannot spill into the neighbouring section or
    # run past the end of the tensor.
    for j, token in enumerate(test['text'][i][:text_width]):
        store_test[i,j,:]=nlp(token)[0].vector
    for j, token in enumerate(test['hashtag'][i][:hashtag_width]):
        tag_doc = nlp(token)
        # An empty hashtag string ('') produces a Doc with no tokens, so there
        # is no tag_doc[0]; fall back to the Doc-level vector in that case.
        store_test[i,hashtag_offset+j,:]=tag_doc[0].vector if len(tag_doc) else tag_doc.vector
    for j, token in enumerate(test['keyword'][i][:keyword_width]):
        store_test[i,keyword_offset+j,:]=nlp(token)[0].vector

99
199
299
399
499
599
699
799
899
999
1099
1199
1299
1399
1499
1599
1699
1799
1899
1999
2099
2199
2299
2399
2499
2599
2699
2799
2899
2999
3099
3199


In [402]:
# Cache the embedded test tensor on disk.
np.save('store_test.npy',store_test)

### slice the train and test set

In [404]:
# Split the training tensor back into its text / hashtag / keyword sections
# along the token axis and show the shape of each piece.
maxi_text, maxi_hashtag, maxi_keyword = max_len_train

hashtag_end = maxi_text + maxi_hashtag
store_train_text = store_train[:, :maxi_text, :]
store_train_hashtag = store_train[:, maxi_text:hashtag_end, :]
store_train_keyword = store_train[:, -maxi_keyword:, :]

for section in (store_train_text, store_train_hashtag, store_train_keyword):
    print(section.shape)

(7613, 23, 300)
(7613, 13, 300)
(7613, 2, 300)


In [405]:
# Split the TEST tensor into its text / hashtag / keyword sections. These must
# be sliced from store_test; slicing store_train here would silently hand
# training data to anything that consumes the "test" sections.
store_test_text=store_test[:,:maxi_text,:]
store_test_hashtag=store_test[:,maxi_text:maxi_text+maxi_hashtag,:]
store_test_keyword=store_test[:,-maxi_keyword:,:]

### modeling

In [407]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.5.0-cp38-cp38-win_amd64.whl (422.6 MB)
Collecting tensorboard~=2.5
  Downloading tensorboard-2.5.0-py3-none-any.whl (6.0 MB)
Collecting grpcio~=1.34.0
  Downloading grpcio-1.34.1-cp38-cp38-win_amd64.whl (2.9 MB)
Collecting flatbuffers~=1.12.0
  Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting tensorflow-estimator<2.6.0,>=2.5.0rc0
  Downloading tensorflow_estimator-2.5.0-py2.py3-none-any.whl (462 kB)
Collecting absl-py~=0.10
  Downloading absl_py-0.13.0-py3-none-any.whl (132 kB)
Collecting astunparse~=1.6.3
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting google-pasta~=0.2
  Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting h5py~=3.1.0
  Downloading h5py-3.1.0-cp38-cp38-win_amd64.whl (2.7 MB)
Collecting opt-einsum~=3.3.0
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
Collecting keras-nightly~=2.5.0.dev
  Downloading keras_nightly-2.5.0.dev2021032900-py2.py3-none-any.whl (

In [408]:
!pip install keras

Collecting keras
  Downloading Keras-2.4.3-py2.py3-none-any.whl (36 kB)
Installing collected packages: keras
Successfully installed keras-2.4.3


In [434]:
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
# from tensorflow.keras.layers import Input, Dropout, GRU, BatchNormalization, TimeDistributed, Reshapre, Dense, Conv1D, Concatenate
import tensorflow.keras.models as M 
import tensorflow.keras

In [435]:
# Model input: one tweet is a sequence of store_train.shape[1] token slots,
# each a vector of width store_train_text.shape[2].
# Input is reached through the `L` alias (tensorflow.keras.layers), the only
# form that is actually imported, so the cell works on a fresh kernel.
inp=L.Input(shape=(store_train.shape[1],store_train_text.shape[2]))

# First recurrent block: GRU over the full sequence, then dropout and batch norm.
mid=L.GRU(units=300, return_sequences=True)(inp)
mid=L.Dropout(0.6)(mid)
mid=L.BatchNormalization()(mid)

In [436]:
# Second recurrent block: GRU (returning the full sequence), dropout, batch norm.
mid=L.GRU(units=300, return_sequences=True)(mid)
mid=L.Dropout(0.6)(mid)
mid=L.BatchNormalization()(mid)  

In [437]:
# Third recurrent block: GRU (returning the full sequence), dropout, batch norm.
mid=L.GRU(units=300, return_sequences=True)(mid)
mid=L.Dropout(0.6)(mid)
mid=L.BatchNormalization()(mid)  

In [438]:
# Classification head: reduce every time step to a single ReLU unit, reshape
# the result to a flat vector with one value per time step, then map it to a
# 2-way softmax (one output per class).
mid=L.Dropout(0.6)(mid)
mid=L.TimeDistributed(L.Dense(1,activation='relu'))(mid)
mid=L.Reshape((mid.shape[1],))(mid)
mid=L.Dropout(0.6)(mid)
mid=L.BatchNormalization()(mid) 
outp=L.Dense(2,activation='softmax')(mid)

In [441]:
# Wire the input tensor and the softmax output into a Keras model.
model=M.Model(inputs=inp, outputs=outp) 

In [442]:
model

<tensorflow.python.keras.engine.functional.Functional at 0x1d714618790>

In [440]:
help(tf.keras.models)

Help on package tensorflow.keras.models in tensorflow.keras:

NAME
    tensorflow.keras.models - Code for model cloning, plus model-related API entries.

PACKAGE CONTENTS


FILE
    c:\users\sherry.gow\anaconda3\lib\site-packages\tensorflow\keras\models\__init__.py




### create labels for the training sets

In [443]:
# Reload the cached training tensor and build one-hot labels:
# column 0 is set for target 0, column 1 for target 1.
store_train = np.load('store_train.npy')

m = store_train.shape[0]
train_Y = np.zeros((m, 2))
train_Y[np.arange(m), train['target'].to_numpy()[:m]] = 1

In [451]:
train_Y[1,1]

1.0

In [452]:
# Shuffle features and labels in place. The global NumPy RNG is re-seeded with
# the same value before each shuffle, which is intended to apply the same
# row permutation to store_train and train_Y (both have m rows) so that every
# sample stays paired with its label.
sed=13
np.random.seed(sed)
np.random.shuffle(store_train)
np.random.seed(sed)
np.random.shuffle(train_Y)

In [454]:
# Adam with categorical cross-entropy, matching the one-hot labels and the
# 2-way softmax output. `metrics` is passed as a list, the form documented
# for Model.compile.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [455]:
# Train on every row except the last 500 of the shuffled data; those 500 are
# kept aside and scored separately with model.evaluate.
model.fit(store_train[0:-500,:,:], train_Y[0:-500,:], batch_size=64, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1d7145caeb0>

### evaluate on the hold-out validation set (last 500 shuffled rows)

In [456]:
# Score the 500 rows that were left out of training; the result is [loss, accuracy].
model.evaluate(store_train[-500:,:,:], train_Y[-500:,:])



[1.1093380451202393, 0.7979999780654907]

### test set

In [457]:
# Predict class probabilities for the test tensor and label a tweet 1 when the
# probability in column 1 is at least 0.5, otherwise 0.
test_Y = model.predict(store_test)

test_label = (test_Y[:, 1] >= 0.5).astype(int).tolist()

In [458]:
# Assemble the submission (tweet id + predicted label), preview the first ten
# rows, and write it to disk without the DataFrame index.
filename = 'submission_nlp_tweets.csv'

submission = pd.DataFrame({'id': test['id'], 'target': test_label})
print(submission.head(10))
submission.to_csv(filename, index=False)

   id  target
0   0       1
1   2       0
2   3       1
3   9       0
4  11       1
5  12       1
6  21       0
7  22       0
8  27       0
9  29       0


# test

In [234]:
train.loc[48:55]

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
48,ablaze,Birmingham,@bbcmtd wholesale markets ablaze http://t.co/l...,1
49,ablaze,Est. September 2012 - Bristol,we always try to bring the heavy. #metal #rt h...,0
50,ablaze,AFRICA,#africanbaze: breaking news:nigeria flag set a...,1
52,ablaze,"Philadelphia, PA",crying out for more! set me ablaze,0
53,ablaze,"London, UK",on plus side look at the sky last night it was...,0
54,ablaze,Pretoria,@phdsquares #mufc they've built so much hype a...,0
55,ablaze,World Wide!!,inec office in abia set ablaze - http://t.co/3...,1


In [46]:
test['text'][1]

'heard about is different cities, stay safe everyone.'

In [53]:
train['text'][15]

'what is up man?'

In [67]:
remove_punc(["how are you?", "what's up"])

['how are you', 'whats up']

In [70]:
train['text'][15]

['what', 'is', 'up', 'man']

In [100]:
lemmatize_list(remove_stop_words (nltk.word_tokenize  ('this is a stopword removal ten')))

['stopword', 'removal']