## prework

* import dependencies and load library
* load data
* check data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of each file.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# basic dependencies
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns

import re
from tqdm import tqdm
import string

In [None]:
# load data
# Read train.csv and test.csv from the competition's input directory.
# NOTE: the name `test` is rebound later in the notebook to a slice of the
# padded sequence array, so re-run this cell before re-using it as a DataFrame.
tweet = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
# Last expression of the cell: shows both shape tuples.
tweet.shape, test.shape

In [None]:
# check data
def show_info(data, is_matrix_transpose=False):
    """Print a quick textual overview of a DataFrame.

    Prints, in order: the shape, the column index, the per-column null
    counts, the ``describe()`` summary, the ``head()`` preview, and finally
    a run of blank lines as a visual separator.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.
    is_matrix_transpose : bool, default False
        When True, the ``describe()`` and ``head()`` tables are printed
        transposed and the preview label says so.

    Returns
    -------
    None
    """
    n_rows, n_cols = data.shape
    # basic shape
    print('data shape is: {}   sample number {}   attribute number {}\n'.format(data.shape, n_rows, n_cols))
    # attribute(key)
    print('data columns number {}  \nall columns: {}\n'.format(len(data.columns), data.columns))
    # value's null
    print('data all attribute count null:\n', data.isna().sum())
    # data value analysis and data demo
    summary, preview = data.describe(), data.head()
    if is_matrix_transpose:
        summary, preview = summary.T, preview.T
        # The label must match what is printed: this branch IS transposed.
        demo_label = 'data demo with matrix transpose: '
    else:
        demo_label = 'data demo without matrix transpose: '
    print('data value analysis: ', summary)
    print(demo_label, preview)
    print('\n' * 5)
        
# Print the overview for the frame read from train.csv, then for the one read from test.csv.
show_info(tweet)
show_info(test)

## EDA

* target classification
* value analysis
    * split => number of total-words, per-word length
    * corpus (both target0 and target1)
    * punctuations
    * some common words
* N-gram analysis (n ≥ 2; used to inspect the relationships between different kinds of attributes)

In [None]:
from collections import defaultdict, Counter
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
# use for n-gram analysis
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# plot and check => target classification
# Bar chart of how many rows carry each value of the `target` column.
class_counts = tweet['target'].value_counts()
class_counts.plot(kind='bar')

In [None]:
# split and number of words
# One histogram per class of the whitespace-token count of each tweet.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
panels = (
    (ax1, 1, 'red', 'disaster tweets'),
    (ax2, 0, 'green', 'Not disaster tweets'),
)
for axis, label, colour, title in panels:
    in_class = tweet['target'] == label
    tweet_len = tweet.loc[in_class, 'text'].str.split().map(len)
    axis.hist(tweet_len, color=colour)
    axis.set_title(title)
fig.suptitle('Words in a tweet')
plt.show()

In [None]:
# Distribution of the mean word length per tweet, one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
for axis, label, colour, title in (
    (ax1, 1, 'red', 'disaster'),
    (ax2, 0, 'green', 'Not disaster'),
):
    tokens = tweet.loc[tweet['target'] == label, 'text'].str.split()
    mean_word_len = tokens.map(lambda words: np.mean([len(w) for w in words]))
    # sns.distplot is deprecated; histplot with a KDE overlay on a density
    # scale is its documented replacement for this kind of figure.
    sns.histplot(mean_word_len, kde=True, stat='density', ax=axis, color=colour)
    axis.set_title(title)
fig.suptitle('Average word length in each tweet')
# End on show() so the cell does not echo the repr of the suptitle Text object.
plt.show()

In [None]:
def create_corpus(target):
    """Collect every whitespace-separated token from the tweets of one class.

    Reads the notebook-level ``tweet`` DataFrame, keeps the rows whose
    ``target`` column equals ``target``, splits each ``text`` value on
    whitespace and returns all tokens as one flat list, in row order.
    """
    token_lists = tweet[tweet['target'] == target]['text'].str.split()
    return [token for tokens in token_lists for token in tokens]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
# Build the corpus of each class and chart its ten most frequent stop words:
# class 0 on the left axis (default colour), class 1 on the right axis (green).
for target_value, axis, bar_style in ((0, ax1, {}), (1, ax2, {'color': 'green'})):
    corpus = create_corpus(target_value)
    dic = defaultdict(int)
    for word in (token for token in corpus if token in stop):
        dic[word] += 1
    # Highest count first; keep the first ten entries.
    top = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)[:10]
    x, y = zip(*top)
    axis.bar(x, y, **bar_style)

In [None]:
# punctuations
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# Use a set for exact single-character membership. Testing a token against the
# string.punctuation str itself is a substring test, which also matches
# multi-character tokens such as "()" or "<=>" that are contiguous in it.
special = set(string.punctuation)

# Class 1 is drawn first and class 0 last, so `corpus` is left holding the
# class-0 tokens, which the following "common words" cell reads.
for target_value, axis, bar_style in ((1, ax1, {}), (0, ax2, {'color': 'green'})):
    corpus = create_corpus(target_value)
    dic = defaultdict(int)
    for i in corpus:
        if i in special:
            dic[i] += 1
    # zip(*...) of an empty dict cannot be unpacked into x, y; skip the bar
    # chart for a class in which no standalone punctuation token was found.
    if dic:
        x, y = zip(*dic.items())
        axis.bar(x, y, **bar_style)

In [None]:
# NOTE(review): this figure is created in a cell of its own. With the inline
# backend enabled earlier (`%matplotlib inline`) it presumably renders as an
# empty canvas and is not the figure the next cell's barplot draws on — confirm.
plt.figure(figsize=(10, 5))


In [None]:
# common words
# `corpus` is whatever the previous cell left behind (its last assignment is
# create_corpus(0)), so this chart describes that corpus only.
# Create the figure in the same cell as the plot so the 10x5 size actually
# applies to this chart.
fig, ax = plt.subplots(figsize=(10, 5))
counter = Counter(corpus)
most = counter.most_common()
x, y = [], []
# Look only at the 40 most frequent tokens and drop the stop words among them,
# so fewer than 40 bars may be drawn.
for word, count in most[:40]:
    if word in stop:
        continue
    x.append(word)
    y.append(count)
# Counts on the horizontal axis, words on the vertical axis.
sns.barplot(x=y, y=x, ax=ax)

In [None]:
# n-gram
def get_top_tweet_bigrams(corpus, n=None):
    """Rank the bigrams found in ``corpus`` by total occurrence count.

    A ``CountVectorizer`` restricted to 2-grams is fitted on ``corpus`` and
    then used to transform it; the column sums of the resulting matrix give
    each bigram's total count.

    Parameters
    ----------
    corpus : iterable of str
        Documents to scan.
    n : int or None, default None
        Number of leading entries to return; ``None`` returns all of them.

    Returns
    -------
    list of (bigram, count) tuples, highest count first.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    vectorizer.fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        ((bigram, totals[0, column]) for bigram, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

# Horizontal bar chart of the ten most frequent bigrams over all training tweets.
plt.figure(figsize=(10, 5))
top_tweet_bigrams = get_top_tweet_bigrams(tweet['text'])[:10]
# Transpose the (bigram, count) pairs into one list of bigrams and one of counts.
x, y = (list(column) for column in zip(*top_tweet_bigrams))
sns.barplot(x=y, y=x)

## data cleaning

* concat and overall conversion
* remove
    * remove urls
    * remove html tags
    * remove emojis
    * remove punctuations
* correct spelling errors

In [None]:
!pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker

In [None]:
# concat
# Stack the train.csv rows on top of the test.csv rows so the cleaning steps
# below run once over both. The later split back into train/test slices by
# tweet.shape[0] relies on this order. No ignore_index is passed, so each part
# keeps its own original row labels.
df = pd.concat([tweet, test])
df.shape

In [None]:
# remove url
def remove_URL(text):
    """Return ``text`` with every URL-looking substring deleted.

    A match is anything that starts with ``http://`` or ``https://``, or with
    ``www.``, and runs up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# remove html tags
def remove_html(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy: each span runs from a ``<`` to the nearest
    following ``>`` on the same line.
    """
    return re.sub(r'<.*?>', r'', text)

# remove emojis
def remove_emojis(text):
    """Return ``text`` with every run of characters in the ranges below deleted.

    NOTE(review): the last range runs from U+24C2 to U+1F251, which covers far
    more than emoji (CJK ideographs, for example, fall inside it) — confirm
    that stripping those characters is intended.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    # Assemble one character class from the ranges and match runs of it.
    emoji_pattern = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# remove punctuations
def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    return ''.join(char for char in text if char not in string.punctuation)

# remove
# Run the cleaners over the text column one after another, in this fixed order:
# URLs, HTML tags, emojis, punctuation. Punctuation goes last because the URL
# and tag patterns match on characters such as ':', '/', '.', '<' and '>'.
for cleaner in (remove_URL, remove_html, remove_emojis, remove_punct):
    df['text'] = df['text'].apply(cleaner)

In [None]:
# correct
spell = SpellChecker()
def correct_spellings(text, checker=None):
    """Replace the words of ``text`` that the spell checker does not know.

    The text is split on whitespace, each word reported by
    ``checker.unknown`` is replaced with ``checker.correction(word)``, and
    the words are joined back with single spaces (so runs of whitespace in
    the input collapse to one space).

    Parameters
    ----------
    text : str
        Text to correct.
    checker : object, optional
        Object providing ``unknown(words)`` and ``correction(word)``.
        Defaults to the notebook-level ``spell`` instance.

    Returns
    -------
    str
        The corrected text.
    """
    if checker is None:
        checker = spell
    words = text.split()
    misspelled_words = checker.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # Fall back to the original word if no correction is offered
            # (a None result would otherwise break the join below).
            corrected_text.append(checker.correction(word) or word)
        else:
            corrected_text.append(word)
    # Join with a space: joining with '' would glue every word together.
    return ' '.join(corrected_text)

# correct
# %time df['text'] = df['text'].apply(lambda x: correct_spellings(x))

## Feature

* GloVe for vectorization


In [None]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

In [None]:
# def create_corpus(df):
#     corpus = []
#     for tweet in tqdm(df['text']):
#         words = [word.lower() for word in word_tokenize(tweet) if ((word.isalpha() == 1) & (word not in stop))]
#         corpus.append(words)
#     return corpus

def create_corpus(df):
    """Tokenise every entry of ``df['text']`` for the embedding model.

    Each text is split with ``word_tokenize``; a token is kept, lower-cased,
    when it is purely alphabetic and its lower-cased form is not in ``stop``.

    NOTE: this definition rebinds the name ``create_corpus`` that the EDA
    cells earlier in the notebook call with a target label; re-run the
    earlier definition before re-running those cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column of strings.

    Returns
    -------
    list of list of str
        One token list per row, in row order.
    """
    corpus = []
    for text in tqdm(df['text']):
        words = []
        for token in word_tokenize(text):
            lowered = token.lower()
            # Compare the lower-cased form against the stop list: checking the
            # raw token lets capitalised stop words ("The", "And") slip through.
            if token.isalpha() and lowered not in stop:
                words.append(lowered)
        corpus.append(words)
    return corpus
corpus = create_corpus(df)

In [None]:
# Map each word in the GloVe text file to its vector: every line holds the
# word followed by its coefficients, separated by whitespace.
embedding_dict = {}
# Read explicitly as UTF-8 rather than relying on the platform default. The
# with-block closes the file on exit, so no separate close() call is needed.
with open('/kaggle/input/nlp-with-disaster-tweets-glove/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors

## Model

* parameter
* build model
* train, test split


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.initializers import Constant
from keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [None]:
# parameters
# Length every token sequence is padded or truncated to.
MAX_LEN = 50
# Fit the tokenizer on the token lists built above, then map every token list
# to a sequence of integer indices.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences = tokenizer_obj.texts_to_sequences(corpus)

# Bring all sequences to MAX_LEN; both truncation and padding happen at the end.
tweet_pad = pad_sequences(sequences, maxlen=MAX_LEN, truncating='post', padding='post')

In [None]:
# word -> integer index mapping from the fitted tokenizer.
word_index = tokenizer_obj.word_index
print('Number of unique words', len(word_index))

In [None]:
# One 100-wide row per index, with one more row than there are words.
# Rows start as zeros and stay zero for any word missing from embedding_dict.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 100))

for token, row in tqdm(word_index.items()):
    # Indices above num_words are skipped.
    if row <= num_words:
        vector = embedding_dict.get(token)
        if vector is not None:
            embedding_matrix[row] = vector

In [None]:
# build model
# Frozen embedding layer initialised from embedding_matrix, followed by
# spatial dropout, a 64-unit LSTM and a single sigmoid output unit.
embedding = Embedding(
    num_words,
    100,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_LEN,
    trainable=False,
)
model = Sequential([
    embedding,
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy on the sigmoid output, optimised with Adam.
optimizer = Adam(learning_rate=1e-5)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()

In [None]:
# Undo the earlier concat: the first tweet.shape[0] padded rows are the
# training tweets, the remaining rows are the test tweets.
# NOTE: this rebinds `test` from the DataFrame read from test.csv to a slice
# of the padded array.
train=tweet_pad[:tweet.shape[0]]
test=tweet_pad[tweet.shape[0]:]

In [None]:
# Hold out 15% of the labelled rows for validation. A fixed random_state makes
# the split identical on every run, so results can be compared.
X_train, X_test, y_train, y_test = train_test_split(
    train, tweet['target'].values, test_size=0.15, random_state=42
)
X_train.shape, X_test.shape

In [None]:
# Train for 15 epochs in batches of 4, evaluating on the held-out split after
# each epoch; verbose=2 logs one line per epoch. The returned history object is
# kept in `history`.
history = model.fit(X_train, y_train, batch_size=4, epochs=15, validation_data=(X_test, y_test), verbose=2)

In [None]:
# submit
# Ids come from the sample submission file; predictions come from the padded
# test sequences.
sample_sub = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
pred = model.predict(test)
# Round the sigmoid outputs to 0/1 and flatten to one dimension. reshape(-1)
# adapts to however many rows were predicted instead of hard-coding the count.
pred = np.round(pred).astype(int).reshape(-1)
submission = pd.DataFrame({'id': sample_sub['id'].values.tolist(), 'target': pred})
submission.to_csv('submission.csv', index=False)