In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os 
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import plotly.express as px

import re
import string
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import RepeatedStratifiedKFold,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
import pickle

from wordcloud import WordCloud

# Load Dataset

In [None]:
# Read the competition CSVs: training frame (X), test frame and the sample
# submission template, then report the train/test shapes and preview X.
X, test, submit = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
        "/kaggle/input/nlp-getting-started/sample_submission.csv",
    )
)
for frame in (X, test):
    print(frame.shape)
X.head(6)

In [None]:
# Keep the label column separately, then show the number of missing values per column.
Y = X.loc[:, "target"]
X.isna().sum()

# Visualization

In [None]:
# Class balance of the labels.
# The counts are passed to seaborn positionally (index -> x, values -> y)
# instead of by column name: the name of the column produced by
# `value_counts().to_frame()` differs between pandas versions ("target" vs
# "count"), so referring to it as y="target" is not portable.
target_counts = Y.value_counts()
print(target_counts.to_frame())
ax = sns.barplot(x=target_counts.index, y=target_counts.values)
ax.set(xlabel="target", ylabel="count");

In [None]:
# Collect the rows of X belonging to the 25 most frequent keywords, grouped in
# descending keyword-frequency order.
# The per-keyword slices are gathered in a list and concatenated once:
# `DataFrame.append` no longer exists in pandas 2.x, and growing a frame inside
# a loop copies it on every iteration.
top_keywords = X['keyword'].value_counts().head(25).index
keyword_frames = [X[X['keyword'] == key] for key in top_keywords]
# pd.concat rejects an empty list, so fall back to an empty slice of X.
sorting = pd.concat(keyword_frames) if keyword_frames else X.iloc[0:0]

In [None]:
# Display the rows of X whose keyword is among the 25 most frequent ones.
sorting

In [None]:
# Histogram of the top-25 keywords, split by target, with a box-plot marginal.
histogram_options = dict(
    x='keyword',
    color='target',
    marginal="box",
    hover_data=sorting.columns,
    title='언급 횟수 상위 25개의 keyword',
    color_discrete_map={"0": "RebeccaPurple", "1": "MediumPurple"},
)
fig = px.histogram(sorting, **histogram_options)
fig.show()

치명적 홍수 아마겟 침몰 피해 신체에 피해 % 20 가방 피난 공포 충돌 사이렌 트위스터 풍랑 싱크홀 침몰 지옥불 무기 기근 폭발 회오리 지진 탈선

In [None]:
# Share of each target value among tweets whose text contains "disaster".
mentions_disaster = X['text'].str.contains("disaster") == True
contain_disaster = X[mentions_disaster]
target_share = contain_disaster['target'].value_counts()

fig = plt.figure(figsize=(6, 6))
plt.pie(target_share, labels=target_share.index, autopct='%.1f%%')
plt.title("Target for Disaster contained Text")
plt.show()

Reference: https://www.kaggle.com/code/tahimakhatun/natural-language-processing-with-disaster-tweets

# Text Cleaning & Tokenization

In [None]:
# English stop-word list from NLTK; read as a module-level global by stem().
stop_words = stopwords.words('english')
def stem(content, ignored_words=None):
    """Clean a raw tweet into a lowercase, space-separated string of words.

    Despite the name, no stemming is performed. The steps are:

    1. remove whole URLs (any token starting with ``http`` or ``www.``),
    2. replace every character that is not an ASCII letter with a space
       (this also removes digits, punctuation and non-ASCII letters),
    3. lowercase and split on whitespace,
    4. drop the words contained in ``ignored_words``.

    Args:
        content: Raw text. Anything that is not a ``str`` (e.g. ``None`` or a
            NaN from a DataFrame) is treated as empty text.
        ignored_words: Optional collection of lowercase words to drop.
            Defaults to the module-level ``stop_words``.

    Returns:
        The cleaned text; an empty string when nothing remains.
    """
    if not isinstance(content, str):
        return ""
    if ignored_words is None:
        ignored_words = stop_words

    # Remove the entire URL token. Stripping only the literal "http" would
    # leave fragments such as "t", "co" and the random link id as tokens.
    without_urls = re.sub(r"\bhttp\S*|\bwww\.\S+", " ", content)
    # One pass is enough: anything that is not an ASCII letter becomes a space,
    # so no separate digit / non-word / punctuation passes are needed.
    letters_only = re.sub(r"[^a-zA-Z]", " ", without_urls)
    tokens = letters_only.lower().split()
    return " ".join(token for token in tokens if token not in ignored_words)

In [None]:
# Replace the raw tweet text with its cleaned form in both frames.
for frame in (X, test):
    frame['text'] = frame['text'].apply(stem)

In [None]:
# Add a `words` column holding the number of whitespace-separated tokens per text.
for frame in (X, test):
    frame['words'] = frame['text'].str.split().str.len()

In [None]:
# Side-by-side word clouds of the cleaned text for each target class.
disaster_tweets = X.loc[X['target'] == 1, 'text']
non_disaster_tweets = X.loc[X['target'] == 0, 'text']

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[16, 8])
panels = (
    (ax1, disaster_tweets, 'Disaster Tweets'),
    (ax2, non_disaster_tweets, 'Non Disaster Tweets'),
)
for axis, tweets, heading in panels:
    cloud = WordCloud(background_color='white', width=600, height=400)
    axis.imshow(cloud.generate(" ".join(tweets)))
    axis.axis('off')
    axis.set_title(heading, fontsize=40)

In [None]:
from collections import Counter
count= Counter()
def count_word(text):
    """Count whitespace-separated words across a collection of strings.

    A fresh Counter is built on every call, so repeated calls (or re-running
    the cell) never accumulate counts from an earlier call.

    Args:
        text: Object whose ``.values`` iterates over strings, e.g. a pandas
            Series of texts.

    Returns:
        collections.Counter mapping each word to its number of occurrences.
    """
    tally = Counter()
    for entry in text.values:
        tally.update(entry.split())
    return tally
counter= count_word(X.text)

In [None]:
# Five most frequent tokens in the cleaned training text, with their counts.
counter.most_common(5)

# Data split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned training text for validation; the split is
# shuffled, seeded and stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X['text'],
    Y,
    train_size=0.8,
    random_state=42,
    shuffle=True,
    stratify=Y,
)

In [None]:
# Wrap the splits as tf.data datasets: (text, label) pairs for train and
# validation, text only for the unlabeled test set.
to_dataset = tf.data.Dataset.from_tensor_slices

train_data = to_dataset((X_train.to_numpy(), y_train.to_numpy()))
val_data = to_dataset((X_val.to_numpy(), y_val.to_numpy()))
test_data = to_dataset(test['text'].to_numpy())

In [None]:
# Print the first three (text, label) examples of the training dataset.
for text, label in train_data.take(3).as_numpy_iterator():
    print('Text: ', text)
    print('Label: ', label)

In [None]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64


def _batch_and_prefetch(dataset):
    """Batch `dataset` with BATCH_SIZE and prefetch with tf.data.AUTOTUNE."""
    return dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


# Only the training stream is shuffled; validation and test keep their order.
train_data = _batch_and_prefetch(train_data.shuffle(BUFFER_SIZE))
val_data = _batch_and_prefetch(val_data)
test_data = _batch_and_prefetch(test_data)

# Text Vectorization

In [None]:
# Prefer the stable `tf.keras.layers.TextVectorization` name. The
# `layers.experimental.preprocessing` import path is kept only as a fallback
# for older TensorFlow releases that do not expose the stable name.
try:
    TextVectorization = tf.keras.layers.TextVectorization
except AttributeError:
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vocab_size= 20000
max_len= 15
# Maps each whitespace-separated token to an integer id and pads/truncates
# every text to `max_len` ids.
# NOTE(review): the +2 presumably reserves ids for padding and out-of-vocabulary
# tokens -- confirm against the TextVectorization documentation.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size + 2,
    split="whitespace",
    output_mode="int",  # other modes: "multi_hot" / "count" / "tf_idf"
    output_sequence_length=max_len,
)

In [None]:
# Learn the vocabulary from the training text only.
# A second adapt() call does not extend the vocabulary -- it discards the
# existing state and rebuilds it from the new data -- so adapting on the
# validation set afterwards would leave a vocabulary built from validation
# text alone, and would leak validation data into preprocessing.
vectorize_layer.adapt(train_data.map(lambda text, label: text))

In [None]:
# Preview the training frame; `text` now holds the cleaned tweets and `words` their token counts.
X.head()

# Create Model & Training

In [None]:
# Binary classifier that takes raw strings: text vectorization -> embedding ->
# a stack of (Bi)LSTM and Dense/Dropout layers -> single sigmoid unit.
model = tf.keras.Sequential([
    vectorize_layer,
    # input_dim must cover every id the vectorizer can emit. The vectorizer is
    # built with max_tokens=vocab_size + 2, so ids range over
    # 0 .. vocab_size + 1; an Embedding with input_dim=vocab_size + 1 would
    # index out of range once the vocabulary fills up.
    tf.keras.layers.Embedding(input_dim= vocab_size+2, output_dim=max_len,mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, activation='tanh',return_sequences=True)),
    tf.keras.layers.Dense(64,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64,activation='relu'),
    tf.keras.layers.LSTM(64,return_sequences=True),
    tf.keras.layers.Dense(64,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    # Last recurrent layer returns only its final state, collapsing the
    # sequence axis before the classification head.
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(32,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(16,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1,activation='sigmoid')
    ])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

In [None]:
# Draw the layer graph including tensor shapes.
# NOTE(review): presumably requires pydot/graphviz in the environment -- confirm.
tf.keras.utils.plot_model(model,show_shapes=True)

In [None]:
# Train on the pre-batched dataset.
# - `batch_size` is not passed: train_data/val_data are already batched with
#   BATCH_SIZE, and Keras does not accept a batch size for dataset inputs.
# - `validation_steps` is not passed: the validation dataset is finite, so it
#   is consumed in full every epoch; asking for more steps than it has
#   batches makes it run out of data mid-validation.
history = model.fit(
    train_data,
    epochs=40,
    validation_data=val_data,
)

# Accuracy & Predict

In [None]:
# Evaluate on the same batched validation dataset used during training rather
# than on the raw pandas objects, so the strings reach the model through the
# same tf.data pipeline. evaluate() returns [loss, accuracy] for the compiled
# metrics.
val_loss, val_accuracy = model.evaluate(val_data)
print('\nAccuracy : {:.4f}'.format(val_accuracy))

In [None]:
# One figure per training metric recorded in `history`: the loss curve first,
# then the accuracy curve. Each entry is (history key, legend label, legend position).
training_curves = (
    ('loss', 'Trainset_loss', 'upper right'),
    ('accuracy', 'Trainset_accuracy', 'lower right'),
)
for metric_name, legend_label, legend_position in training_curves:
    metric_values = history.history[metric_name]
    epochs_axis = np.arange(len(metric_values))
    plt.plot(epochs_axis, metric_values, c='blue', markersize=3, label=legend_label)
    plt.legend(loc=legend_position)
    plt.grid()
    plt.xlabel('epoch')
    plt.ylabel(metric_name)
    plt.show()

In [None]:
# Predict on the test set and threshold the sigmoid output at 0.5 to obtain
# a list of 0/1 labels.
predict = model.predict(test_data)
prediction = (predict >= 0.5).astype(int).ravel().tolist()

In [None]:
# Put the predicted labels into the submission template and display it.
submit = submit.assign(target=prediction)
submit