In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Embedding
from tensorflow.keras import preprocessing
from tensorflow.keras.utils import plot_model
import matplotlib.pyplot as plt
import seaborn as sns

# 1.  Read the Data

In [None]:
# Load the competition's training and test CSV files into DataFrames.
train_csv_path = '/kaggle/input/nlp-getting-started/train.csv'
test_csv_path = '/kaggle/input/nlp-getting-started/test.csv'
df_train_data = pd.read_csv(train_csv_path)
df_test_data = pd.read_csv(test_csv_path)

In [None]:
df_train_test = pd.concat([df_train_data, df_test_data])

# 2. Encode the text with Tokenizer

In [None]:
# Build one vocabulary over the combined train + test text, keeping at most
# the 100000 most frequent words when converting text to ids.
tokenizer = Tokenizer(num_words = 100000)
tokenizer.fit_on_texts(df_train_test['text'])

# One variable-length list of integer token ids per tweet.
sequences = tokenizer.texts_to_sequences(df_train_test['text'])
# Pad/truncate directly from the list of lists. Wrapping the ragged lists in
# np.array() first raises ValueError on NumPy >= 1.24 (and produced an object
# array with a deprecation warning before that); pad_sequences accepts the
# plain list and returns a rectangular (n_rows, 50) integer array.
np_sequences = preprocessing.sequence.pad_sequences(sequences, maxlen = 50)

In [None]:
np_sequences.shape

# 3. Encode keyword and location

In [None]:
from sklearn.preprocessing import LabelEncoder
# Keep only the two categorical columns and replace missing values with the
# placeholder string 'No' so every row has a value to encode.
keyword_location = df_train_test[['keyword', 'location']].fillna('No')

In [None]:
# Integer-encode the location strings: learn the distinct values first, then
# overwrite the column with each row's class index.
Normalize_location = LabelEncoder()
Normalize_location.fit(keyword_location['location'])
keyword_location['location'] = Normalize_location.transform(keyword_location['location'])

In [None]:
def split_keyword(word):
    """Return the part of ``word`` that precedes its first ``'%'``.

    The string is returned unchanged when it contains no ``'%'``
    (e.g. ``'airplane%20accident'`` -> ``'airplane'``, ``'No'`` -> ``'No'``).
    """
    prefix, _separator, _remainder = word.partition('%')
    return prefix

# Reduce every keyword to the fragment before its first '%', then
# integer-encode the shortened keywords.
keyword_location['keyword'] = keyword_location['keyword'].map(split_keyword)
Normalize_keyword = LabelEncoder()
Normalize_keyword.fit(keyword_location['keyword'])
keyword_location['keyword'] = Normalize_keyword.transform(keyword_location['keyword'])

# 4. Data Preprocessing

In [None]:
# The combined arrays hold the training rows first, so take the row count from
# the training frame itself instead of a hard-coded literal.
n_train = len(df_train_data)

# One column per padded token position; the width comes from the array itself.
df_text = pd.DataFrame(
    np_sequences[:n_train],
    columns = [f'max_len{x+1}' for x in range(np_sequences.shape[1])],
)
# Slice by position and give the result a fresh 0..n-1 index: the column-wise
# concat below aligns on index labels, so both frames must share the same index.
df_key_loc = keyword_location.iloc[:n_train][['keyword', 'location']].reset_index(drop = True)

# Final layout: [token-id columns ..., keyword, location].
df_train = pd.concat((df_text, df_key_loc), axis=1)

In [None]:
df_train['target'] = df_train_data['target']

In [None]:
df_train.head()

In [None]:
from sklearn.model_selection import train_test_split
# Separate the label column from the feature columns.
target = df_train.target
data = df_train.drop(columns = 'target')
# 80/20 split. random_state is fixed so the same rows land in each part on
# every run; without it the split (and every downstream metric) changes each
# time the notebook is executed.
x_train, x_test, y_train, y_test = train_test_split(data, target, train_size = 0.8, random_state = 42)

In [None]:
# Swap the feature frames for plain NumPy arrays so they can be sliced by
# column position when feeding the model.
x_train, x_test = map(np.array, (x_train, x_test))

# 5. Model Building (text features + key_loc features)

## 5-1. Text features -> LSTM

In [None]:
from tensorflow.keras import Model, layers, Input

# Text branch: a variable-length sequence of int32 token ids is embedded into
# 50-dimensional vectors (Embedding input size 100000, the same number passed
# as num_words to the Tokenizer) and then fed to an LSTM with 50 units.
text_input = Input(shape = (None, ), dtype = 'int32', name = 'text')
text_model_emb = layers.Embedding(100000, 50)(text_input)
text_model_encoded = layers.LSTM(50, name = 'text_training')(text_model_emb)

# Keyword/location branch: two int32 values per sample go through a
# 64-unit sigmoid Dense layer.
# NOTE(review): these inputs appear to be label-encoded category ids used as
# raw numeric magnitudes -- confirm this is intended rather than an embedding.
keyword_location_input = Input(shape = (2, ), dtype = 'int32', name = 'keyword_location')
keyword_location_model_encoded = layers.Dense(64, activation = 'sigmoid', name = 'key_loc_training')(keyword_location_input)

# Merge: join both branch outputs along the last axis and map them to a single
# sigmoid unit (binary prediction).
model_concatenated = layers.concatenate([text_model_encoded, keyword_location_model_encoded], axis = -1)
prediction = layers.Dense(1, activation = 'sigmoid', name = 'prediction')(model_concatenated)


# Two-input model; inputs must be supplied in the order [text, keyword_location].
model = Model([text_input, keyword_location_input], prediction)
model.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.summary()

In [None]:
plot_model(model)

In [None]:
history = model.fit([x_train[ :, :100],x_train[:, -2 :]], y_train , epochs = 4, batch_size = 32, validation_split = 0.2)

In [None]:
# Plot training vs. validation loss per epoch from the Keras fit history.
sns.set()
df_DL = pd.DataFrame(history.history)
df_DL.head()  # NOTE: result is discarded here (not the cell's last expression)
epoch_axis = df_DL.index
for history_key, curve_label in (('loss', 'loss'), ('val_loss', 'Val_loss')):
    plt.plot(epoch_axis, df_DL[history_key], label = curve_label)
plt.title('loss Function')
plt.xlabel('Epochs')
plt.ylabel('Binary_crossentropy')
plt.legend()

In [None]:
# Plot training vs. validation accuracy per epoch from the fit history frame.
sns.set()
epoch_axis = df_DL.index
for history_key, curve_label in (('accuracy', 'accuracy'), ('val_accuracy', 'Val_accuracy')):
    plt.plot(epoch_axis, df_DL[history_key], label = curve_label)
plt.title('DL Accuracy process')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

# 6. Test_data Preprocessing

In [None]:
# The combined arrays hold the training rows first and the test rows after
# them, so the test part starts at the training row count (derived from the
# data rather than hard-coded).
n_train = len(df_train_data)

# One column per padded token position; the width comes from the array itself.
df_test_text = pd.DataFrame(
    np_sequences[n_train:],
    columns = [f'max_len{x+1}' for x in range(np_sequences.shape[1])],
)
# Slice by position and give the result a fresh 0..n-1 index: the column-wise
# concat below aligns on index labels, so both frames must share the same index.
df_test_key_loc = keyword_location.iloc[n_train:][['keyword', 'location']].reset_index(drop = True)

# Final layout: [token-id columns ..., keyword, location].
df_test = pd.concat((df_test_text, df_test_key_loc), axis=1)

In [None]:
df_test.head()

# 7. Prediction -> Submission

In [None]:
df_test.shape

In [None]:
# Convert to a plain array so the features can be sliced by column position.
df_test = np.array(df_test)
# Columns are laid out as [token ids ..., keyword, location]: the 'text' input
# gets only the token-id columns (everything except the last two) and the
# 'keyword_location' input gets the last two. An upper bound larger than the
# column count (e.g. :100) silently selects every column and feeds the
# keyword/location ids to the embedding as if they were tokens.
y_pred = model.predict([df_test[:, :-2], df_test[:, -2:]])

In [None]:
y_pred.astype('int')

In [None]:
# Turn the predicted probabilities into 0/1 labels, place them in the sample
# submission's target column, and write the result without the index column.
predicted_labels = y_pred.round(0).astype('int')
sub = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
sub['target'] = predicted_labels
sub.to_csv('submission.csv', index=False)

In [None]:
sub