In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for input_path in (os.path.join(folder, name) for name in names):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OrdinalEncoder
import matplotlib.pyplot as plt

In [3]:
# Load the two competition CSVs: train.csv becomes the modelling frame,
# test.csv becomes the out-of-time frame that is scored for the submission.
df_model, df_oot = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    )
)

In [4]:
#Data cleaning and null value treatments
def _clean_tweets(df):
    """Return a cleaned copy of a tweet frame.

    The same steps are applied to every frame so that the modelling and
    out-of-time data cannot drift apart:

    * missing ``keyword`` / ``location`` values are replaced by the
      placeholders "NotGiven" / "NotGivenLocation";
    * in ``keyword``, ``location`` and ``text`` every run of characters
      other than ASCII letters is collapsed into a single space;
    * ``location`` is additionally stripped of leading/trailing whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``keyword``, ``location`` and ``text`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left untouched.
    """
    # fillna without inplace returns a new frame, so the caller's object is not mutated
    cleaned = df.fillna(value={'keyword': "NotGiven", 'location': "NotGivenLocation"})
    for column in ('keyword', 'location', 'text'):
        # Vectorised equivalent of re.sub(r"[^a-zA-Z]+", ' ', value) per row
        cleaned[column] = cleaned[column].str.replace(r"[^a-zA-Z]+", ' ', regex=True)
    cleaned['location'] = cleaned['location'].str.strip()
    return cleaned


df_model = _clean_tweets(df_model)
df_oot = _clean_tweets(df_oot)

### Split into train-test

In [5]:
# Hold out 25% of the labelled rows; the fixed random_state keeps the split repeatable.
df_train, df_test = train_test_split(
    df_model,
    random_state=124,
    test_size=0.25,
)

## TFIDF

In [6]:
# Training tweets as a plain Python list, used to fit the TF-IDF vocabulary.
corpus = df_train.loc[:, 'text'].to_list()

In [7]:
# Load NLTK's English stop-word list; it is passed to the TF-IDF vectorizer below
# through its `stop_words` argument.
stop_words = stopwords.words('english')

In [8]:
# TF-IDF vectorizer: NLTK stop words are excluded and the vocabulary is capped at 2000 terms.
vectorizer = TfidfVectorizer(
    max_features=2000,
    stop_words=stop_words,
)

In [9]:
# Fit the vectorizer on the training tweets only, then project the hold-out
# tweets with that same fitted vectorizer. Both results are densified.
# (Tuple elements are evaluated left to right, so the fit happens first.)
x_train_tf, x_test_tf = (
    vectorizer.fit_transform(corpus).toarray(),
    vectorizer.transform(df_test['text'].tolist()).toarray(),
)

In [10]:
# Ordinal encoder for the categorical columns (fitted on keyword/location below).
# Categories seen during fit receive the codes 0 .. n_categories-1, and scikit-learn
# rejects at fit time any `unknown_value` that falls inside that range. A fixed
# positive sentinel such as 2500 therefore makes `fit` raise as soon as a column has
# more than 2500 distinct training values. -1 can never collide with a real code.
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

In [11]:
# Learn the category -> code mapping from the training split only.
oe.fit(df_train.loc[:, ['keyword', 'location']])

In [12]:
# Encode keyword/location for the training and hold-out splits with the fitted encoder.
x_train_nt, x_test_nt = (
    oe.transform(frame[['keyword', 'location']])
    for frame in (df_train, df_test)
)

In [13]:
# Target column of each split as a NumPy array.
y_train, y_test = (
    np.array(frame['target'])
    for frame in (df_train, df_test)
)

In [50]:
# Final design matrices: TF-IDF columns first, then the two ordinal-encoded columns.
x_train = np.concatenate((x_train_tf, x_train_nt), axis=1)
x_test = np.concatenate((x_test_tf, x_test_nt), axis=1)

In [15]:
# Display the shapes of the feature matrices and target vectors.
tuple(arr.shape for arr in (x_train, y_train, x_test, y_test))

### Modeling

In [18]:
from catboost import CatBoostClassifier

In [19]:
# CatBoost classifier constructed with no arguments, i.e. the library defaults.
# NOTE(review): the dict below looks like a hyper-parameter set from an earlier
# search; it is NOT applied here — confirm whether it should be.
#{'depth': 10, 'iterations': 1000, 'learning_rate': 0.01}
model = CatBoostClassifier()

In [20]:
# Train the CatBoost classifier on the stacked training features.
model.fit(X=x_train, y=y_train)

In [21]:
# Predict on the hold-out split with the model fitted above.
y_pred = model.predict(x_test)

In [22]:
# Per-class precision / recall / F1 on the hold-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [16]:
# Build the out-of-time feature matrix with the already-fitted vectorizer and
# encoder, using the same column order as x_train / x_test.
x_oot_tf, x_oot_nt = (
    vectorizer.transform(df_oot['text'].tolist()).toarray(),
    oe.transform(df_oot[['keyword', 'location']]),
)
x_oot = np.concatenate((x_oot_tf, x_oot_nt), axis=1)

### Deep learning 

In [17]:
import tensorflow as tf

In [18]:
# One-hot encode the targets for the 2-unit softmax output layer.
# num_classes is pinned to 2: with num_classes=None each call infers the width
# from the largest label it happens to see, so a split missing the positive
# class would yield a 1-column matrix that no longer matches the network output.
Y_train = tf.keras.utils.to_categorical(y_train, num_classes=2)
Y_test = tf.keras.utils.to_categorical(y_test, num_classes=2)

In [53]:
# Optimiser, weight initialiser and training callbacks for the dense network.
optimizer = tf.keras.optimizers.Adamax(learning_rate=0.00005)
initializer = tf.keras.initializers.HeNormal()
# Cut the learning rate by 10x after 5 epochs without val_loss improvement.
reduceLR = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.1, patience=5, min_lr=0)
# Early stopping must be more patient than the LR schedule. With both at
# patience=5 on the same metric, the LR cut lands on (or just before) the epoch
# at which training is stopped, so the reduced rate hardly ever gets used.
# Waiting 10 epochs leaves room for at least one full LR-reduction cycle before stopping.
es = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

In [54]:
# Dense network: two 250-unit ReLU layers followed by a 2-way softmax.
# The input width is read from the training matrix instead of being hard-coded
# to 2002, so the model still builds if the TF-IDF vocabulary size or the number
# of encoded columns changes.
input1 = tf.keras.Input(shape=(x_train.shape[1],))

x1 = tf.keras.layers.Dense(250, kernel_initializer=initializer, activation='relu')(input1)
x1 = tf.keras.layers.Dense(250, kernel_initializer=initializer, activation='relu')(x1)

x1 = tf.keras.layers.Dense(2, kernel_initializer=initializer, activation='softmax')(x1)
# NOTE: this rebinds `model`, replacing the CatBoost classifier created earlier;
# every later `model.predict` call uses this network.
model = tf.keras.Model(inputs=[input1], outputs=x1)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

In [55]:
# Train the network; the hold-out split serves as validation data, which is what
# the val_loss-monitoring callbacks act on. `out` keeps the per-epoch history.
out = model.fit(
    x_train,
    Y_train,
    epochs=500,
    batch_size=64,
    validation_data=(x_test, Y_test),
    callbacks=[reduceLR, es],
)

In [52]:
# Training vs. validation loss per epoch, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
for history_key in ('loss', 'val_loss'):
    ax.plot(out.history[history_key])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Class probabilities for the hold-out split, then the most likely class per row.
# The feature matrix is named `x_test` (lower case); `X_test` is never defined in
# this notebook and raised NameError.
y_predict = model.predict(x_test)
Y_pred = y_predict.argmax(axis=-1)

In [26]:
# Score the out-of-time data and reduce the result to one integer label per row.
# When the notebook is run top to bottom, `model` is the softmax network, whose
# predict() returns an (n_rows, 2) probability matrix; that cannot be stored as a
# single `target` column, so take the arg-max class. A predict() that already
# returns labels (1-D or a single column) passes through unchanged.
y_pred_oot = np.asarray(model.predict(x_oot))
if y_pred_oot.ndim == 2 and y_pred_oot.shape[1] > 1:
    y_pred_oot = y_pred_oot.argmax(axis=-1)
y_pred_oot = y_pred_oot.ravel().astype(int)

In [27]:
# Attach the predictions to the out-of-time frame as the `target` column.
df_oot = df_oot.assign(target=y_pred_oot)

In [28]:
# Keep only the two columns written to the submission file.
df_submission = df_oot.loc[:, ['id', 'target']]

In [29]:
# Write the submission CSV to the working directory, without the index column.
df_submission.to_csv(path_or_buf="submission3.csv", index=False)