In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OrdinalEncoder
import matplotlib.pyplot as plt

In [6]:
# Read train.csv into df_model and test.csv into df_oot.
df_model, df_oot = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    )
)

In [7]:
#Data cleaning and null value treatments
def clean_tweet_frame(df):
    """Return a cleaned copy of a tweets frame; the input frame is not modified.

    Steps:
      * missing ``keyword`` / ``location`` values are replaced by the
        placeholders "NotGiven" / "NotGivenLocation";
      * in ``keyword``, ``location`` and ``text`` every run of characters that
        are not ASCII letters is collapsed into a single space;
      * ``location`` is additionally stripped of leading/trailing whitespace.

    Args:
        df: DataFrame with string columns ``keyword``, ``location`` and ``text``.

    Returns:
        A new DataFrame with the same columns in the same order.
    """
    non_letters = r"[^a-zA-Z]+"
    filled = df.fillna(value={'keyword': "NotGiven", 'location': "NotGivenLocation"})
    return filled.assign(
        keyword=filled['keyword'].str.replace(non_letters, ' ', regex=True),
        location=filled['location'].str.replace(non_letters, ' ', regex=True).str.strip(),
        # NOTE(review): `text` is not null-filled (as before); assumes it has no
        # missing values - confirm against the data.
        text=filled['text'].str.replace(non_letters, ' ', regex=True),
    )


# One shared routine for both frames so their preprocessing cannot drift apart.
df_model = clean_tweet_frame(df_model)
df_oot = clean_tweet_frame(df_oot)

### Split into train-test

In [8]:
# Hold out a quarter of df_model as df_test; the fixed random_state makes the split repeatable.
df_train, df_test = train_test_split(
    df_model,
    test_size=0.25,
    random_state=124,
)

## TFIDF

In [9]:
# Training texts as a plain Python list; this is what the vectorizer is fitted on.
corpus = df_train['text'].to_list()

In [10]:
# Load NLTK's English stop-word list (passed to the vectorizer as stop_words in a later cell).
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be installed - confirm outside Kaggle.
stop_words = stopwords.words('english')

In [11]:
# TF-IDF vectorizer: ignores the stop words loaded above and keeps at most 2000 vocabulary terms.
vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    max_features=2000,
)

In [12]:
# Fit the vocabulary on the training corpus only, then reuse it unchanged for the
# hold-out texts; both sparse results are converted to dense arrays.
train_tfidf = vectorizer.fit_transform(corpus)
x_train_tf = train_tfidf.toarray()

test_docs = df_test['text'].tolist()
test_tfidf = vectorizer.transform(test_docs)
x_test_tf = test_tfidf.toarray()

In [13]:
# Encode categories never seen during fit as -1. Fitted categories are numbered
# 0..n_categories-1, so -1 can never coincide with a real code, whereas a positive
# sentinel such as 2500 makes fit() raise ValueError as soon as a column has more
# than 2500 distinct training values.
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

In [14]:
# Learn the keyword/location category codes from the training rows only.
train_categoricals = df_train[['keyword', 'location']]
oe.fit(train_categoricals)

In [15]:
# Apply the fitted encoder to the same two columns of the train and hold-out frames.
x_train_nt, x_test_nt = (
    oe.transform(frame[['keyword', 'location']])
    for frame in (df_train, df_test)
)

In [16]:
# Append the encoded keyword/location columns to the right of the TF-IDF columns.
x_train = np.concatenate((x_train_tf, x_train_nt), axis=1)
x_test = np.concatenate((x_test_tf, x_test_nt), axis=1)

In [17]:
# Pull the 'target' labels out of each split as NumPy arrays.
y_train, y_test = (np.array(frame['target']) for frame in (df_train, df_test))

In [18]:
# Show the shapes of the feature matrices and label vectors as a sanity check.
tuple(arr.shape for arr in (x_train, y_train, x_test, y_test))

### Modeling

In [19]:
from catboost import CatBoostClassifier

In [20]:
# verbose=100 sets the logging period to 100 iterations so training does not flood
# the notebook output; the seed is pinned explicitly so reruns do not depend on the
# library's default.
model = CatBoostClassifier(random_seed=0, verbose=100)

In [21]:
# Train the classifier on the combined TF-IDF + encoded keyword/location features.
model.fit(x_train, y_train)

In [22]:
# Predict on the hold-out split; the result is compared with y_test in the next cell.
y_pred = model.predict(x_test)

In [23]:
# Text summary of the classification metrics for the hold-out predictions.
holdout_report = classification_report(y_test, y_pred)
print(holdout_report)

### Prediction on OOT data

In [24]:
# Build OOT features with the already-fitted vectorizer and encoder (transform only, no refit).
oot_docs = df_oot['text'].tolist()
x_oot_tf = vectorizer.transform(oot_docs).toarray()
x_oot_nt = oe.transform(df_oot[['keyword', 'location']])
x_oot = np.concatenate((x_oot_tf, x_oot_nt), axis=1)

In [25]:
# Predict on the OOT feature matrix with the trained model.
y_pred_oot = model.predict(x_oot)

In [26]:
# Store the predictions on df_oot as a 'target' column (modifies df_oot in place).
df_oot['target'] = y_pred_oot

In [27]:
# Keep only the id and predicted target columns for the output file.
df_out = df_oot.loc[:, ['id', 'target']]

In [29]:
# Write the predictions to oot_prediction.csv without the DataFrame index column.
df_out.to_csv(path_or_buf="oot_prediction.csv", index=False)