# Binary Classification (w/ TFIDF + XGBOOST)
Frame the problem as binary classification: each comment is labeled either less toxic (0) or more toxic (1).

## 1. Load libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found under /kaggle/input.
# Note: after the loop, `dirname` and `filenames` stay bound to the values from
# the last directory os.walk yielded (and are undefined if it yielded nothing).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack

## 2. Load Datasets

In [None]:
# Load the annotated comment pairs (columns used below: 'less_toxic' and
# 'more_toxic'). The file is named explicitly instead of indexing the
# `filenames` list left over from the os.walk loop: that list's order depends
# on the directory listing and is not guaranteed.
train_df = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

In [None]:
# Load the comments to be scored (columns used below: 'comment_id' and 'text').
# The file is named explicitly instead of indexing the `filenames` list left
# over from the os.walk loop: that list's order depends on the directory
# listing and is not guaranteed.
test_df = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

## 3. Preprocessing

### I. Labeling data: more toxic as 1 and less toxic as 0

In [None]:
# Turn every annotated pair into two labeled examples: the 'less_toxic' comment
# gets label 0 and the 'more_toxic' comment gets label 1.
less_df = pd.DataFrame({'text': train_df['less_toxic'].values, 'label': 0})
more_df = pd.DataFrame({'text': train_df['more_toxic'].values, 'label': 1})

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. sample(frac=1) then shuffles all rows.
train_labeled_df = pd.concat([less_df, more_df]).sample(frac=1)

In [None]:
# Pool the training and test comments into one Series; the vectorizers below
# are fitted on this combined text.
text_sources = [train_labeled_df['text'], test_df['text']]
all_text = pd.concat(text_sources)

### II. TFIDF 1: word analyzer

In [None]:
# Word-level TF-IDF: unigrams only, English stop words removed, vocabulary
# capped at 15,000 terms.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,  # use 1 + log(tf) instead of the raw term count
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',  # also keep single-character tokens
    stop_words='english',
    ngram_range=(1, 1),
    norm='l2',
    # An integer min_df must be >= 1: scikit-learn >= 1.2 rejects 0 with an
    # InvalidParameterError. 1 keeps every term, which is what 0 achieved.
    min_df=1,
    smooth_idf=False,
    max_features=15000,
)

In [None]:
# Learn the word vocabulary and IDF weights from the pooled text, then map the
# training and test comments into that shared feature space.
word_vectorizer.fit(all_text)
train_word_X, test_word_X = (
    word_vectorizer.transform(frame['text'])
    for frame in (train_labeled_df, test_df)
)

### III. TFIDF 2: char analyzer

In [None]:
# Character-level TF-IDF over n-grams of length 2 to 6, vocabulary capped at
# 50,000 n-grams.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,  # use 1 + log(tf) instead of the raw term count
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    norm='l2',
    # An integer min_df must be >= 1: scikit-learn >= 1.2 rejects 0 with an
    # InvalidParameterError. 1 keeps every n-gram, which is what 0 achieved.
    min_df=1,
    smooth_idf=False,
    max_features=50000,
)

In [None]:
# Learn the character n-gram vocabulary and IDF weights from the pooled text,
# then map the training and test comments into that shared feature space.
char_vectorizer.fit(all_text)
train_char_X, test_char_X = (
    char_vectorizer.transform(frame['text'])
    for frame in (train_labeled_df, test_df)
)

### IV. TFIDF 3: char_wb analyzer

In [None]:
# Third view of the text: character n-grams of length 4 to 6 taken only from
# inside word boundaries ('char_wb'). N-grams that occur in fewer than 3
# documents or in more than half of the documents are dropped.
vec3 = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(4, 6),
    min_df=3,
    max_df=0.5,
)

In [None]:
# Learn the char_wb vocabulary and IDF weights from the pooled text, then map
# the training and test comments into that shared feature space.
vec3.fit(all_text)
train_X3, test_X3 = (
    vec3.transform(frame['text'])
    for frame in (train_labeled_df, test_df)
)

### V. Concatenate all TF-IDF feature matrices (train and test)

In [None]:
# Stack the three TF-IDF views column-wise (char, word, char_wb), in the same
# order for train and test so the feature columns line up.
# CSR is requested explicitly: without `format`, hstack picks the output format
# itself, and it may be COO, which does not support the row indexing that the
# train/validation split needs.
X_train_concat = hstack([train_char_X, train_word_X, train_X3], format='csr')
X_test_concat = hstack([test_char_X, test_word_X, test_X3], format='csr')

## 4. Modeling

In [None]:
# Booster configuration passed to xgb.train.
params = {
    # Learning task: binary classification, evaluated with AUC.
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'scale_pos_weight': 1,  # 1 = no re-weighting of the positive class

    # Training backend. NOTE: 'gpu_hist' needs a CUDA-enabled XGBoost build;
    # XGBoost 2.0+ deprecates it in favour of tree_method='hist' plus
    # device='cuda'.
    'tree_method': 'gpu_hist',

    # Tree complexity.
    'max_depth': 9,
    'min_child_weight': 7,

    # Row / column subsampling per tree.
    'subsample': 0.8,
    'colsample_bytree': 0.8,

    # Learning rate and logging level.
    'eta': 0.2,
    'verbosity': 2,
}

In [None]:
# Hold out a stratified 20% of the labeled comments for validation.
labels = train_labeled_df['label']
x, x_val, y, y_val = train_test_split(
    X_train_concat, labels, test_size=0.2, stratify=labels
)

# Wrap both splits in XGBoost's native data container.
d_train = xgb.DMatrix(x, label=y)
d_valid = xgb.DMatrix(x_val, label=y_val)

# XGBoost uses the last entry of `evals` for early stopping, so "valid" must
# come last. Training stops once validation AUC has not improved for 50 rounds
# (at most 1000 rounds are run).
watchlist = [(d_train, "train"), (d_valid, "valid")]
bst = xgb.train(
    params,
    d_train,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=50,
)

In [None]:
# Wrap the stacked test features in a DMatrix (no labels) for prediction.
d_test = xgb.DMatrix(data=X_test_concat)

In [None]:
# With the native API, xgb.train returns the booster from the *last* boosting
# round and Booster.predict uses every tree by default, including the rounds
# trained after the best validation score. Restrict prediction to the trees up
# to and including the best iteration found by early stopping.
preds = bst.predict(d_test, iteration_range=(0, bst.best_iteration + 1))

## 5. Submission

In [None]:
# Pair each comment id with its predicted probability of being "more toxic".
sub_df = pd.DataFrame({'comment_id': test_df['comment_id'], 'score': preds})

# Replace the raw probabilities with their rank; method='first' breaks ties by
# order of appearance so every comment receives a distinct score.
sub_df['score'] = sub_df['score'].rank(method='first')

In [None]:
# Write the submission file to the working directory, without the index column.
sub_df.to_csv(path_or_buf="submission.csv", index=False)

# **Please upvote if this was helpful!**