## Context

In this notebook, I try to figure out how many features I have to deal with and how their number can be reduced without losing prediction quality.

#### See 3. Check TfidfVectorizer parameters

```
%%time
features = make_pipeline(
    preprocessor,
    TfidfVectorizer(decode_error = "ignore",
                    # analyzer{'word', 'char', 'char_wb'} or callable, default='word'
                    analyzer = "char_wb",
                    # default ngram_range=(1,1)
                    ngram_range = (3,5),
                    # default max_df=1.0 (float)
                    max_df = 1.0,
                    # default min_df=1 (int)
                    min_df = 0.0003,
                    # default max_features=None
                    max_features = None
    )
).fit_transform(X)

features  # 159571 rows x XXXXXX features in sparse matrix
```

To check the quality of the predictions, I used the Ridge model: it is fast enough to leave time for checking several parameter values.

#### See 4. Use GridSearchCV to get the best estimator

```
pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('tfidf', TfidfVectorizer(decode_error='ignore',
                              analyzer="char_wb",
                              ngram_range=(2,5),
                              max_features=None)
    ),
    ('model', Ridge())
])
```


I tried other options (for example, LinearRegression), but there was not enough time for cross-validation.

Versions V8 and V9 were cancelled after __ hours...

## Clarification

For the training data, I applied previously discovered weights to previously predicted data.

```
data_config = {
    'columns': ['toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack'],
    'weigths': [None, 0.6, 0.1, 0.5, 0.3, 0.1]
}
```

```
def get_train_data(data: pd.DataFrame, config: dict) -> pd.DataFrame:
    """ Predict the toxicity rate of the text. """    
    columns_list = config.get('columns')
    weigths_list = config.get('weigths')

    result = pd.DataFrame({
        'text': data['text'],
        'score': (data[columns_list] * weigths_list).median(axis=1)
    }, index=data.index)
    
    return result
```

## Sources

Custom Dataset for Jigsaw Rate Toxic Comments Competition 2021

https://www.kaggle.com/renokan/dataset-jigsaw-comments

Utility Scripts

https://www.kaggle.com/renokan/toxic-comments-utilities



# 1. Import & Def & Set & Load

In [None]:
import numpy as np
import pandas as pd

from collections.abc import Callable

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor

from sklearn.exceptions import FitFailedWarning

import warnings
warnings.filterwarnings("ignore", category=FitFailedWarning)

import toxic_comments_utilities as tc

In [None]:
def get_train_data(data: pd.DataFrame, config: dict) -> pd.DataFrame:
    """Build the training frame: raw text plus one aggregated target score.

    Each column named in ``config['columns']`` is multiplied by the weight
    at the same position in ``config['weigths']``; the row-wise median of
    the weighted columns becomes ``score``.  A ``None`` weight turns its
    column into NaN, so that column does not take part in the median
    (``median`` skips NaN by default).

    Returns a frame with the columns ``text`` and ``score`` that shares the
    index of ``data``.
    """
    label_columns = config.get('columns')
    label_weights = config.get('weigths')

    # Element-wise product: the weights are matched to columns by position.
    weighted_labels = data[label_columns] * label_weights
    row_score = weighted_labels.median(axis=1)

    return pd.DataFrame(
        {'text': data['text'], 'score': row_score},
        index=data.index,
    )


def get_score(estimator: Callable, data: pd.DataFrame) -> float:
    """Return the share of validation pairs the estimator ranks correctly.

    ``estimator.predict`` is called on the ``less_toxic`` texts and then on
    the ``more_toxic`` texts.  A pair counts as correct when the first
    prediction is strictly lower than the second.  The fraction of correct
    pairs is rounded to 4 decimal places; ``data`` itself is not modified.
    """
    # ``assign`` works on a copy, so the caller's text columns stay intact.
    predicted = data.assign(
        less_toxic=estimator.predict(data['less_toxic']),
        more_toxic=estimator.predict(data['more_toxic']),
    )
    ranked_correctly = predicted['less_toxic'] < predicted['more_toxic']

    return round(ranked_correctly.mean(), 4)


def get_submission(estimator: Callable, data: pd.DataFrame) -> pd.DataFrame:
    """Build the submission frame from the comments to be scored.

    The ``text`` column is replaced by ``estimator.predict(data['text'])``
    and renamed to ``score``; every other column is kept as is.  The input
    frame is not modified.
    """
    predictions = estimator.predict(data['text'])
    scored = data.assign(text=predictions)

    return scored.rename(columns={'text': 'score'})

In [None]:
# Display settings: text columns up to 80 characters, floats with 8 decimals.
pd.options.display.max_colwidth = 80
pd.options.display.precision = 8

# Label columns and their weights; the two lists are paired by position.
# The first weight is None.
data_config = {
    'columns': [
        'toxicity', 'severe_toxicity', 'obscene',
        'threat', 'insult', 'identity_attack',
    ],
    'weigths': [None, 0.6, 0.1, 0.5, 0.3, 0.1],
}

In [None]:
# Competition inputs: the comments to score and the validation pairs.
comments_to_score_path = "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
validation_data_path = "../input/jigsaw-toxic-severity-rating/validation_data.csv"
score_data, valid_data = map(
    pd.read_csv, (comments_to_score_path, validation_data_path)
)

# Training frame (text + score) derived from the predicted label columns.
train_data = get_train_data(
    data=pd.read_csv(
        "../input/dataset-jigsaw-comments/predict_train_data.csv",
        index_col='index',
    ),
    config=data_config,
)

stopwords = tc.get_stopwords()

In [None]:
%whos DataFrame

In [None]:
%whos dict or list or bool or int or float

In [None]:
train_data.head()

In [None]:
# Model input (raw comment text) and regression target (aggregated score).
X, y = train_data['text'], train_data['score']

# 2. Create & Set preprocessor

In [None]:
# Wrap the project text cleaner as a transformer so it can be a pipeline step.
# Settings passed to it: no length limit, stop words supplied, stemmer off.
preprocessor = FunctionTransformer(
    func=tc.text_preprocessor,
    kw_args=dict(max_str_len=None, stop_words=stopwords, stemmer=False),
)

In [None]:
# >>> tc.text_preprocessor(X)
# Wall time: 1min 59s
#
tc.text_preprocessor(X.head())

In [None]:
# >>> tc.text_preprocessor(X, stop_words=stopwords, stemmer=True)
# Wall time: 5min 4s
#
tc.text_preprocessor(X.head(), max_str_len=70,
                     stop_words=stopwords, stemmer=True)

# 3. Check TfidfVectorizer parameters

In [None]:
# %%time
# features = make_pipeline(
#     preprocessor,
#     TfidfVectorizer(decode_error = "ignore",
#                     # analyzer{'word', 'char', 'char_wb'} or callable, default='word'
#                     analyzer = "char_wb",
#                     # default ngram_range=(1,1)
#                     ngram_range = (3,5),
#                     # default max_df=1.0 (float)
#                     max_df = 1.0,
#                     # default min_df=1 (int)
#                     min_df = 0.0003,
#                     # default max_features=None
#                     max_features = None
#     )
# ).fit_transform(X)

# features  # 159571 rows x XXXXXX features in sparse matrix

### An important note for myself

> About using TfidfVectorizer() for extracting features  
> After applying tc.text_preprocessor(X)

```
>>> TfidfVectorizer()
(159571, 162843)

>>> TfidfVectorizer(ngram_range=(1,2))
(159571, 2303260)

>>> TfidfVectorizer(analyzer='char_wb',
                    ngram_range=(1,2))
(159571, 6300)

>>> TfidfVectorizer(ngram_range=(1,3))
(159571, 7780855)

>>> TfidfVectorizer(analyzer='char_wb',
                    ngram_range=(1,3))
(159571, 39176)

>>> TfidfVectorizer(ngram_range=(2,3))
(159571, 7618012)

>>> TfidfVectorizer(analyzer='char_wb',
                    ngram_range=(2,3))
(159571, 38516)

>>> TfidfVectorizer(ngram_range=(2,5))
(159571, 23089917)

>>> TfidfVectorizer(analyzer='char_wb',
                    ngram_range=(2,5))
(159571, 555452)

>>> TfidfVectorizer(ngram_range=(3,5))
(159571, 20949500)

>>> TfidfVectorizer(analyzer='char_wb',
                    ngram_range=(3,5))
(159571, 549812)
```

# 4. Use GridSearchCV to get the best estimator

In [None]:
# Three steps: text cleaning -> character n-gram TF-IDF -> Ridge regression.
pipeline = Pipeline([
    ('preprocess', preprocessor),
    (
        'tfidf',
        TfidfVectorizer(
            analyzer="char_wb",
            ngram_range=(2, 5),
            decode_error='ignore',
            max_features=None,
        ),
    ),
    ('model', Ridge()),
])

# Candidate ``min_df`` values for the 'tfidf' step of the pipeline.
parameters = {'tfidf__min_df': [0.00008, 0.0001, 0.0003]}

In [None]:
# analyzer    = "char_wb"
# ngram_range = (2,5) or (3,5)
# min_df      = 0.000XX
#
# 0.00003 > 159571x162843
# 0.00005 > 159571x123577
# 0.00008 > 159571x94391
# 0.0001  > 159571x84654
# 0.0003  > 159571x48087 or 47167

In [None]:
%%time
# 5-fold search over ``parameters``; with scoring=None the candidates are
# ranked by the pipeline's own ``score`` method.
search = GridSearchCV(pipeline, parameters, scoring=None, cv=5, verbose=3)
search.fit(X, y)

# Score of the fitted search on the full training data (shown as cell output).
search.score(X, y)

In [None]:
best_estimator = search.best_estimator_

In [None]:
best_estimator.named_steps['tfidf']

In [None]:
best_estimator.named_steps['model']

In [None]:
%%time
get_score(best_estimator, valid_data)

# 5. Create & Save submission

In [None]:
%%time
# Score the competition comments with the best pipeline and write the CSV.
submission = get_submission(estimator=best_estimator, data=score_data)
submission.to_csv("submission.csv", index=False)

In [None]:
submission