## lightGBM+Ridge

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from tqdm import tqdm
from time import time
from contextlib import contextmanager
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from sklearn.pipeline import Pipeline
import re
from bs4 import BeautifulSoup
import gc

import fasttext
import fasttext.util

import warnings
warnings.simplefilter('ignore')

In [None]:
class CFG:
    """Notebook-wide settings: input CSV locations, CV setup and LightGBM parameters."""

    # Input CSV locations.
    train_path = "../input/jigsaw-toxic-comment-classification-challenge/train.csv"
    valid_path = "../input/jigsaw-toxic-severity-rating/validation_data.csv"
    test_path = "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
    submission_path = "../input/jigsaw-toxic-severity-rating/sample_submission.csv"

    # Cross-validation: random seed and number of folds used by KFold.
    seed = 71
    fold = 5

    # Keyword arguments unpacked into lgb.LGBMRegressor(**params).
    params = dict(
        n_estimators=20000,
        objective="regression",
        metric="rmse",
        learning_rate=0.01,
        num_leaves=1023,
        n_jobs=-1,
        importance_type="gain",
        colsample_bytree=0.8,
        colsample_bynode=0.5,
        max_depth=7,
    )

In [None]:
# Load each CSV referenced by CFG into its own DataFrame.
train_df = pd.read_csv(CFG.train_path)
valid_df = pd.read_csv(CFG.valid_path)
test_df = pd.read_csv(CFG.test_path)
submission_df = pd.read_csv(CFG.submission_path)

### drop_duplicates text

In [None]:
# Stack the train comments and both sides of the validation pairs into one frame.
# Validation rows carry the sentinel ids "less" / "more" so they can be told
# apart from real training ids later.
train_text = train_df[["id", "comment_text"]]

valid_text_l = pd.DataFrame({"comment_text": valid_df["less_toxic"], "id": "less"})
valid_text_m = pd.DataFrame({"comment_text": valid_df["more_toxic"], "id": "more"})
valid_text = pd.concat([valid_text_m, valid_text_l], axis=0)

check_df = pd.concat([train_text, valid_text], axis=0)

In [None]:
# drop_id
# A training row is dropped when its text appears more than once anywhere in
# check_df (train + validation); keep=False marks every copy as a duplicate.
_is_dup_text = check_df["comment_text"].duplicated(keep=False)
_is_train_row = ~check_df["id"].isin(["less", "more"])
drop_id = check_df[_is_dup_text & _is_train_row]["id"].unique()
train_df = train_df[~train_df["id"].isin(drop_id)]

In [None]:
# check
# Plot the overlap between the remaining train texts and the validation
# "more_toxic" texts.
from matplotlib_venn import venn2
venn2([set(train_df['comment_text']), set(valid_df['more_toxic'])], set_labels=('train', 'test_more'));

In [None]:
# Same overlap plot for the validation "less_toxic" texts.
venn2([set(train_df['comment_text']), set(valid_df['less_toxic'])], set_labels=('train', 'test_less'));

In [None]:
# The helper frames used for the duplicate check are no longer needed.
del check_df, train_text, valid_text, valid_text_l, valid_text_m
gc.collect()

## Create y

In [None]:
# Per-label weights applied to the label columns before they are summed into
# the regression target.
cat_mtpl = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5, 
            'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}

# Scale every label column in place by its weight.
for category in cat_mtpl:
    train_df[category] = train_df[category] * cat_mtpl[category]
    
# Label-based column slice: sums every column located between "toxic" and
# "identity_hate" (inclusive), so it depends on the column order of train_df.
train_df["score"] = train_df.loc[:, "toxic": "identity_hate"].sum(axis=1)
train_df["y"] = train_df["score"]

# Undersample rows with y == 0 down to the number of rows with y >= 0.1.
min_len = (train_df["y"]>=0.1).sum()
y0_undersample = train_df[train_df["y"]==0].sample(n=min_len, random_state=2021)
new_train_df = pd.concat([train_df[train_df["y"]>=0.1], y0_undersample])

# Keep only the columns used downstream and renumber the rows from 0.
new_train_df = new_train_df[["id", "comment_text", "y"]].reset_index(drop=True)

In [None]:
# Preview the balanced training frame.
new_train_df.head()

In [None]:
# new_train_df replaces train_df from here on; release the original frame.
del train_df
gc.collect()

## Preprocess

In [None]:
def text_cleaning(text):
    '''
    Reduce a raw comment to space-separated ASCII letters and digits.

    Steps, in order:
    1. Remove URLs (http(s)://... and www....)
    2. Parse with BeautifulSoup and keep only the text (drops HTML tags)
    3. Remove emoji / pictograph code points
    4. Replace every character not matched by [a-zA-Z\\d] with a space
    5. Collapse runs of spaces and strip both ends

    text - Text piece to be cleaned.
    '''
    # 1. URLs
    text = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # 2. HTML tags
    text = BeautifulSoup(text, 'lxml').get_text()

    # 3. Emoji and related symbol ranges
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    # 4. Special characters become spaces, 5. whitespace is normalized
    text = re.sub(r"[^a-zA-Z\d]", " ", text)
    return re.sub(' +', ' ', text).strip()

In [None]:
class BaseBlock(object):
    """Base class for feature blocks.

    Subclasses implement ``transform``; stateless blocks inherit ``fit``,
    which simply delegates to ``transform``.
    """

    def transform(self, input_df):
        """Return the feature frame for ``input_df``; must be overridden."""
        raise NotImplementedError()

    def fit(self, input_df, y=None):
        """Default fit: nothing to learn, so return ``transform(input_df)``."""
        return self.transform(input_df)

        
class TfidfBlock(BaseBlock):
    """Character n-gram TF-IDF features reduced to ``n_compose`` dimensions.

    Parameters
    ----------
    column : name of the text column to vectorize.
    whole_df : frame whose ``column`` the pipeline is fitted on (independent
        of the frame passed to ``fit``).
    decomposition : "svd", "NMF" or "LDA"; any other value falls back to
        TruncatedSVD.
    n_compose : number of output components.
    """

    def __init__(self, column: str, whole_df: pd.DataFrame, decomposition: str, n_compose: int):
        self.column = column
        self.whole_df = whole_df
        self.decomposition = decomposition
        self.n_compose = n_compose

    def _make_reducer(self):
        """Return the ``(step_name, estimator)`` pair for the reduction step."""
        if self.decomposition == "NMF":
            return "NMF", NMF(n_components=self.n_compose, random_state=71)
        if self.decomposition == "LDA":
            # The step name "NMF" is kept as-is so existing pipelines / pickles
            # that address the step by name keep working.
            return "NMF", LatentDirichletAllocation(n_components=self.n_compose, random_state=71)
        # "svd" and every unrecognized value use TruncatedSVD.
        return "svd", TruncatedSVD(n_components=self.n_compose, random_state=71)

    def fit(self, input_df, y=None):
        """Fit the pipeline on ``whole_df`` and return features for ``input_df``."""
        text = self.whole_df[self.column].fillna("")

        self.pipeline_ = Pipeline([
            ("tfidf", TfidfVectorizer(min_df=3, max_df=0.5, analyzer='char_wb', ngram_range=(3,5), max_features=50000)),
            self._make_reducer(),
        ])
        self.pipeline_.fit(text)

        return self.transform(input_df)

    def transform(self, input_df):
        """Project ``input_df[column]`` through the fitted pipeline."""
        text = input_df[self.column].fillna("")
        z = self.pipeline_.transform(text)

        # Keep the caller's index so a later pd.concat(axis=1) with other
        # blocks aligns rows correctly even when the index is not 0..n-1.
        out_df = pd.DataFrame(z, index=input_df.index)
        return out_df.add_prefix(f'{self.column}_tfidf_{self.decomposition}_')
    

class StringLengthBlock(BaseBlock):
    """Feature block: character length of a text column."""

    def __init__(self, column):
        self.column = column

    def transform(self, input_df):
        """Return a one-column frame named ``StringLength_<column>``."""
        lengths = input_df[self.column].str.len()
        return pd.DataFrame({self.column: lengths}).add_prefix("StringLength_")

    
class CountVectorizerBlock(BaseBlock):
    """Character n-gram count features reduced to ``n_compose`` dimensions.

    Parameters
    ----------
    column : name of the text column to vectorize.
    whole_df : frame whose ``column`` the pipeline is fitted on (independent
        of the frame passed to ``fit``).
    decomposition : "svd", "NMF" or "LDA"; any other value falls back to
        TruncatedSVD.
    n_compose : number of output components.
    """

    def __init__(self, column: str, whole_df: pd.DataFrame, decomposition: str, n_compose: int):
        self.column = column
        self.whole_df = whole_df
        self.decomposition = decomposition
        self.n_compose = n_compose

    def _make_reducer(self):
        """Return the ``(step_name, estimator)`` pair for the reduction step."""
        if self.decomposition == "NMF":
            return "NMF", NMF(n_components=self.n_compose, random_state=71)
        if self.decomposition == "LDA":
            # The step name "NMF" is kept as-is so existing pipelines / pickles
            # that address the step by name keep working.
            return "NMF", LatentDirichletAllocation(n_components=self.n_compose, random_state=71)
        # "svd" and every unrecognized value use TruncatedSVD.
        return "svd", TruncatedSVD(n_components=self.n_compose, random_state=71)

    def fit(self, input_df, y=None):
        """Fit the pipeline on ``whole_df`` and return features for ``input_df``."""
        text = self.whole_df[self.column].fillna("")

        self.pipeline_ = Pipeline([
            # The vectorizer step is named "tfidf" for historical reasons
            # although it is a CountVectorizer; the name is left unchanged.
            ("tfidf", CountVectorizer(min_df=3, max_df=0.5, analyzer='char_wb', ngram_range=(3,5), max_features=50000)),
            self._make_reducer(),
        ])
        self.pipeline_.fit(text)

        return self.transform(input_df)

    def transform(self, input_df):
        """Project ``input_df[column]`` through the fitted pipeline."""
        text = input_df[self.column].fillna("")
        z = self.pipeline_.transform(text)

        # Keep the caller's index so a later pd.concat(axis=1) with other
        # blocks aligns rows correctly even when the index is not 0..n-1.
        out_df = pd.DataFrame(z, index=input_df.index)
        return out_df.add_prefix(f'{self.column}_CountVectorizer_{self.decomposition}_')

    
class WordCountBlock(BaseBlock):
    """Feature block: number of whitespace-separated tokens in a text column."""

    def __init__(self, column):
        self.column = column

    def transform(self, input_df):
        """Return a one-column frame named ``WordCount_<column>``."""
        def count_tokens(value):
            return len(value.split())

        # Values are cast to str first, so missing values are counted as the
        # single token produced by their string form.
        counts = input_df[self.column].astype(str).map(count_tokens)
        return pd.DataFrame({self.column: counts}).add_prefix("WordCount_")
    

## FastText

# Currently unused.
def get_text_series(input_df: pd.DataFrame, column: str, sep='&'):
    """Join several columns of ``input_df`` into one space-separated text Series.

    ``column`` holds one or more column names separated by ``sep``; each
    column is cast to ``str`` and the pieces are joined left to right with a
    single space.
    """
    names = column.split(sep)
    joined = input_df[names[0]].astype(str)
    for name in names[1:]:
        joined = joined + ' ' + input_df[name].astype(str)
    return joined

# Currently unused.
def create_embedding(document: str, model):
    """Max-pool word vectors over the tokens of ``document``.

    The document is split on single spaces, ``model.get_word_vector`` is
    called for every token, and the element-wise maximum is returned.
    """
    vectors = [model.get_word_vector(token) for token in document.split(" ")]
    return np.max(vectors, axis=0)


def load_fasttext_model():
    """Load the pretrained English fastText model and reduce it to 100 dimensions."""
    model = fasttext.load_model("../input/fasttext-english/cc.en.300.bin")
    return fasttext.util.reduce_model(model, 100)


class FasttextEmbeddingBlock(BaseBlock):
    """Feature block: fastText sentence vector of a text column.

    Parameters
    ----------
    column : name of the text column to embed; also used as the prefix of
        the generated feature names.
    """

    def __init__(self, column: str):
        self.column = column

    def fit(self, input_df, y=None, **kwargs):
        """Load the fastText model (nothing is trained) and embed ``input_df``."""
        self.ft = load_fasttext_model()

        return self.transform(input_df)

    def transform(self, input_df):
        """Return one row of sentence-vector components per row of ``input_df``."""
        # Map every text to its sentence vector, then stack the per-row
        # vectors into one 2-D array. Reads the configured column instead of
        # a hard-coded 'comment_text'.
        emb = np.stack(input_df[self.column].map(lambda x: self.ft.get_sentence_vector(x)).values)
        # Keep the caller's index so a later pd.concat(axis=1) with other
        # blocks aligns rows correctly even when the index is not 0..n-1.
        output_df = pd.DataFrame(emb, index=input_df.index)
        return output_df.add_prefix(f'{self.column}_FastText')

In [None]:
@contextmanager
def timer(logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None):
    """Context manager that reports the wall-clock time of its ``with`` body.

    Parameters
    ----------
    logger : object with an ``info(str)`` method; when falsy the message is
        printed instead.
    format_str : template formatted with the elapsed seconds.
    prefix, suffix : optional text placed verbatim before / after the
        formatted duration.

    The message is only emitted when the body finishes without raising.
    """
    start = time()
    yield
    d = time() - start
    # Format the duration on its own and attach prefix/suffix afterwards, so
    # braces inside prefix/suffix are never interpreted as format fields.
    out_str = format_str.format(d)
    if prefix:
        out_str = str(prefix) + out_str
    if suffix:
        out_str = out_str + str(suffix)
    if logger:
        logger.info(out_str)
    else:
        print(out_str)


def get_function(block, is_train):
    """Return ``block.fit`` when ``is_train`` is truthy, else ``block.transform``.

    Any truthy / falsy flag is accepted, so a non-bool value no longer ends
    in an opaque ``getattr(block, None)`` TypeError.
    """
    return getattr(block, 'fit' if is_train else 'transform')


def to_feature(input_df,
               blocks,
               is_train=False):
    """Run every block on ``input_df`` and concatenate the results column-wise.

    With ``is_train=True`` each block's ``fit`` is called, otherwise its
    ``transform``. Each block is timed, its output must have as many rows as
    ``input_df``, and the combined frame is passed through
    ``reduce_mem_usage`` before being returned.
    """
    features = pd.DataFrame()

    for block in tqdm(blocks, total=len(blocks)):
        method = get_function(block, is_train)
        label = 'create ' + str(block) + ' '

        with timer(prefix=label):
            block_df = method(input_df)
        assert len(block_df) == len(input_df), method.__name__
        features = pd.concat([features, block_df], axis=1)

    return reduce_mem_usage(features)

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest fitting dtype.

    Integer columns try int8, int16, int32, int64 in that order and are left
    untouched when none fits; float columns try float16, then float32, and
    fall back to float64. A dtype "fits" only when the column min and max are
    strictly inside its representable range. Columns whose dtype is not in
    the list of handled numeric dtypes are skipped. Returns the same ``df``.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    megabyte = 1024 ** 2

    start_mem = df.memory_usage().sum() / megabyte
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith("int"):
            candidates, limits, target = int_candidates, np.iinfo, None
        else:
            candidates, limits, target = float_candidates, np.finfo, np.float64

        # First candidate whose open range contains [c_min, c_max] wins.
        for candidate in candidates:
            bounds = limits(candidate)
            if c_min > bounds.min and c_max < bounds.max:
                target = candidate
                break

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / megabyte
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'
              .format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
# Split the validation pairs into two frames that share the column name
# "comment_text", and give the test text column the same name, so the same
# feature blocks can be applied to all of them.
valid_df_less = valid_df[["worker", "less_toxic"]].rename(columns={"less_toxic": "comment_text"})
valid_df_more = valid_df[["worker", "more_toxic"]].rename(columns={"more_toxic": "comment_text"})

test_df = test_df.rename(columns={"text": "comment_text"})

In [None]:
# Register progress_apply on pandas objects, then clean the text of every
# dataset with the same function.
tqdm.pandas()
for _frame in (new_train_df, valid_df_less, valid_df_more, test_df):
    _frame['comment_text'] = _frame['comment_text'].progress_apply(text_cleaning)

In [None]:
# Inspect the cleaned "less toxic" validation frame.
valid_df_less

In [None]:
# Feature blocks shared by all datasets. The TF-IDF and count blocks fit
# their pipelines on new_train_df (passed as whole_df).
process_blocks = [
    FasttextEmbeddingBlock("comment_text"),
    TfidfBlock("comment_text" ,whole_df=new_train_df, decomposition="svd", n_compose=500),
    CountVectorizerBlock("comment_text" ,whole_df=new_train_df, decomposition="svd", n_compose=500),
    StringLengthBlock("comment_text"),
    WordCountBlock("comment_text")
]

train_y = new_train_df["y"]
# is_train=True calls fit() on every block, so this call has to run before
# the transform-only calls below.
train_x = to_feature(new_train_df, process_blocks, is_train=True)
valid_less_x = to_feature(valid_df_less, process_blocks)
valid_more_x = to_feature(valid_df_more, process_blocks)
test_x = to_feature(test_df, process_blocks)

In [None]:
# Inspect the regression target.
train_y

In [None]:
import os
import pickle
OUTPUT_DIR = './'   

# Second block list (no fastText block, 600 components) that is serialized
# to disk below.
process_blocks2 = [
    #FasttextEmbeddingBlock("comment_text"),
    TfidfBlock("comment_text" ,whole_df=new_train_df, decomposition="svd", n_compose=600),
    CountVectorizerBlock("comment_text" ,whole_df=new_train_df, decomposition="svd", n_compose=600),
    StringLengthBlock("comment_text"),
    WordCountBlock("comment_text")
]

# NOTE(review): fit() has not been called on these blocks at this point, and
# the TF-IDF / count blocks carry new_train_df (as whole_df) into the pickle.
# Confirm that an unfitted dump is what the consumer of this file expects.
file_name = os.path.join(OUTPUT_DIR,"test_prrocess")
# Context manager so the file handle is flushed and closed deterministically.
with open(file_name, 'wb') as _fh:
    pickle.dump(process_blocks2, _fh)

# LightGBM Model

In [None]:
import lightgbm as lgb

def get_scores(less_data, more_data, models):
    """Score each model on the validation pairs.

    For every model, predicts both sides of the pairs and records the
    fraction of pairs where the "less" prediction is strictly below the
    "more" prediction. Prints per-model scores and their mean.

    Returns
    -------
    (scores, less_scores, more_scores) : three lists with one entry per
        model - the pair accuracy, the predictions for ``less_data`` and the
        predictions for ``more_data``.
    """
    scores, less_scores, more_scores = [], [], []

    for i, model in enumerate(models):
        pred_less = model.predict(less_data)
        pred_more = model.predict(more_data)
        score = (pred_less < pred_more).mean()

        less_scores.append(pred_less)
        more_scores.append(pred_more)
        scores.append(score)
        print(f">> model_{i} score = {score:.4f}")

    final_score = np.mean(scores)
    print(f">> total_score = {final_score:.4f}")

    return scores, less_scores, more_scores


def fit_lgbm(X, y, cv, params):
    """Train one LGBMRegressor per CV fold and collect out-of-fold predictions.

    Parameters
    ----------
    X : feature array indexed with the integer arrays from ``cv``.
    y : target indexed with the same integer arrays (the caller passes a
        Series whose index was reset, so labels equal positions).
    cv : iterable of ``(train_indices, valid_indices)`` pairs.
    params : keyword arguments for ``lgb.LGBMRegressor``; ``None`` means
        library defaults.

    Returns
    -------
    (oof_pred, models) : out-of-fold predictions aligned with ``y`` and the
        list of fitted models, one per fold.

    Each fitted model is also pickled to ``./lightGBM_kfsky_fold{i}``.
    """
    metrics = mean_squared_error

    if params is None:
        params = {}

    models = []
    # Builtin float: the np.float alias was removed in NumPy 1.24.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv):
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        model = lgb.LGBMRegressor(**params)

        with timer(prefix="fit fold={} ".format(i+1)):
            # Early stopping and periodic logging go through callbacks; the
            # early_stopping_rounds / verbose fit() arguments were deprecated
            # in LightGBM 3.3 and removed in 4.0.
            model.fit(x_train, y_train,
                      eval_set=[(x_valid, y_valid)],
                      callbacks=[lgb.early_stopping(100),
                                 lgb.log_evaluation(1000)])
            # Context manager so the pickle file is closed deterministically.
            with open(f"./lightGBM_kfsky_fold{i}", 'wb') as fh:
                pickle.dump(model, fh)

        pred_i = model.predict(x_valid)
        oof_pred[idx_valid] = pred_i

        models.append(model)

        print(f"Fold {i} RMSE: {metrics(y_valid, pred_i)**0.5: .4f}")

    score = metrics(y, oof_pred)**0.5
    print("TRAIN FINISHED | WHOLE RMSE: {:.4f}".format(score))
    return oof_pred, models


def run(train_x, train_y, less_data, more_data, test):
    """Train the CV LightGBM models, score the validation pairs and predict the test set.

    Side effects: writes ``submission.csv``, ``oof_train.csv`` and
    ``oof_valid.csv`` to the working directory and overwrites the ``score``
    column of the module-level ``submission_df`` with the fold-averaged
    test prediction.
    """
    target = train_y.reset_index(drop=True)
    splitter = KFold(n_splits=CFG.fold, shuffle=True, random_state=CFG.seed)
    cv = list(splitter.split(train_x, target))

    print("==================== Start Training!! ====================")

    oof, models = fit_lgbm(train_x.values, target, cv, params=CFG.params)
    print("==================== Validation Score ====================")
    _, less_scores, more_scores = get_scores(less_data.values, more_data.values, models)

    # Out-of-fold train predictions and fold-averaged validation predictions.
    oof_df_train = pd.DataFrame({"oof_train": oof})
    oof_df_valid = pd.DataFrame({
        "oof_less": np.mean(less_scores, axis=0),
        "oof_more": np.mean(more_scores, axis=0),
    })

    print("==================== End Training!! ====================")

    print("==================== Start Predict!! ====================")

    preds = []
    for fold_no, model in enumerate(models, start=1):
        with timer(prefix="predict fold={} ".format(fold_no)):
            preds.append(model.predict(test.values))

    # The test prediction is the mean over the fold models.
    submission_df["score"] = np.mean(preds, axis=0)

    submission_df.to_csv("submission.csv", index=False)
    oof_df_train.to_csv("oof_train.csv", index=False)
    oof_df_valid.to_csv("oof_valid.csv", index=False)

    print("==================== End Predict!! ====================")
    

# Train LightGBM, evaluate on the validation pairs and write the CSV outputs.
run(train_x, train_y, valid_less_x, valid_more_x, test_x)

In [None]:
# Inspect the submission frame after run() filled in the LightGBM scores.
submission_df

In [None]:
# Keep a copy of the LightGBM-only scores; submission_df["score"] is
# overwritten with the blended prediction later.
sub_lgbm = submission_df.copy()

In [None]:
# Reload the fold-averaged validation predictions that run() wrote to disk.
oof_lgbm = pd.read_csv("./oof_valid.csv")
oof_lgbm

# Ridge Model
https://www.kaggle.com/kengofujii/0-866-tfidf-ridge-simple-baseline/notebook?scriptVersionId=84856895

In [None]:
from sklearn.linear_model import Ridge

In [None]:
import os
import pickle
# Character n-gram TF-IDF for the Ridge model, fitted on the cleaned training text.
vec = TfidfVectorizer(min_df= 3, max_df=0.5, analyzer = 'char_wb', ngram_range = (2,5),max_features=46000)
X = vec.fit_transform(new_train_df['comment_text'])
# Persist the fitted vectorizer; the context manager closes the file handle.
with open("./Ridge_tfidf", 'wb') as _fh:
    pickle.dump(vec, _fh)
X

In [None]:
# Ridge regression on the TF-IDF matrix.
model = Ridge(alpha=0.5)
# model = Ridge(alpha=0.485)
model.fit(X, train_y)
# Persist the fitted model; the context manager closes the file handle.
with open("./Ridge_kfsky", 'wb') as _fh:
    pickle.dump(model, _fh)

In [None]:
# Inspect the "less toxic" validation frame.
valid_df_less

In [None]:
# Inspect the "more toxic" validation frame.
valid_df_more

In [None]:
# Vectorize both sides of the validation pairs with the fitted TF-IDF.
X_less_toxic = vec.transform(valid_df_less['comment_text'])
X_more_toxic = vec.transform(valid_df_more['comment_text'])

# Ridge predictions: p1 for the "less toxic" side, p2 for the "more toxic" side.
p1 = model.predict(X_less_toxic)
p2 = model.predict(X_more_toxic)

In [None]:
# Validation accuracy: fraction of pairs where the "less toxic" prediction is
# strictly below the "more toxic" prediction.
(p1 < p2).mean()

In [None]:
# Ridge validation predictions, one row per validation pair.
oof_ridge = pd.DataFrame({
    "oof_less_ridge": p1,
    "oof_more_ridge": p2
})
oof_ridge

In [None]:
# Put the LightGBM and Ridge validation predictions side by side.
oof_average = pd.concat([oof_lgbm, oof_ridge], axis=1)
oof_average

In [None]:
# Grid-search the LightGBM weight over 0.01 .. 0.99 (Ridge gets the rest) and
# keep the first weight that reaches the highest validation pair accuracy.
lgbm_ratio = None
ridge_ratio = None
best_score = -999
for i in range(1, 100):
    ratio = i/100
    ridge_share = 1-ratio
    _less_p = oof_average["oof_less"]*ratio + oof_average["oof_less_ridge"]*ridge_share
    _more_p = oof_average["oof_more"]*ratio + oof_average["oof_more_ridge"]*ridge_share

    score = (_less_p < _more_p).mean()

    # Only a strictly better score replaces the current best.
    if best_score < score:
        best_score = score
        print("="*30)
        print(f"LightGBM:Ridge = {i}:{100-i}")
        print(f"Best_Score is {round(best_score,4)}")

        lgbm_ratio = ratio
        ridge_ratio = 1-ratio

In [None]:
# Blend weights selected by the search above.
print(lgbm_ratio)
print(ridge_ratio)

8:2 seems to be the best?

## Submission Data

In [None]:
# Ridge prediction for the comments to score.
X_test = vec.transform(test_df['comment_text'])
p3 = model.predict(X_test)

In [None]:
# Inspect the Ridge test predictions.
p3

In [None]:
# Sanity check of the pickled Ridge model: reload it and predict with the
# reloaded object (not the in-memory `model`), so _p3 can be compared to p3.
with open("./Ridge_kfsky", 'rb') as _fh:
    ridge_model = pickle.load(_fh)
_p3 = ridge_model.predict(X_test)

In [None]:
# Inspect the predictions computed in the reload-check cell.
_p3

In [None]:
# Weighted blend of the LightGBM and Ridge test predictions.
final_pred = sub_lgbm["score"]*lgbm_ratio + p3*ridge_ratio

In [None]:
# Replace the LightGBM-only score with the blended prediction.
submission_df["score"] = final_pred

In [None]:
# Inspect the final submission frame.
submission_df

In [None]:
# Overwrites the submission.csv written earlier by run() with the blended scores,
# and saves the combined validation predictions of both models.
submission_df.to_csv("submission.csv", index=False)
oof_average.to_csv("final_oof.csv", index=False)