### LightGBM_BERT_Simple_Baseline[0.765]

# Imports

In [None]:
import json
import pickle
import re
from glob import glob
import string
import regex
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from lightgbm import LGBMClassifier
from wordcloud import STOPWORDS
import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings("ignore")
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
import torch
import transformers
from sklearn.base import TransformerMixin, BaseEstimator
from transformers import BertTokenizer, BertForMaskedLM
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
tqdm.pandas()
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.metrics.pairwise import linear_kernel
from transformers import AutoTokenizer, AutoModel

In [None]:
# Let pandas render up to 150 columns and 150 rows before truncating output.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 150)

<h3>Read Training data</h3>

In [None]:
# Labelled comments used for training.
train_csv_path = "../input/jigsaw-toxic-comment-classification-challenge/train.csv"
df = pd.read_csv(train_csv_path)
print(df.shape)

# Comments that have to be scored for the submission.
score_csv_path = "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
df_sub = pd.read_csv(score_csv_path)

<h3>Reduce memory usage</h3>

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits.

    Each column whose dtype is one of int16/32/64 or float16/32/64 is cast to
    the smallest integer (int8..int64) or float (float16, float32, float64)
    type whose representable range contains the column's min and max. Bounds
    are inclusive, so a column spanning exactly -128..127 becomes int8.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the resulting memory usage and percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Notes
    -----
    Float columns are chosen by range only. Casting to float16/float32 can
    therefore lose precision even when every value is "in range".
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Narrowest integer type first; leave the column untouched if
                # nothing matches (e.g. min/max are NaN for an empty column).
                for candidate in int_candidates:
                    limits = np.iinfo(candidate)
                    if limits.min <= c_min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                # Fall back to float64 when neither narrower type covers the
                # range (this also catches all-NaN and infinite columns).
                target = np.float64
                for candidate in float_candidates:
                    limits = np.finfo(candidate)
                    if limits.min <= c_min and c_max <= limits.max:
                        target = candidate
                        break
                df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Shrink both frames; each call prints its own memory report.
df, df_sub = [reduce_mem_usage(frame) for frame in (df, df_sub)]

In [None]:
# Shapes of the training frame and the frame to be scored.
for frame in (df, df_sub):
    print(frame.shape)

<h3>Build the target score for the training data</h3>

In [None]:
# Weight applied to each label column before the labels are summed into one score.
label_weights = {
    'severe_toxic': 1.5,
    'toxic': 0.32,
    'obscene': 0.16,
    'threat': 1.5,
    'insult': 0.64,
    'identity_hate': 1.5,
}
for label, weight in label_weights.items():
    df[label] = df[label] * weight

# NOTE(review): astype(int) truncates the weighted sum toward zero, so a row
# whose weights add up to less than 1 (e.g. only 'toxic' set -> 0.32) ends up
# with y == 0. Confirm this is intended before changing it; the undersampling
# step below treats y == 0 as the negative class.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
weighted_sum = df[label_columns].sum(axis=1)
df['y'] = weighted_sum.astype(int)
# Scale the target by its maximum.
df['y'] = df['y']/df['y'].max()

# Keep only the comment text (renamed to 'text') and the target.
df = df[['comment_text', 'y']].rename(columns={'comment_text': 'text'})
df.sample(5)

In [None]:
# df['severe_toxic'] = df.severe_toxic * 2.5
# # df['toxic'] = df.toxic * 0.32
# # df['obscene'] = df.obscene * 0.16
# df['threat'] = df.threat * 3
# # df['insult'] = df.insult * 0.64
# df['identity_hate'] = df.identity_hate * 2
# df['y'] = (df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) ).astype(int)
# df['y'] = df['y']/df['y'].max()

# df = df[['comment_text', 'y']].rename(columns={'comment_text': 'text'})
# df.sample(5)

In [None]:
# Show how many rows fall on each distinct target value.
df['y'].value_counts()

<h3>Undersampling for train data</h3>

In [None]:
# Rows with y >= 0.1 are all kept; an equally sized random sample of the
# y == 0 rows is drawn (fixed seed) so both groups have the same row count.
positive_mask = df['y'] >= 0.1
min_len = positive_mask.sum()
zero_rows = df[df['y'] == 0]
df_y0_undersample = zero_rows.sample(n=min_len, random_state=201)
# The y >= 0.1 rows come first in the result, followed by the sampled zeros.
df = pd.concat([df[positive_mask], df_y0_undersample])
df['y'].value_counts()

<h3>Merge train and df_sub to prepare total_data</h3>

In [None]:
# Stack the training rows and the rows to score into one frame with a fresh
# 0..n-1 index; the df rows come first, the df_sub rows after them.
total_data = pd.concat([df, df_sub],ignore_index=True)

In [None]:
# Number of training rows; in total_data the rows from this position onward
# are the ones that came from df_sub.
total_data_idx = len(df)
display(total_data.head(2))

In [None]:
# Sanity check: combined shape, shape of the rows to score, and the boundary index.
for value in (total_data.shape, df_sub.shape, total_data_idx):
    print(value)

<h3>Text cleaning</h3>

In [None]:
# Patterns used by text_cleaning, compiled once when the cell is run.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")
_SPACE_RUN_PATTERN = re.compile(' +')


def text_cleaning(text):
    '''
    Reduce a piece of text to letters, digits and single spaces.

    Steps, in the order they are applied:
    1. Remove URLs (http://, https:// and www. links)
    2. Remove HTML tags, keeping only the text content
    3. Remove emojis
    4. Replace every character other than a-z, A-Z or a digit with a space
    5. Collapse runs of spaces and strip leading/trailing spaces

    text - Text piece to be cleaned.
    '''
    without_urls = _URL_PATTERN.sub(r'', text)
    # BeautifulSoup parses the markup; get_text() returns only the text nodes.
    plain_text = BeautifulSoup(without_urls, 'lxml').get_text()
    without_emojis = _EMOJI_PATTERN.sub(r'', plain_text)
    alnum_only = _NON_ALNUM_PATTERN.sub(" ", without_emojis)
    single_spaced = _SPACE_RUN_PATTERN.sub(' ', alnum_only)
    return single_spaced.strip()


In [None]:
# Register progress_apply on pandas objects, then clean every comment in place.
tqdm.pandas()
cleaned_text = total_data['text'].progress_apply(text_cleaning)
total_data['text'] = cleaned_text

<h3>Word count</h3>

In [None]:
# # word_count
# total_data['word_count'] = total_data['text'].apply(lambda x: len(str(x).split()))

# # unique_word_count
# total_data['unique_word_count'] = total_data['text'].apply(lambda x: len(set(str(x).split())))

# # stop_word_count
# total_data['stop_word_count'] = total_data['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))

# # mean_word_length
# total_data['mean_word_length'] = total_data['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

# # char_count
# total_data['char_count'] = total_data['text'].apply(lambda x: len(str(x)))

# # punctuation_count
# total_data['punctuation_count'] = total_data['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

<h3>Cosine similarity</h3>

In [None]:
# total_data_1 = total_data[total_data['y'] > 0.2]
# total_data_1 = total_data_1.reset_index()

In [None]:
# vectorizer = CountVectorizer()
# transformer = TfidfTransformer()
# tf_title = vectorizer.fit_transform(total_data_1['text'])
# tfid_title = transformer.fit_transform(tf_title)
# tfid_title.shape

In [None]:
# similarity_values=[]
# for i in range(0,len(total_data['text'])):
#     title_term = total_data['text'][i]
#     title_tf = vectorizer.transform([title_term])
#     title_tfidf = transformer.transform(title_tf)
#     similarity_title = cosine_similarity(title_tfidf, tfid_title)[0]
#     topn_indices = np.argsort(similarity_title)[::-1][:1]
#     value = similarity_title[topn_indices]
#     similarity_values.append(value)

In [None]:
# total_data['text_sim'] = similarity_values
# total_data['text_sim'] = total_data['text_sim'].astype(float)

<h3>Number of characters in the text</h3>

In [None]:
# total_data["num_text"] = total_data["text"].apply(lambda x: len(x))

<h3>Flags for whether the text contains toxic words</h3>

In [None]:
# One 0/1 indicator column per word, named after the word, set to 1 when the
# cleaned text contains that exact (case-sensitive) substring.
# NOTE(review): the match is case-sensitive and not word-bounded, so 'fuck' is
# not flagged while 'Ass' also hits e.g. 'Associated' -- confirm this is the
# intended feature definition before changing it.
toxic_flag_words = [
    'Fuck', 'Nigger', 'Suck', 'Don', 'Shit', 'Fat',
    'Gay', 'Faggot', 'Moron', 'Ass', 'Sucks', 'Jew', 'Pig', 'Stupid', 'Die',
]
for flag_word in toxic_flag_words:
    # regex=False: the words are plain substrings. na=False maps missing text
    # to "not contained" so the integer cast cannot fail on NaN.
    contains_word = total_data['text'].str.contains(flag_word, regex=False, na=False)
    # Cast the boolean mask directly instead of running a whole-frame
    # replace({False: 0, True: 1}) once per word.
    total_data[flag_word] = contains_word.astype(int)
    print(total_data[flag_word].value_counts())

In [None]:
# Peek at the first two rows, now including the word-flag columns.
total_data.head(2)

<h3>BERT</h3>

In [None]:
class BertSequenceVectorizer:
    """Turn a sentence into a fixed-size vector with a pretrained transformer.

    The vector is the model's output at the first token position of the
    first output tensor (the last hidden state for Hugging Face encoder
    models such as RoBERTa/BERT).

    Parameters
    ----------
    model_name : str
        Path or hub name passed to ``from_pretrained`` for both the tokenizer
        and the model. Defaults to the local RoBERTa-base copy.
    max_len : int
        Maximum number of tokens (special tokens included) fed to the model;
        longer inputs are truncated by the tokenizer.
    """

    def __init__(self, model_name="../input/roberta-base", max_len=128):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.AutoModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        # Inference only: make sure dropout and similar layers are disabled.
        self.bert_model.eval()
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.ndarray:
        """Return the first-token embedding of ``sentence`` as a 1-D array.

        Parameters
        ----------
        sentence : str
            Text to embed.

        Returns
        -------
        numpy.ndarray
            Vector of the model's hidden size, on the CPU.
        """
        # Let the tokenizer truncate: unlike slicing the id list by hand, it
        # keeps the closing special token on long inputs. A single sequence
        # needs no padding, so none is added (padded positions were masked
        # out of attention anyway).
        encoded = self.tokenizer(
            sentence,
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
        ).to(self.device)

        # One forward pass, without building an autograd graph.
        with torch.no_grad():
            outputs = self.bert_model(
                input_ids=encoded['input_ids'],
                attention_mask=encoded['attention_mask'],
            )

        # outputs[0] has shape (batch, tokens, hidden); take batch 0, token 0.
        # .cpu() is a no-op when the tensor already lives on the CPU.
        return outputs[0][0, 0].cpu().numpy()

In [None]:
BSV = BertSequenceVectorizer()
# Width of the zero vector used for non-text entries, read from the loaded
# model's config rather than hard-coding 768.
_embedding_dim = BSV.bert_model.config.hidden_size
# Replace each text with its embedding. Anything that is not a string (e.g. a
# missing value) gets a zero vector; a type check is used because an identity
# test against np.nan misses NaN values that are a different float object.
total_data['text'] = total_data['text'].progress_apply(
    lambda x: BSV.vectorize(x) if isinstance(x, str) else np.zeros(_embedding_dim)
)

In [None]:
# Expand the per-row embedding vectors into one column per dimension.
embedding_rows = total_data['text'].tolist()
bert = pd.DataFrame(embedding_rows)
bert.columns = [f'text_bertvec_{col}' for col in bert.columns]

In [None]:
# Wrap the embedding columns in the frame that the later cells refer to.
text_bert_df = pd.DataFrame(bert)
text_bert_df.head()

In [None]:
# Renumber the rows 0..n-1 in place (old index discarded) before the
# column-wise concat with the embedding frame.
total_data.reset_index(drop=True, inplace=True)
total_data.head()

In [None]:
# Put the embedding columns next to the existing columns, aligned on the index.
frames_to_join = [total_data, text_bert_df]
concat_df = pd.concat(frames_to_join, axis=1)
concat_df.shape

In [None]:
# Peek at the first two rows of the combined feature frame.
concat_df.head(2)

<h3>Split into train, validation and test sets</h3>

In [None]:
# cat_cols = ['num_text','word_count','unique_word_count','stop_word_count','mean_word_length','char_count',
#             'punctuation_count','Fuck', 'Nigger', 'Suck', 'Don', 'Shit', 'Fat',
#             'Gay', 'Faggot', 'Moron', 'Ass','Sucks','Jew','Pig','Stupid','Die']
# num_cols = list(list(text_bert_df.columns)) 
# feat_cols = cat_cols + num_cols
# TARGET = 'y'

In [None]:
# Column holding the regression target.
TARGET = 'y'
# 0/1 word-flag columns, passed to LightGBM as categorical features.
cat_cols = [
    'Fuck', 'Nigger', 'Suck', 'Don', 'Shit', 'Fat',
    'Gay', 'Faggot', 'Moron', 'Ass', 'Sucks', 'Jew', 'Pig', 'Stupid', 'Die',
]
# One numeric column per embedding dimension.
num_cols = text_bert_df.columns.tolist()
# Model inputs: the flags first, then the embedding dimensions.
feat_cols = [*cat_cols, *num_cols]

In [None]:
# Rows before total_data_idx are the labelled training rows; the rest are the
# rows to score.
labelled_df = concat_df.iloc[:total_data_idx, :]
test_df = concat_df.iloc[total_data_idx:, :]

# The labelled rows are ordered by class (all y >= 0.1 rows first, then the
# sampled y == 0 rows), so cutting at a fixed position leaves either the
# validation set or the training set with a single class. Shuffle with a fixed
# seed before splitting so both sets see both classes.
TRAIN_ROWS = 11000
# Keep the original 11000-row training size when there are enough labelled
# rows; otherwise fall back to an 80/20 split so validation is never empty.
n_train = TRAIN_ROWS if total_data_idx > TRAIN_ROWS else int(total_data_idx * 0.8)
train_df, val_df = train_test_split(
    labelled_df,
    train_size=n_train,
    shuffle=True,
    random_state=42,
)
print(train_df.shape, val_df.shape, test_df.shape)

In [None]:
# Split each subset into its feature matrix and its target column.
train_x, train_y = train_df[feat_cols], train_df[TARGET]
val_x, val_y = val_df[feat_cols], val_df[TARGET]
test_x, test_y = test_df[feat_cols], test_df[TARGET]

In [None]:
# Shapes of the training and validation inputs and targets.
for part in (train_x, train_y, val_x, val_y):
    print(part.shape)

<h3>LightGBM</h3>

In [None]:
# Booster hyper-parameters. The number of rounds and early stopping are given
# to lgb.train below instead of being embedded here.
params = {
    'objective': 'regression',
#     'objective': 'regression_l1',
    'metric': 'rmse',
    'num_leaves': 32,
    'max_depth': 7,
    "feature_fraction": 0.8,
    'subsample_freq': 1,
    "bagging_fraction": 0.98,
    'min_data_in_leaf': 2,
    'learning_rate': 0.05,
    "boosting": "gbdt",
    "lambda_l1": 0.2,
    "lambda_l2": 10,
    "verbosity": -1,
    "random_state": 42,
}

# Declare the categorical columns on the Dataset and tie the validation set to
# the training set so both share the same feature binning.
train_data = lgb.Dataset(train_x, label=train_y, categorical_feature=cat_cols)
val_data = lgb.Dataset(val_x, label=val_y, reference=train_data)

# `verbose_eval` was removed from lgb.train in LightGBM 4.0; logging and early
# stopping are expressed as callbacks, which LightGBM 3.3+ also accepts.
model = lgb.train(
    params,
    train_data,
    num_boost_round=8000,
    valid_names=['train', 'valid'],
    valid_sets=[train_data, val_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100),
    ],
)

# Predict on the validation rows with the best iteration found by early stopping.
val_pred = model.predict(val_x, num_iteration=model.best_iteration)

# Prediction vs. actual per validation row, ordered by row index.
pred_df = pd.DataFrame(sorted(zip(val_x.index, val_pred, val_y)), columns=['index', 'predict', 'actual'])

In [None]:
# lgb.plot_importance(model, figsize=(12,8), max_num_features=70, importance_type='gain')
# plt.tight_layout()
# plt.show()

<h3>Prediction</h3>

In [None]:
# Score the rows to submit, using the booster's best iteration.
test_pred = model.predict(test_x, num_iteration=model.best_iteration)

In [None]:
# Earlier variant, kept for reference:
# sub_df.iloc[:, 1:] = test_pred
# Attach the predictions to the submission frame as the 'score' column.
df_sub['score'] = test_pred

In [None]:
# Write only the id and score columns, without the index, to submission.csv.
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)  

In [None]:
# Display the submission frame, including the new 'score' column.
df_sub