In [None]:
# Install Detoxify from the attached dataset: copy the package sources next to the
# notebook, pip-install them from that local copy, then remove the copy again.
!cp -r ../input/detoxify-sourcemodels/detoxify .
!pip install -q ./detoxify
!rm -r ./detoxify


# Copy the pretrained Detoxify checkpoints and the Transformers configuration files
# from the dataset into the local torch-hub and huggingface cache directories.
!mkdir -p  /root/.cache/torch/hub/checkpoints
!mkdir -p  /root/.cache/huggingface/transformers
!cp -r ../input/detoxify-sourcemodels/torch/hub/checkpoints /root/.cache/torch/hub
!cp -r ../input/detoxify-sourcemodels/huggingface/transformers /root/.cache/huggingface


# Setting TRANSFORMERS_OFFLINE=1 tells Transformers to use local files only and not to
# look anything up online. This makes it usable in a firewalled or no-network
# environment such as a Kaggle inference kernel.
import os
import random
os.environ["TRANSFORMERS_OFFLINE"] = "1"

In [None]:
import os

# Directory that get_logger() writes its log file into by default.
OUTPUT_DIR = './'
# exist_ok=True creates the directory only when it is missing, without the
# separate existence check (and the small race window that check leaves open).
os.makedirs(OUTPUT_DIR, exist_ok=True)
    
class CFG:
    """Inference settings for the LUKE-large fold ensemble used in this cell."""

    # Where the trained artifacts are read from (tokenizer/, config.pth, fold checkpoints).
    model_dir = '../input/jigsaw4-luke-large-training-tito-cv-strategy/'
    model = "studio-ousia/luke-large"
    n_fold = 5
    seed = 42

    # DataLoader settings.
    num_workers = 4
    batch_size = 128

    # Input column and regression head.
    text = "text"
    target = "target"
    target_size = 1
    fc_dropout = 0.

    # prepare_input keeps the first `head` and the last `tail` token ids of an
    # over-long comment, so every encoded sequence has max_len entries.
    head = 32
    tail = 32
    max_len = head + tail

import os
import gc
import re
import sys
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# Swap the preinstalled transformers stack for the wheels shipped as datasets.
# --no-index / --find-links make pip resolve from the local cache directory only.
for _package in ('transformers', 'tokenizers', 'huggingface_hub'):
    # Best effort, as before: the exit status of the uninstall is not checked.
    os.system(f'pip uninstall -q {_package} -y')

# (package name, local wheel cache, wheel file) in the original install order.
_OFFLINE_WHEELS = (
    ('tokenizers',
     '/tmp/pip/cache-tokenizers/',
     '../input/tokenizers-0103/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl'),
    ('huggingface_hub',
     '/tmp/pip/cache-huggingface-hub/',
     '../input/huggingface-hub-008/huggingface_hub-0.0.8-py3-none-any.whl'),
    ('transformers',
     '/tmp/pip/cache-transformers/',
     '../input/transformers-470/transformers-4.7.0-py3-none-any.whl'),
)

for _package, _cache_dir, _wheel in _OFFLINE_WHEELS:
    for _command in (
        f'mkdir -p {_cache_dir}',
        f'cp {_wheel} {_cache_dir}',
        f'pip install -q --no-index --find-links {_cache_dir} {_package}',
    ):
        # os.system only reports failure through its return value. Checking it
        # here surfaces a missing dataset or a failed install immediately,
        # instead of as an ImportError in the imports that follow.
        if os.system(_command) != 0:
            raise RuntimeError(f'offline install step failed: {_command}')

# Imported after the pip reinstall above; the printed versions show which
# tokenizers / transformers builds were actually loaded.
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import LukeTokenizer, LukeModel, LukeConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_score(df):
    """Return the share of rows whose pair of predictions is ordered correctly.

    A row counts as correct when 'less_toxic_pred' is strictly smaller than
    'more_toxic_pred'. `df` must provide both columns and at least one row.
    """
    correctly_ordered = df['less_toxic_pred'] < df['more_toxic_pred']
    n_correct = int(correctly_ordered.sum())
    return n_correct / len(df)


def get_logger(filename=OUTPUT_DIR+'train'):
    """Return this module's logger, configured to emit bare messages at INFO level.

    Messages go to a StreamHandler and to the file ``<filename>.log``.

    Args:
        filename: Log file path without the ``.log`` extension.

    Returns:
        The configured ``logging.Logger``.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger(__name__) hands back the same object on every call, so handlers
    # from an earlier call (e.g. the cell being re-run) are still attached.
    # Drop them first; otherwise every message would be emitted once per call.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN kernels.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # NOTE: this is only picked up by interpreters started afterwards; string
    # hashing in the already-running kernel is unaffected.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not just the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick different kernels between runs; pin it off so the
    # result does not depend on whatever an earlier cell left this flag at.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=42)

# Load the comments to score and the sample submission file, then preview both.
test = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
submission = pd.read_csv('../input/jigsaw-toxic-severity-rating/sample_submission.csv')
print(test.shape, submission.shape)
display(test.head())
display(submission.head())

# Load the tokenizer stored under <model_dir>/tokenizer/ and attach it to CFG,
# which is how prepare_input reaches it (cfg.tokenizer).
CFG.tokenizer = LukeTokenizer.from_pretrained(CFG.model_dir+'tokenizer/')

def prepare_input(text, cfg):
    """Tokenize `text` into fixed-length long tensors of size ``cfg.max_len``.

    When ``cfg.tail == 0`` the tokenizer itself truncates and pads to
    ``cfg.max_len``. Otherwise the text is tokenized with the tokenizer's own
    truncation, and any field longer than ``cfg.max_len`` is reduced to its
    first ``cfg.head`` and last ``cfg.tail`` entries. Shorter fields are
    right-padded: ``input_ids`` with the pad token id, every other field with 0.

    Args:
        text: The string to encode.
        cfg: Object exposing ``tokenizer``, ``max_len``, ``head`` and ``tail``.

    Returns:
        The tokenizer output with every value replaced by a ``torch.long`` tensor.
    """
    if cfg.tail == 0:
        inputs = cfg.tokenizer.encode_plus(text,
                                           return_tensors=None,
                                           add_special_tokens=True,
                                           max_length=cfg.max_len,
                                           # replaces the deprecated pad_to_max_length=True
                                           padding='max_length',
                                           truncation=True)
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    inputs = cfg.tokenizer.encode_plus(text,
                                       return_tensors=None,
                                       add_special_tokens=True,
                                       truncation=True)
    for k, v in inputs.items():
        if len(v) > cfg.max_len:
            # Keep the beginning and the end of the sequence, drop the middle.
            v = np.hstack([v[:cfg.head], v[-cfg.tail:]])
        fill_value = cfg.tokenizer.pad_token_id if k == 'input_ids' else 0
        # Integer buffer from the start (no float round trip); the length is
        # re-read after truncation rather than relying on slice clamping.
        padded = np.full(cfg.max_len, fill_value, dtype=np.int64)
        padded[:len(v)] = v
        inputs[k] = torch.tensor(padded, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    """Dataset that yields the tokenized model inputs for one comment at a time."""

    def __init__(self, cfg, df):
        """Keep `cfg` and the text column named by ``cfg.text`` (missing values become "none")."""
        self.cfg = cfg
        text_column = df[cfg.text]
        self.text = text_column.fillna("none").values

    def __len__(self):
        """Number of comments held by the dataset."""
        return len(self.text)

    def __getitem__(self, item):
        """Return ``prepare_input`` applied to the comment at position `item`."""
        return prepare_input(str(self.text[item]), self.cfg)
    
class CustomModel(nn.Module):
    """LUKE encoder followed by mean pooling, dropout and a linear output layer."""

    def __init__(self, cfg, config_path=None, pretrained=False):
        """Build the encoder and the head.

        cfg         - settings object; reads `model`, `fc_dropout` and `target_size`.
        config_path - optional path to a saved config object; when omitted the
                      config is fetched by name with hidden states enabled.
        pretrained  - load pretrained encoder weights instead of a fresh LukeModel.
        """
        super().__init__()
        self.cfg = cfg

        if config_path is not None:
            # NOTE(review): torch.load unpickles the file, so only point this at
            # trusted artifacts.
            self.config = torch.load(config_path)
        else:
            self.config = LukeConfig.from_pretrained(cfg.model, output_hidden_states=True)

        self.model = (
            LukeModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else LukeModel(self.config)
        )
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)

    def feature(self, inputs):
        """Run the encoder and average its first output over dimension 1."""
        # presumably (batch, seq_len, hidden) -> (batch, hidden); padding
        # positions are included in the average.
        encoder_outputs = self.model(**inputs)
        return encoder_outputs[0].mean(dim=1)

    def forward(self, inputs):
        """Return the linear head's output for the pooled, dropout-regularised feature."""
        pooled = self.feature(inputs)
        return self.fc(self.fc_dropout(pooled))
    
def inference_fn(test_loader, model, device):
    """Run `model` over every batch of `test_loader` and return sigmoid outputs.

    The model is switched to eval mode and moved to `device`; each batch dict
    has its tensors moved to `device` in place. The per-batch results are
    concatenated into a single NumPy array in loader order.
    """
    model.eval()
    model.to(device)

    batch_predictions = []
    progress = tqdm(test_loader, total=len(test_loader))
    for inputs in progress:
        inputs.update({name: tensor.to(device) for name, tensor in inputs.items()})
        with torch.no_grad():
            logits = model(inputs)
        batch_predictions.append(torch.sigmoid(logits).to('cpu').numpy())

    return np.concatenate(batch_predictions)

# Score the test comments with each fold checkpoint and average the results.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    shuffle=False,      # keep row order aligned with `submission`
    drop_last=False,
    pin_memory=True,
)
config_path = CFG.model_dir+"config.pth"

predictions = []
for fold in range(CFG.n_fold):
    model = CustomModel(CFG, config_path=config_path, pretrained=False)
    # Checkpoints are loaded onto the CPU; inference_fn moves the model to `device`.
    state = torch.load(CFG.model_dir+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Release this fold's weights before the next checkpoint is loaded.
    del model, state
    gc.collect()
    torch.cuda.empty_cache()

# Element-wise mean over the folds.
submission['score'] = np.stack(predictions).mean(axis=0)
comments_luke = submission[['comment_id', 'score']].copy()
comments_luke

In [None]:
from detoxify import Detoxify
import pandas as pd
import numpy as np


# Detoxify usage notes:
#   * predict() accepts either a single string or a list of strings, e.g.
#       Detoxify('original').predict('example text 1')
#       Detoxify('unbiased').predict(['example text 1', 'example text 2'])
#   * the `device` argument selects where the model is placed (it defaults to
#     cpu) and accepts any torch.device input, e.g.
#       model = Detoxify('original', device='cuda')
#       results = model.predict(df.text.head(10).to_list())

# Smoke test: load each checkpoint on the GPU and score a harmless sentence.
for checkpoint_name in ('original', 'unbiased', 'multilingual'):
    model = Detoxify(checkpoint_name, device='cuda')
    print(model.predict('Hello World...'))

# `model` is the last checkpoint loaded above ('multilingual'); a list input
# gives one value per text for every label.
auxi = pd.DataFrame(model.predict(['Hello World...','Hello World...']))
auxi

In [None]:
# Detoxify checkpoint names; the cells below pick one with select[SEL].
select = ['original','unbiased','multilingual']


In [None]:
SEL = 0
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

model = Detoxify(select[SEL], device='cuda')

# Score a dummy sentence only to learn which label columns this checkpoint emits.
auxi = pd.DataFrame(model.predict(['Hello World...']))
my_list = auxi.columns.tolist()

# Create the label columns (placeholder 0.5) right after comment_id and text.
comments = comments.assign(**dict.fromkeys(my_list, 0.5))

# Score one comment at a time; the predicted values come back in label order.
texts = comments['text'].tolist()
for row, text in enumerate(texts):
    label_scores = model.predict(text)
    comments.loc[row, my_list] = np.fromiter(label_scores.values(), dtype=float)

comments

In [None]:
N_TRY = 600
validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Keep the validation pairs whose texts both occur in `comments` and attach the
# matching ids as comment_id_more / comment_id_less.
validation_annotations = validation.set_index('more_toxic').join(
    comments[['comment_id', 'text']].set_index('text'), how='inner').set_index('less_toxic').join(
    comments[['comment_id', 'text']].set_index('text'),
    lsuffix='_more', rsuffix='_less', how='inner').reset_index(drop=True)

# The label columns sit directly after comment_id and text. Extract them once
# instead of walking the frame with iterrows() in every trial.
features = comments.iloc[:, 2:(2 + len(my_list))].to_numpy(dtype=float)
row_of_id = {comment_id: pos for pos, comment_id in enumerate(comments['comment_id'])}
more_rows = validation_annotations['comment_id_more'].map(row_of_id).to_numpy(dtype=int)
less_rows = validation_annotations['comment_id_less'].map(row_of_id).to_numpy(dtype=int)

# Random search over unit-norm, non-negative weight vectors.
best_score = -np.inf
best_w = None
for _ in range(N_TRY):
    w = np.random.rand(len(my_list))
    w = w / np.linalg.norm(w)
    blended = features @ w
    percent_correct = 100.0 * (
        np.count_nonzero(blended[more_rows] > blended[less_rows])
        / len(validation_annotations))
    print(percent_correct)
    if percent_correct > best_score:
        best_score = percent_correct
        best_w = w

if best_w is None:
    raise ValueError('N_TRY must be at least 1')

print('----------')
print('----------')
print('----------')

print(best_w)
print(best_score)

# Score with the best weights found. The previous code used `w`, i.e. whatever
# the last random draw happened to be.
comments['score'] = features @ best_w

comments1 = comments[['comment_id', 'score']].copy()

comments1[['comment_id', 'score']]

In [None]:
SEL = 1
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

model = Detoxify(select[SEL], device='cuda')

# Score a dummy sentence only to learn which label columns this checkpoint emits.
auxi = pd.DataFrame(model.predict(['Hello World...']))
my_list = auxi.columns.tolist()

# Create the label columns (placeholder 0.5) right after comment_id and text.
comments = comments.assign(**dict.fromkeys(my_list, 0.5))

# Score one comment at a time; the predicted values come back in label order.
texts = comments['text'].tolist()
for row, text in enumerate(texts):
    label_scores = model.predict(text)
    comments.loc[row, my_list] = np.fromiter(label_scores.values(), dtype=float)

comments

In [None]:
N_TRY = 700
validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Keep the validation pairs whose texts both occur in `comments` and attach the
# matching ids as comment_id_more / comment_id_less.
validation_annotations = validation.set_index('more_toxic').join(
    comments[['comment_id', 'text']].set_index('text'), how='inner').set_index('less_toxic').join(
    comments[['comment_id', 'text']].set_index('text'),
    lsuffix='_more', rsuffix='_less', how='inner').reset_index(drop=True)

# The label columns sit directly after comment_id and text. Extract them once
# instead of walking the frame with iterrows() in every trial.
features = comments.iloc[:, 2:(2 + len(my_list))].to_numpy(dtype=float)
row_of_id = {comment_id: pos for pos, comment_id in enumerate(comments['comment_id'])}
more_rows = validation_annotations['comment_id_more'].map(row_of_id).to_numpy(dtype=int)
less_rows = validation_annotations['comment_id_less'].map(row_of_id).to_numpy(dtype=int)

# Random search over unit-norm, non-negative weight vectors.
best_score = -np.inf
best_w = None
for _ in range(N_TRY):
    w = np.random.rand(len(my_list))
    w = w / np.linalg.norm(w)
    blended = features @ w
    percent_correct = 100.0 * (
        np.count_nonzero(blended[more_rows] > blended[less_rows])
        / len(validation_annotations))
    print(percent_correct)
    if percent_correct > best_score:
        best_score = percent_correct
        best_w = w

if best_w is None:
    raise ValueError('N_TRY must be at least 1')

print('----------')
print('----------')
print('----------')

print(best_w)
print(best_score)

# Score with the best weights found. The previous code used `w`, i.e. whatever
# the last random draw happened to be.
comments['score'] = features @ best_w

comments2 = comments[['comment_id', 'score']].copy()

comments2[['comment_id', 'score']]

In [None]:
SEL = 2
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

model = Detoxify(select[SEL], device='cuda')

# Score a dummy sentence only to learn which label columns this checkpoint emits.
auxi = pd.DataFrame(model.predict(['Hello World...']))
my_list = auxi.columns.tolist()

# Create the label columns (placeholder 0.5) right after comment_id and text.
comments = comments.assign(**dict.fromkeys(my_list, 0.5))

# Score one comment at a time; the predicted values come back in label order.
texts = comments['text'].tolist()
for row, text in enumerate(texts):
    label_scores = model.predict(text)
    comments.loc[row, my_list] = np.fromiter(label_scores.values(), dtype=float)

comments

In [None]:
N_TRY = 700
validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Keep the validation pairs whose texts both occur in `comments` and attach the
# matching ids as comment_id_more / comment_id_less.
validation_annotations = validation.set_index('more_toxic').join(
    comments[['comment_id', 'text']].set_index('text'), how='inner').set_index('less_toxic').join(
    comments[['comment_id', 'text']].set_index('text'),
    lsuffix='_more', rsuffix='_less', how='inner').reset_index(drop=True)

# The label columns sit directly after comment_id and text. Extract them once
# instead of walking the frame with iterrows() in every trial.
features = comments.iloc[:, 2:(2 + len(my_list))].to_numpy(dtype=float)
row_of_id = {comment_id: pos for pos, comment_id in enumerate(comments['comment_id'])}
more_rows = validation_annotations['comment_id_more'].map(row_of_id).to_numpy(dtype=int)
less_rows = validation_annotations['comment_id_less'].map(row_of_id).to_numpy(dtype=int)

# Random search over unit-norm, non-negative weight vectors.
best_score = -np.inf
best_w = None
for _ in range(N_TRY):
    w = np.random.rand(len(my_list))
    w = w / np.linalg.norm(w)
    blended = features @ w
    percent_correct = 100.0 * (
        np.count_nonzero(blended[more_rows] > blended[less_rows])
        / len(validation_annotations))
    print(percent_correct)
    if percent_correct > best_score:
        best_score = percent_correct
        best_w = w

if best_w is None:
    raise ValueError('N_TRY must be at least 1')

print('----------')
print('----------')
print('----------')

print(best_w)
print(best_score)

# Score with the best weights found. The previous code used `w`, i.e. whatever
# the last random draw happened to be.
comments['score'] = features @ best_w

comments3 = comments[['comment_id', 'score']].copy()

comments3[['comment_id', 'score']]

In [None]:
import pandas as pd
import numpy as np

hspeech_raw = pd.read_csv('../input/measuring-hate-speech/measuring_hate_speech.csv')

# Mean hate_speech_score per comment_id (a comment_id can occur on several rows).
scores_dict = hspeech_raw.groupby('comment_id')['hate_speech_score'].apply(np.mean).to_dict()

# One row per comment_id (first occurrence), carrying that mean as its score.
hspeech = (
    hspeech_raw
    .drop_duplicates(subset='comment_id')
    .assign(hate_speech_score=lambda frame: frame['comment_id'].map(scores_dict))
    [['comment_id', 'text', 'hate_speech_score']]
)

print(hspeech.text.nunique())

# Downstream code expects the target column to be called 'y'.
hspeech = hspeech.rename(columns={'hate_speech_score': 'y'})

# Free the raw table and the lookup before continuing.
del scores_dict, hspeech_raw

import gc
gc.collect()

# Shuffle the rows with a fixed seed.
hspeech = hspeech.sample(frac=1,random_state=0).reset_index(drop=True)

import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, SGDRegressor, LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import rankdata

from sklearn.svm import LinearSVR


# ---- Hate-speech corpus: character n-gram term frequencies (no IDF) + Ridge ----
jr = hspeech.copy()
df = jr[['text', 'y']]
vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 5), min_df=1, max_df=0.7, use_idf=False)
X = vec.fit_transform(df['text'])
z = df["y"].values
y = np.around(z, decimals=2)    # targets rounded to two decimals

model1 = Ridge(alpha=0.5).fit(X, y)
print(model1.score(X, y))       # R^2 on the training data itself

# (A LinearSVR on this corpus was tried and is disabled; 'svr1' is therefore never produced.)

df_test = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

test = vec.transform(df_test['text'])
jr_preds = model1.predict(test)

# Only the ordering is kept: predictions are replaced by their ordinal rank.
df_test['score1'] = rankdata(jr_preds, method='ordinal')

# ---- Ruddit corpus: same recipe with its own vectorizer, plus a LinearSVR ----
rud_df = pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
print(f"rud_df:{rud_df.shape}")
rud_df['y'] = rud_df["offensiveness_score"]
df = rud_df[['txt', 'y']].rename(columns={'txt': 'text'})
vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 4), min_df=3, max_df=0.7, use_idf=False)
X = vec.fit_transform(df['text'])
z = df["y"].values
y = np.around(z, decimals=1)    # targets rounded to one decimal

model1 = Ridge(alpha=0.5).fit(X, y)
print(model1.score(X, y))

model11 = LinearSVR(random_state=0, C=20.0, max_iter=10000).fit(X, y)
print(model11.score(X, y))

test = vec.transform(df_test['text'])
rud_preds = model1.predict(test)

df_test['score2'] = rankdata(rud_preds, method='ordinal')

auxi = model11.predict(test)
df_test['svr2'] = rankdata(auxi, method='ordinal')

N_TRY = 300
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

comments['score1'] = df_test['score1']
#comments['score2'] = df_test['svr1']
comments['score3'] = df_test['score2']
comments['score4'] = df_test['svr2']

# The three rank columns that get blended (previously a same-length placeholder list).
my_list = ['score1', 'score3', 'score4']


validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Keep the validation pairs whose texts both occur in `comments` and attach the
# matching ids as comment_id_more / comment_id_less.
validation_annotations = validation.set_index('more_toxic').join(
    comments[['comment_id', 'text']].set_index('text'), how='inner').set_index('less_toxic').join(
    comments[['comment_id', 'text']].set_index('text'),
    lsuffix='_more', rsuffix='_less', how='inner').reset_index(drop=True)

# Extract the rank columns once instead of walking the frame with iterrows()
# in every trial.
features = comments[my_list].to_numpy(dtype=float)
row_of_id = {comment_id: pos for pos, comment_id in enumerate(comments['comment_id'])}
more_rows = validation_annotations['comment_id_more'].map(row_of_id).to_numpy(dtype=int)
less_rows = validation_annotations['comment_id_less'].map(row_of_id).to_numpy(dtype=int)

# Random search over unit-norm, non-negative weight vectors.
best_score = -np.inf
best_w = None
for _ in range(N_TRY):
    w = np.random.rand(len(my_list))
    w = w / np.linalg.norm(w)
    blended = features @ w
    percent_correct = 100.0 * (
        np.count_nonzero(blended[more_rows] > blended[less_rows])
        / len(validation_annotations))
    print(percent_correct)
    if percent_correct > best_score:
        best_score = percent_correct
        best_w = w

if best_w is None:
    raise ValueError('N_TRY must be at least 1')

print('----------')
print('----------')
print('----------')

print(best_w)
print(best_score)

# Score with the best weights found. The previous code used `w`, i.e. whatever
# the last random draw happened to be.
comments['score'] = features @ best_w

df_test0b = comments[['comment_id', 'score']].copy()
df_test0b

import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, SGDRegressor, LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import rankdata

from sklearn.svm import LinearSVR


# ---- Hate-speech corpus: character n-gram TF-IDF + Ridge ----
jr = hspeech.copy()
df = jr[['text', 'y']]
vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 5), min_df=1, max_df=0.7)
X = vec.fit_transform(df['text'])
z = df["y"].values
y = np.around(z, decimals=2)    # targets rounded to two decimals

model1 = Ridge(alpha=0.5).fit(X, y)
print(model1.score(X, y))       # R^2 on the training data itself

# (A LinearSVR on this corpus was tried and is disabled; 'svr1' is therefore never produced.)

df_test = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

test = vec.transform(df_test['text'])
jr_preds = model1.predict(test)

# Only the ordering is kept: predictions are replaced by their ordinal rank.
df_test['score1'] = rankdata(jr_preds, method='ordinal')

# ---- Ruddit corpus: same recipe with its own vectorizer, plus a LinearSVR ----
rud_df = pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
print(f"rud_df:{rud_df.shape}")
rud_df['y'] = rud_df["offensiveness_score"]
df = rud_df[['txt', 'y']].rename(columns={'txt': 'text'})
vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 4), min_df=3, max_df=0.7)
X = vec.fit_transform(df['text'])
z = df["y"].values
y = np.around(z, decimals=1)    # targets rounded to one decimal

model1 = Ridge(alpha=0.5).fit(X, y)
print(model1.score(X, y))

model11 = LinearSVR(random_state=0, C=20.0, max_iter=10000).fit(X, y)
print(model11.score(X, y))

test = vec.transform(df_test['text'])
rud_preds = model1.predict(test)

df_test['score2'] = rankdata(rud_preds, method='ordinal')

auxi = model11.predict(test)
df_test['svr2'] = rankdata(auxi, method='ordinal')

N_TRY = 300
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

comments['score1'] = df_test['score1']
#comments['score2'] = df_test['svr1']
comments['score3'] = df_test['score2']
comments['score4'] = df_test['svr2']

# The three rank columns that get blended (previously a same-length placeholder list).
my_list = ['score1', 'score3', 'score4']


validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Keep the validation pairs whose texts both occur in `comments` and attach the
# matching ids as comment_id_more / comment_id_less.
validation_annotations = validation.set_index('more_toxic').join(
    comments[['comment_id', 'text']].set_index('text'), how='inner').set_index('less_toxic').join(
    comments[['comment_id', 'text']].set_index('text'),
    lsuffix='_more', rsuffix='_less', how='inner').reset_index(drop=True)

# Extract the rank columns once instead of walking the frame with iterrows()
# in every trial.
features = comments[my_list].to_numpy(dtype=float)
row_of_id = {comment_id: pos for pos, comment_id in enumerate(comments['comment_id'])}
more_rows = validation_annotations['comment_id_more'].map(row_of_id).to_numpy(dtype=int)
less_rows = validation_annotations['comment_id_less'].map(row_of_id).to_numpy(dtype=int)

# Random search over unit-norm, non-negative weight vectors.
best_score = -np.inf
best_w = None
for _ in range(N_TRY):
    w = np.random.rand(len(my_list))
    w = w / np.linalg.norm(w)
    blended = features @ w
    percent_correct = 100.0 * (
        np.count_nonzero(blended[more_rows] > blended[less_rows])
        / len(validation_annotations))
    print(percent_correct)
    if percent_correct > best_score:
        best_score = percent_correct
        best_w = w

if best_w is None:
    raise ValueError('N_TRY must be at least 1')

print('----------')
print('----------')
print('----------')

print(best_w)
print(best_score)

# Score with the best weights found. The previous code used `w`, i.e. whatever
# the last random draw happened to be.
comments['score'] = features @ best_w

df_test0 = comments[['comment_id', 'score']].copy()
df_test0

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import re 
import scipy
from scipy import sparse

from IPython.display import display
from pprint import pprint
from matplotlib import pyplot as plt 

import time
import scipy.optimize as optimize
import warnings
warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100

from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Ridge, Lasso, BayesianRidge
from sklearn.svm import SVR


df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

# Work on a copy of the hate-speech table; the rename only has an effect if a
# 'txt' column is present.
df_train = hspeech.copy().rename(columns={'txt':'text'})

def text_cleaning(text):
    '''
    Reduce a raw comment to space-separated runs of letters (a-z, A-Z) and digits.

    Steps, in this order:
    1. Drop URLs (http://, https:// and www. forms).
    2. Parse the rest with BeautifulSoup (lxml) and keep only its text, which discards HTML tags.
    3. Delete emoji and the other characters covered by the ranges listed below.
    4. Turn every remaining character that is not a letter a-z/A-Z or a digit into a space.
    5. Collapse runs of spaces and trim both ends.

    text - the string to clean.
    Returns the cleaned string.
    '''
    without_links = re.sub(r'https?://\S+|www\.\S+', r'', text)

    without_markup = BeautifulSoup(without_links, 'lxml').get_text()

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    without_emoji = emoji_pattern.sub(r'', without_markup)

    alnum_only = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)
    single_spaced = re.sub(' +', ' ', alnum_only)
    return single_spaced.strip()

tqdm.pandas()   # enables .progress_apply on pandas objects

# ---- Train: cleaned hate-speech text -> char n-gram TF-IDF -> three Ridge models ----
df_train['text'] = df_train['text'].progress_apply(text_cleaning)
df = df_train.copy()

vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5), min_df=3, max_df=0.5)
X = vec.fit_transform(df['text'])
# Same features and targets, three regularisation strengths.
model, l_model, s_model = [Ridge(alpha=strength).fit(X, df['y']) for strength in (0.5, 1., 2.)]

# ---- Validate: how often does `model` rank the more toxic comment higher? ----
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
for side in ('less_toxic', 'more_toxic'):
    df_val[side] = df_val[side].progress_apply(text_cleaning)
X_less_toxic = vec.transform(df_val['less_toxic'])
X_more_toxic = vec.transform(df_val['more_toxic'])
p1 = model.predict(X_less_toxic)
p2 = model.predict(X_more_toxic)
# Validation Accuracy
print(f'val : {(p1 < p2).mean()}')

# ---- Predict: average the three models on the comments to score ----
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df_sub['text'] = df_sub['text'].progress_apply(text_cleaning)
X_test = vec.transform(df_sub['text'])
p3, p4, p5 = (fitted.predict(X_test) for fitted in (model, l_model, s_model))
df_sub['score'] = (p3 + p4 + p5) / 3.

df_test0000 = df_sub.copy()

# (Rank-transforming the score was tried here and is disabled.)
df_test0000

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import re 
import scipy
from scipy import sparse

from IPython.display import display
from pprint import pprint
from matplotlib import pyplot as plt 

import time
import scipy.optimize as optimize
import warnings
warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100

from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Ridge, Lasso, BayesianRidge
from sklearn.svm import SVR

df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

# Work on a copy of the hate-speech table; the rename only has an effect if a
# 'txt' column is present.
df_train = hspeech.copy().rename(columns={'txt':'text'})

def text_cleaning(text):
    '''
    Reduce a raw comment to space-separated runs of letters (a-z, A-Z) and digits.

    Steps, in this order:
    1. Drop URLs (http://, https:// and www. forms).
    2. Parse the rest with BeautifulSoup (lxml) and keep only its text, which discards HTML tags.
    3. Delete emoji and the other characters covered by the ranges listed below.
    4. Turn every remaining character that is not a letter a-z/A-Z or a digit into a space.
    5. Collapse runs of spaces and trim both ends.

    text - the string to clean.
    Returns the cleaned string.
    '''
    without_links = re.sub(r'https?://\S+|www\.\S+', r'', text)

    without_markup = BeautifulSoup(without_links, 'lxml').get_text()

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    without_emoji = emoji_pattern.sub(r'', without_markup)

    alnum_only = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)
    single_spaced = re.sub(' +', ' ', alnum_only)
    return single_spaced.strip()

tqdm.pandas()   # enables .progress_apply on pandas objects

# ---- Train: cleaned hate-speech text -> char n-gram TF (no IDF) -> three Ridge models ----
df_train['text'] = df_train['text'].progress_apply(text_cleaning)
df = df_train.copy()

vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5), min_df=3, max_df=0.5, use_idf=False)
X = vec.fit_transform(df['text'])
# Same features and targets, three regularisation strengths.
model, l_model, s_model = [Ridge(alpha=strength).fit(X, df['y']) for strength in (0.5, 1., 2.)]

# ---- Validate: how often does `model` rank the more toxic comment higher? ----
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
for side in ('less_toxic', 'more_toxic'):
    df_val[side] = df_val[side].progress_apply(text_cleaning)
X_less_toxic = vec.transform(df_val['less_toxic'])
X_more_toxic = vec.transform(df_val['more_toxic'])
p1 = model.predict(X_less_toxic)
p2 = model.predict(X_more_toxic)
# Validation Accuracy
print(f'val : {(p1 < p2).mean()}')

# ---- Predict: average the three models on the comments to score ----
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df_sub['text'] = df_sub['text'].progress_apply(text_cleaning)
X_test = vec.transform(df_sub['text'])
p3, p4, p5 = (fitted.predict(X_test) for fitted in (model, l_model, s_model))
df_sub['score'] = (p3 + p4 + p5) / 3.


df_test0000b = df_sub.copy()

# (Rank-transforming the score was tried here and is disabled.)
df_test0000b

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import re 
import scipy
from scipy import sparse

from IPython.display import display
from pprint import pprint
from matplotlib import pyplot as plt 

import time
import scipy.optimize as optimize
import warnings
warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100

from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Ridge, Lasso, BayesianRidge
from sklearn.svm import SVR


df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")


# Ruddit corpus: keep the comment text and use offensiveness_score as target 'y'.
df_train = pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
print(f"rud_df:{df_train.shape}")
df_train = (
    df_train
    .assign(y=df_train["offensiveness_score"])
    .loc[:, ['txt', 'y']]
    .rename(columns={'txt': 'text'})
)



def text_cleaning(text):
    '''
    Reduce a raw comment to space-separated runs of letters (a-z, A-Z) and digits.

    Steps, in this order:
    1. Drop URLs (http://, https:// and www. forms).
    2. Parse the rest with BeautifulSoup (lxml) and keep only its text, which discards HTML tags.
    3. Delete emoji and the other characters covered by the ranges listed below.
    4. Turn every remaining character that is not a letter a-z/A-Z or a digit into a space.
    5. Collapse runs of spaces and trim both ends.

    text - the string to clean.
    Returns the cleaned string.
    '''
    without_links = re.sub(r'https?://\S+|www\.\S+', r'', text)

    without_markup = BeautifulSoup(without_links, 'lxml').get_text()

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    without_emoji = emoji_pattern.sub(r'', without_markup)

    alnum_only = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)
    single_spaced = re.sub(' +', ' ', alnum_only)
    return single_spaced.strip()

tqdm.pandas()   # enables .progress_apply on pandas objects

# ---- Train: cleaned Ruddit text -> char n-gram TF-IDF -> three Ridge models ----
df_train['text'] = df_train['text'].progress_apply(text_cleaning)
df = df_train.copy()

vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5), min_df=3, max_df=0.5)
X = vec.fit_transform(df['text'])
# Same features and targets, three regularisation strengths.
model, l_model, s_model = [Ridge(alpha=strength).fit(X, df['y']) for strength in (0.5, 1., 2.)]

# ---- Validate: how often does `model` rank the more toxic comment higher? ----
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
for side in ('less_toxic', 'more_toxic'):
    df_val[side] = df_val[side].progress_apply(text_cleaning)
X_less_toxic = vec.transform(df_val['less_toxic'])
X_more_toxic = vec.transform(df_val['more_toxic'])
p1 = model.predict(X_less_toxic)
p2 = model.predict(X_more_toxic)
# Validation Accuracy
print(f'val : {(p1 < p2).mean()}')

# ---- Predict: average the three models on the comments to score ----
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df_sub['text'] = df_sub['text'].progress_apply(text_cleaning)
X_test = vec.transform(df_sub['text'])
p3, p4, p5 = (fitted.predict(X_test) for fitted in (model, l_model, s_model))
df_sub['score'] = (p3 + p4 + p5) / 3.

df_test00 = df_sub.copy()


# (Rank-transforming the score was tried here and is disabled.)
df_test00


import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import re 
import scipy
from scipy import sparse

from IPython.display import display
from pprint import pprint
from matplotlib import pyplot as plt 

import time
import scipy.optimize as optimize
import warnings
# Suppress every Python warning from here on.
warnings.filterwarnings("ignore")
# Raise the pandas display limits: up to 300 characters per cell and 100 columns.
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100

from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Ridge, Lasso, BayesianRidge
from sklearn.svm import SVR

# Competition comments that will eventually be scored.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

# Ruddit training data, reduced to the comment text and its offensiveness score,
# renamed to the (text, y) schema used by the modelling code below.
df_train = pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
print(f"rud_df:{df_train.shape}")
df_train = df_train[['txt', 'offensiveness_score']].rename(
    columns={'txt': 'text', 'offensiveness_score': 'y'}
)

def text_cleaning(text):
    '''
    Reduce a raw comment to space-separated alphanumeric tokens.

    Steps, in the order they are applied:
    1. Strip embedded URL links (http://, https:// and www. forms)
    2. Strip HTML tags, keeping only the visible text
    3. Strip emojis
    4. Replace every remaining character that is not a letter a-z/A-Z or a digit with a space
    5. Collapse runs of spaces and trim both ends

    text - Text piece to be cleaned.
    Returns the cleaned string.
    '''
    # Links go first so that their fragments do not survive as tokens.
    text = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # Let the HTML parser drop the markup and keep the text content.
    text = BeautifulSoup(text, 'lxml').get_text()

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    # After this substitution the string holds only letters, digits and spaces ...
    text = re.sub(r"[^a-zA-Z\d]", " ", text)
    # ... so splitting on whitespace and re-joining collapses the runs and trims the ends.
    return ' '.join(text.split())

# Register DataFrame/Series.progress_apply once; it stays available for the rest of this block.
tqdm.pandas()

# Clean the Ruddit training text in place, then fit on a working copy.
df_train['text'] = df_train['text'].progress_apply(text_cleaning)
df = df_train.copy()

# Character n-gram TF-IDF features shared by three Ridge regressors that differ only in alpha.
vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5), min_df=3, max_df=0.5)
X = vec.fit_transform(df['text'])
model = Ridge(alpha=0.5).fit(X, df['y'])
l_model = Ridge(alpha=1.).fit(X, df['y'])
s_model = Ridge(alpha=2.).fit(X, df['y'])

# Pairwise sanity check on the validation pairs (scored with the alpha=0.5 model only).
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
for pair_col in ('less_toxic', 'more_toxic'):
    df_val[pair_col] = df_val[pair_col].progress_apply(text_cleaning)
X_less_toxic = vec.transform(df_val['less_toxic'])
X_more_toxic = vec.transform(df_val['more_toxic'])
p1 = model.predict(X_less_toxic)
p2 = model.predict(X_more_toxic)
# Validation Accuracy: share of pairs where the "more toxic" comment scores higher.
print(f'val : {(p1 < p2).mean()}')

# Score the competition comments with the plain average of the three Ridge models.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df_sub['text'] = df_sub['text'].progress_apply(text_cleaning)
X_test = vec.transform(df_sub['text'])
p3 = model.predict(X_test)
p4 = l_model.predict(X_test)
p5 = s_model.predict(X_test)
df_sub['score'] = (p3 + p4 + p5) / 3.

# Snapshot of this ensemble's predictions for the later blending stages.
df_test00b = df_sub.copy()
df_test00b

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.stats import rankdata
import os

from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.feature_extraction.text import TfidfVectorizer

# --- Data ---------------------------------------------------------------------------
test_df = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv")
valid_df = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv")
train_df=pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")

train = train_df[["txt", "offensiveness_score"]]

# --- Model family 1: Ruddit text, char n-gram TF-IDF, linear regression + linear SVR --
tfvec = TfidfVectorizer(analyzer = 'char_wb', ngram_range = (3,5))
tfv = tfvec.fit_transform(train["txt"])

X=tfv
Y=train['offensiveness_score']

reg = LinearRegression()
reg.fit(X, Y)
print(reg.score(X,Y))  # R^2 on the training data itself

model11 = LinearSVR(random_state=0, C=20.0, max_iter=10000).fit(X, Y)
print(model11.score(X,Y))

tfv_comments = tfvec.transform(test_df["text"])
pred1 = reg.predict(tfv_comments)
auxi1 = model11.predict(tfv_comments)

# --- Model family 2: `hspeech` frame (built earlier in the notebook), Ridge ---------
data2 = hspeech.copy()
df2 = data2[['text', 'y']]

vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 5))
X = vec.fit_transform(df2['text'])
w = df2["y"].values
y = np.around (w ,decimals = 2)  # targets rounded to two decimals before fitting

from sklearn.linear_model import Ridge
reg2 = Ridge(alpha=0.3).fit(X, y)
print(reg2.score(X,y))

test=vec.transform(test_df['text'])
pred2=reg2.predict(test)

# --- Collect raw predictions and their ordinal ranks --------------------------------
sub = pd.DataFrame({
    "comment_id": test_df["comment_id"],
    "score": pred1 + pred2,
    "svr": auxi1,
})

df_test = sub.assign(
    score1=rankdata(pred1, method='ordinal'),
    svr1=rankdata(auxi1, method='ordinal'),
    score2=rankdata(pred2, method='ordinal'),
)

df_test000 = df_test.copy()
df_test000

# Random search for blend weights over the three rank columns produced above.
N_TRY = 300
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

comments['score1'] = df_test000['score1']
comments['score2'] = df_test000['svr1']
comments['score3'] = df_test000['score2']

# Only the length of this list is used: one blend weight per score column.
my_list = ['a','b','c']

validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Attach comment ids to both sides of every validation pair by exact text match;
# pairs whose texts are not among the comments to score are dropped (inner joins).
validation_annotations = validation.set_index('more_toxic').join(
    comments[['comment_id', 'text']].set_index('text'), how='inner').set_index('less_toxic').join(
    comments[['comment_id', 'text']].set_index('text'),
    lsuffix='_more', rsuffix='_less', how='inner').reset_index(drop=True)

# The score columns sit right after `comment_id` and `text`, i.e. at positions 2 .. 2+k.
# Pulling them out once turns every trial into a single matrix-vector product.
score_matrix = comments.iloc[:, 2:(2 + len(my_list))].to_numpy(dtype=float)
more_ids = validation_annotations['comment_id_more']
less_ids = validation_annotations['comment_id_less']

best_score = 0
best_w = None
for _ in range(N_TRY):
    # Random non-negative weights, scaled to unit L2 norm.
    w = np.random.rand(len(my_list))
    w = w / np.linalg.norm(w)
    scores = dict(zip(comments['comment_id'], score_matrix @ w))
    # Percentage of validation pairs ranked in the annotated order.
    percent_correct = 100.0 * (
        more_ids.map(scores).gt(less_ids.map(scores)).sum()
        / len(validation_annotations))
    print(percent_correct)
    # The first trial is always accepted so that best_w is a real weight vector.
    if best_w is None or percent_correct > best_score:
        best_score = percent_correct
        best_w = w


print('----------')
print('----------')
print('----------')

print(best_w)
print(best_score)

# Blend with the best weights found, not with `w` (the last random draw of the loop).
comments['score'] = score_matrix @ best_w

df_test000 = comments[['comment_id', 'score']].copy()
df_test000

from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# --- Data ---------------------------------------------------------------------------
test_df = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv")
valid_df = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv")
train_df=pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")

train = train_df[["txt", "offensiveness_score"]]

# --- Model family 1: Ruddit text, char n-gram term frequencies (IDF switched off) ----
tfvec = TfidfVectorizer(analyzer = 'char_wb', ngram_range = (3,5), use_idf = False)
tfv = tfvec.fit_transform(train["txt"])

X=tfv
Y=train['offensiveness_score']

reg = LinearRegression()
reg.fit(X, Y)
print(reg.score(X,Y))  # R^2 on the training data itself

model11 = LinearSVR(random_state=0, C=20.0, max_iter=10000).fit(X, Y)
print(model11.score(X,Y))

tfv_comments = tfvec.transform(test_df["text"])
pred1 = reg.predict(tfv_comments)
auxi1 = model11.predict(tfv_comments)

# --- Model family 2: `hspeech` frame (built earlier in the notebook), Ridge ---------
data2 = hspeech.copy()
df2 = data2[['text', 'y']]

vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 5), use_idf = False)
X = vec.fit_transform(df2['text'])
w = df2["y"].values
y = np.around (w ,decimals = 2)  # targets rounded to two decimals before fitting

from sklearn.linear_model import Ridge
reg2 = Ridge(alpha=0.3).fit(X, y)
print(reg2.score(X,y))

test=vec.transform(test_df['text'])
pred2=reg2.predict(test)

# --- Collect raw predictions and their ordinal ranks --------------------------------
sub = pd.DataFrame({
    "comment_id": test_df["comment_id"],
    "score": pred1 + pred2,
    "svr": auxi1,
})

df_test = sub.assign(
    score1=rankdata(pred1, method='ordinal'),
    svr1=rankdata(auxi1, method='ordinal'),
    score2=rankdata(pred2, method='ordinal'),
)

df_test000b = df_test.copy()
df_test000b

# Random search for blend weights over the three rank columns produced above.
N_TRY = 300
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

comments['score1'] = df_test000b['score1']
comments['score2'] = df_test000b['svr1']
comments['score3'] = df_test000b['score2']

# Only the length of this list is used: one blend weight per score column.
my_list = ['a','b','c']

validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Attach comment ids to both sides of every validation pair by exact text match;
# pairs whose texts are not among the comments to score are dropped (inner joins).
validation_annotations = validation.set_index('more_toxic').join(
    comments[['comment_id', 'text']].set_index('text'), how='inner').set_index('less_toxic').join(
    comments[['comment_id', 'text']].set_index('text'),
    lsuffix='_more', rsuffix='_less', how='inner').reset_index(drop=True)

# The score columns sit right after `comment_id` and `text`, i.e. at positions 2 .. 2+k.
# Pulling them out once turns every trial into a single matrix-vector product.
score_matrix = comments.iloc[:, 2:(2 + len(my_list))].to_numpy(dtype=float)
more_ids = validation_annotations['comment_id_more']
less_ids = validation_annotations['comment_id_less']

best_score = 0
best_w = None
for _ in range(N_TRY):
    # Random non-negative weights, scaled to unit L2 norm.
    w = np.random.rand(len(my_list))
    w = w / np.linalg.norm(w)
    scores = dict(zip(comments['comment_id'], score_matrix @ w))
    # Percentage of validation pairs ranked in the annotated order.
    percent_correct = 100.0 * (
        more_ids.map(scores).gt(less_ids.map(scores)).sum()
        / len(validation_annotations))
    print(percent_correct)
    # The first trial is always accepted so that best_w is a real weight vector.
    if best_w is None or percent_correct > best_score:
        best_score = percent_correct
        best_w = w


print('----------')
print('----------')
print('----------')

print(best_w)
print(best_score)

# Blend with the best weights found, not with `w` (the last random draw of the loop).
comments['score'] = score_matrix @ best_w

df_test000b = comments[['comment_id', 'score']].copy()
df_test000b

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
import re 
import scipy
from scipy import sparse
import gc 
from IPython.display import display, HTML
from pprint import pprint
import warnings
warnings.filterwarnings("ignore")

# Work on a copy so the `hspeech` frame itself is left untouched.
df = hspeech.copy()

n_folds = 7

# Each "fold" is an independent 70% sample of the rows (seeded by the fold number),
# written to disk so that it can be re-loaded by the training loop later on.
for fld in range(n_folds):
    print(f'Fold: {fld}')
    tmp_df = df.sample(random_state=fld, frac=0.7)
    tmp_df.to_csv(f'/kaggle/working/df_fld{fld}.csv', index=False)
    
import nltk
from nltk.corpus import stopwords
# English stop-word list from NLTK; clean() filters these tokens out of the text.
stop = stopwords.words('english')
# Shared WordNet lemmatizer instance used by lemmatize_text().
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Return a list with the WordNet lemma of every item obtained by iterating ``text``.

    Uses the module-level ``lemmatizer``. ``text`` is iterated as-is, so callers
    presumably pass an already tokenised sequence -- TODO confirm against callers.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def clean(data, col, stop_words=None):
    """Normalise the text in ``data[col]`` in place and return ``data``.

    Applied in order: expand common English contractions, pad newlines and
    word-internal punctuation with spaces, shorten long runs of repeated
    punctuation and letters, collapse runs of spaces, and drop stop words.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column. It is modified in place.
    col : str
        Name of the string column to clean.
    stop_words : iterable of str, optional
        Tokens to remove. Defaults to the module-level ``stop`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``data[col]`` replaced by the cleaned text.
    """
    if stop_words is None:
        stop_words = stop
    # A set makes the per-token membership test O(1) instead of scanning a list.
    stop_set = set(stop_words)

    # Order matters: "can't" has to be expanded before the generic "n't" rule.
    contractions = [
        (r"what's", "what is "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r"\'scuse", " excuse "),
        (r"\'s", " "),
    ]
    punctuation_and_repeats = [
        # Separate word<punct>word, e.g. "how/are" -> "how / are"
        (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3'),
        # Replace repeating characters more than 3 times to length of 3
        (r'([*!?\'])\1\1{2,}', r'\1\1\1'),
        # Add space around repeating characters
        (r'([*!?\']+)', r' \1 '),
        # Letter repeated 3+ times at the end of a word -> 2 letters
        (r'([a-zA-Z])\1{2,}\b', r'\1\1'),
        # Letter repeated 4+ times inside a word -> 3 letters
        (r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1'),
    ]

    text = data[col]
    # regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats the
    # pattern as a literal string by default, which would silently disable every rule.
    for pattern, replacement in contractions:
        text = text.str.replace(pattern, replacement, regex=True)

    # Clean some punctutations
    text = text.str.replace('\n', ' \n ', regex=False)
    for pattern, replacement in punctuation_and_repeats:
        text = text.str.replace(pattern, replacement, regex=True)

    text = text.str.replace(r'[ ]{2,}', ' ', regex=True).str.strip()
    data[col] = text.apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop_set))

    return data

# Smoke test for clean(): a handful of strings exercising newlines, slashes,
# repeated punctuation and stretched-out letters.
sample_texts = [
    "heyy\n\nkkdsfj",
    "hi   how/are/you ???",
    "hey?????",
    "noooo!!!!!!!!!   comeone !! ",
    "cooooooooool     brooooooooooo  coool brooo",
    "naaaahhhhhhh",
]
test_clean_df = pd.DataFrame({"text": sample_texts})
display(test_clean_df)
clean(test_clean_df,'text')

# Clean the text column of the working frame, then dump a second family of folds.
df = clean(df,'text')

n_folds = 7

# Independent 70% samples; seeds are offset by 55 so they differ from the raw folds.
for fld in range(n_folds):
    tmp_df = df.sample(random_state=fld+55, frac=0.7)
    tmp_df.to_csv(f'/kaggle/working/df_clean_fld{fld}.csv', index=False)

    print(tmp_df.shape)
    print(tmp_df['y'].value_counts())

# Release the frames; the folds are re-read from disk when needed.
del df,tmp_df
gc.collect()

# Ruddit data reduced to the (text, y) schema.
df_ = pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
print(df_.shape)

df_ = df_[['txt', 'offensiveness_score']].rename(
    columns={'txt': 'text', 'offensiveness_score':'y'})

# Min-max scale the target to the [0, 1] range.
_y_min = df_['y'].min()
_y_max = df_['y'].max()
df_['y'] = (df_['y'] - _y_min) / (_y_max - _y_min)

n_folds = 7

# Independent 70% samples; seeds are offset by 555 so they differ from the other fold families.
for fld in range(n_folds):
    print(f'Fold: {fld}')
    tmp_df = df_.sample(random_state=fld+555, frac=0.7)
    tmp_df.to_csv(f'/kaggle/working/df2_fld{fld}.csv', index=False)
    print(tmp_df.shape)
    print(tmp_df['y'].value_counts())

del tmp_df, df_
gc.collect()


# Fresh copy of the comments to score (uncleaned text) for the fold pipelines below.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

class LengthUpperTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer producing one feature: the share of uppercase characters.

    For every string in ``X`` the feature is ``(# uppercase characters) / len(string)``.
    Empty strings yield 0.0 instead of raising ``ZeroDivisionError``.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the transformer can sit in a pipeline."""
        return self

    @staticmethod
    def _upper_ratio(text):
        """Return the fraction of characters in ``text`` that are uppercase (0.0 if empty)."""
        if not text:
            return 0.0
        return sum(1 for ch in text if ch.isupper()) / len(text)

    def transform(self, X):
        """Return a sparse matrix with one row per string in ``X`` and a single column."""
        return sparse.csr_matrix([[self._upper_ratio(x)] for x in X])

    def get_feature_names(self):
        """Feature name list (API used by older scikit-learn releases)."""
        return ["lngth_uppercase"]

    def get_feature_names_out(self, input_features=None):
        """Same name via the newer scikit-learn feature-name API."""
        return np.asarray(self.get_feature_names(), dtype=object)
    
# Per-fold prediction buffers. The two validation buffers stay zero-filled because the
# validation predictions are currently switched off; only test predictions are collected.
val_preds_arr1 = np.zeros((df_val.shape[0], n_folds))
val_preds_arr2 = np.zeros((df_val.shape[0], n_folds))
test_preds_arr = np.zeros((df_sub.shape[0], n_folds))

for fld in range(n_folds):
    print("\n\n")
    print(f' ****************************** FOLD: {fld} ******************************')
    # 70% sample of the raw `hspeech` frame written to disk earlier in the notebook.
    df = pd.read_csv(f'/kaggle/working/df_fld{fld}.csv')
    print(df.shape)

    features = FeatureUnion([
        ("vect3", TfidfVectorizer(min_df= 3, max_df=0.5, analyzer = 'char_wb', ngram_range = (3,5))),

    ])

    pipeline = Pipeline(
        [
            ("features", features),
            ("clf", Ridge())
        ]
    )
    print("\nTrain:")
    # Train the pipeline
    pipeline.fit(df['text'], df['y'])

    # What are the important features for toxicity
    # get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
    # when it exists and fall back to the old method on older releases.
    fitted_features = pipeline['features']
    if hasattr(fitted_features, 'get_feature_names_out'):
        feature_names = [str(name) for name in fitted_features.get_feature_names_out()]
    else:
        feature_names = fitted_features.get_feature_names()

    print('\nTotal number of features:', len(feature_names) )

    # Features paired with their Ridge coefficient, largest coefficient first.
    feature_wts = sorted(list(zip(feature_names,
                                  np.round(pipeline['clf'].coef_,2) )),
                         key = lambda x:x[1],
                         reverse=True)

    pprint(feature_wts[:30])

    print("\npredict test data ")
    test_preds_arr[:,fld] = pipeline.predict(df_sub['text'])
    
# Per-fold prediction buffers. The two validation buffers stay zero-filled because the
# validation predictions are currently switched off; only test predictions are collected.
val_preds_arr1c = np.zeros((df_val.shape[0], n_folds))
val_preds_arr2c = np.zeros((df_val.shape[0], n_folds))
test_preds_arrc = np.zeros((df_sub.shape[0], n_folds))

for fld in range(n_folds):
    print("\n\n")
    print(f' ****************************** FOLD: {fld} ******************************')
    # 70% sample of the cleaned frame written to disk earlier in the notebook.
    df = pd.read_csv(f'/kaggle/working/df_clean_fld{fld}.csv')
    # A text that clean() reduced to '' is stored as an empty CSV field and comes back
    # as NaN, which TfidfVectorizer rejects; restore the empty string.
    df['text'] = df['text'].fillna('')
    print(df.shape)

    features = FeatureUnion([
        ("vect3", TfidfVectorizer(min_df= 3, max_df=0.5, analyzer = 'char_wb', ngram_range = (3,5))),

    ])
    pipeline = Pipeline(
        [
            ("features", features),
            ("clf", Ridge()),
        ]
    )
    print("\nTrain:")
    # Train the pipeline
    pipeline.fit(df['text'], df['y'])

    # What are the important features for toxicity
    # get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
    # when it exists and fall back to the old method on older releases.
    fitted_features = pipeline['features']
    if hasattr(fitted_features, 'get_feature_names_out'):
        feature_names = [str(name) for name in fitted_features.get_feature_names_out()]
    else:
        feature_names = fitted_features.get_feature_names()

    print('\nTotal number of features:', len(feature_names) )

    # Features paired with their Ridge coefficient, largest coefficient first.
    feature_wts = sorted(list(zip(feature_names,
                                  np.round(pipeline['clf'].coef_,2) )),
                         key = lambda x:x[1],
                         reverse=True)

    pprint(feature_wts[:30])

    print("\npredict test data ")
    test_preds_arrc[:,fld] = pipeline.predict(df_sub['text'])
    
# Per-fold prediction buffers. The two validation buffers stay zero-filled because the
# validation predictions are currently switched off; only test predictions are collected.
val_preds_arr1_ = np.zeros((df_val.shape[0], n_folds))
val_preds_arr2_ = np.zeros((df_val.shape[0], n_folds))
test_preds_arr_ = np.zeros((df_sub.shape[0], n_folds))

for fld in range(n_folds):
    print("\n\n")
    print(f' ****************************** FOLD: {fld} ******************************')
    # 70% sample of the min-max scaled Ruddit frame written to disk earlier in the notebook.
    df = pd.read_csv(f'/kaggle/working/df2_fld{fld}.csv')
    print(df.shape)

    features = FeatureUnion([
        ("vect3", TfidfVectorizer(min_df= 3, max_df=0.5, analyzer = 'char_wb', ngram_range = (3,5))),

    ])
    pipeline = Pipeline(
        [
            ("features", features),
            ("clf", Ridge()),
        ]
    )
    print("\nTrain:")

    # Train the pipeline
    pipeline.fit(df['text'], df['y'])

    # What are the important features for toxicity
    # get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
    # when it exists and fall back to the old method on older releases.
    fitted_features = pipeline['features']
    if hasattr(fitted_features, 'get_feature_names_out'):
        feature_names = [str(name) for name in fitted_features.get_feature_names_out()]
    else:
        feature_names = fitted_features.get_feature_names()

    print('\nTotal number of features:', len(feature_names) )

    # Features paired with their Ridge coefficient, largest coefficient first.
    feature_wts = sorted(list(zip(feature_names,
                                  np.round(pipeline['clf'].coef_,2) )),
                         key = lambda x:x[1],
                         reverse=True)

    pprint(feature_wts[:30])

    print("\npredict test data ")
    test_preds_arr_[:,fld] = pipeline.predict(df_sub['text'])

# Free the last fold's objects; the prediction buffers are all that is needed from here on.
del df, pipeline, feature_wts
gc.collect()

# Average the per-fold test predictions of the three fold-model families.
p1 = test_preds_arr.mean(axis=1)
p2 = test_preds_arr_.mean(axis=1)
p3 = test_preds_arrc.mean(axis=1)

# Random search for blend weights over the ordinal ranks of those averages.
N_TRY = 300
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

comments['score1'] = rankdata(p1, method='ordinal')
comments['score2'] = rankdata(p2, method='ordinal')
comments['score3'] = rankdata(p3, method='ordinal')

# Only the length of this list is used: one blend weight per score column.
my_list = ['a','b','c']

validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Attach comment ids to both sides of every validation pair by exact text match;
# pairs whose texts are not among the comments to score are dropped (inner joins).
validation_annotations = validation.set_index('more_toxic').join(
    comments[['comment_id', 'text']].set_index('text'), how='inner').set_index('less_toxic').join(
    comments[['comment_id', 'text']].set_index('text'),
    lsuffix='_more', rsuffix='_less', how='inner').reset_index(drop=True)

# The score columns sit right after `comment_id` and `text`, i.e. at positions 2 .. 2+k.
# Pulling them out once turns every trial into a single matrix-vector product.
score_matrix = comments.iloc[:, 2:(2 + len(my_list))].to_numpy(dtype=float)
more_ids = validation_annotations['comment_id_more']
less_ids = validation_annotations['comment_id_less']

best_score = 0
best_w = None
for _ in range(N_TRY):
    # Random non-negative weights, scaled to unit L2 norm.
    w = np.random.rand(len(my_list))
    w = w / np.linalg.norm(w)
    scores = dict(zip(comments['comment_id'], score_matrix @ w))
    # Percentage of validation pairs ranked in the annotated order.
    percent_correct = 100.0 * (
        more_ids.map(scores).gt(less_ids.map(scores)).sum()
        / len(validation_annotations))
    print(percent_correct)
    # The first trial is always accepted so that best_w is a real weight vector.
    if best_w is None or percent_correct > best_score:
        best_score = percent_correct
        best_w = w


print('----------')
print('----------')
print('----------')

print(best_w)
print(best_score)

# Blend with the best weights found, not with `w` (the last random draw of the loop).
comments['score'] = score_matrix @ best_w

df_sub = comments[['comment_id', 'score']].copy()
df_sub

# Random search for blend weights over the four "b" prediction frames.
# NOTE(review): df_test0b and df_test0000b are built outside this section -- confirm
# that their rows are in the same order as comments_to_score.csv.
N_TRY = 400
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

comments['score1'] = df_test0b['score']
comments['score2'] = df_test00b['score']
comments['score3'] = df_test000b['score']
comments['score4'] = df_test0000b['score']

# Only the length of this list is used: one blend weight per score column.
my_list = ['a','b','c','d']

validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Attach comment ids to both sides of every validation pair by exact text match;
# pairs whose texts are not among the comments to score are dropped (inner joins).
validation_annotations = validation.set_index('more_toxic').join(
    comments[['comment_id', 'text']].set_index('text'), how='inner').set_index('less_toxic').join(
    comments[['comment_id', 'text']].set_index('text'),
    lsuffix='_more', rsuffix='_less', how='inner').reset_index(drop=True)

# The score columns sit right after `comment_id` and `text`, i.e. at positions 2 .. 2+k.
# Pulling them out once turns every trial into a single matrix-vector product.
score_matrix = comments.iloc[:, 2:(2 + len(my_list))].to_numpy(dtype=float)
more_ids = validation_annotations['comment_id_more']
less_ids = validation_annotations['comment_id_less']

best_score = 0
best_w = None
for _ in range(N_TRY):
    # Random non-negative weights, scaled to unit L2 norm.
    w = np.random.rand(len(my_list))
    w = w / np.linalg.norm(w)
    scores = dict(zip(comments['comment_id'], score_matrix @ w))
    # Percentage of validation pairs ranked in the annotated order.
    percent_correct = 100.0 * (
        more_ids.map(scores).gt(less_ids.map(scores)).sum()
        / len(validation_annotations))
    print(percent_correct)
    # The first trial is always accepted so that best_w is a real weight vector.
    if best_w is None or percent_correct > best_score:
        best_score = percent_correct
        best_w = w


print('----------')
print('----------')
print('----------')

print(best_w)
print(best_score)

# Blend with the best weights found, not with `w` (the last random draw of the loop).
comments['score'] = score_matrix @ best_w

comments_0b = comments[['comment_id', 'score']].copy()
comments_0b

# Random search for blend weights over five prediction frames.
# NOTE(review): df_test0 and df_test0000 are built outside this section -- confirm
# that their rows are in the same order as comments_to_score.csv.
N_TRY = 500
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

comments['score1'] = df_test0['score']
comments['score2'] = df_test00['score']
comments['score3'] = df_test000['score']
comments['score4'] = df_test0000['score']
comments['score5'] = df_sub['score']

# Only the length of this list is used: one blend weight per score column.
# (The fifth label used to be a second 'd'; labels are now distinct.)
my_list = ['a','b','c','d','e']

validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Attach comment ids to both sides of every validation pair by exact text match;
# pairs whose texts are not among the comments to score are dropped (inner joins).
validation_annotations = validation.set_index('more_toxic').join(
    comments[['comment_id', 'text']].set_index('text'), how='inner').set_index('less_toxic').join(
    comments[['comment_id', 'text']].set_index('text'),
    lsuffix='_more', rsuffix='_less', how='inner').reset_index(drop=True)

# The score columns sit right after `comment_id` and `text`, i.e. at positions 2 .. 2+k.
# Pulling them out once turns every trial into a single matrix-vector product.
score_matrix = comments.iloc[:, 2:(2 + len(my_list))].to_numpy(dtype=float)
more_ids = validation_annotations['comment_id_more']
less_ids = validation_annotations['comment_id_less']

best_score = 0
best_w = None
for _ in range(N_TRY):
    # Random non-negative weights, scaled to unit L2 norm.
    w = np.random.rand(len(my_list))
    w = w / np.linalg.norm(w)
    scores = dict(zip(comments['comment_id'], score_matrix @ w))
    # Percentage of validation pairs ranked in the annotated order.
    percent_correct = 100.0 * (
        more_ids.map(scores).gt(less_ids.map(scores)).sum()
        / len(validation_annotations))
    print(percent_correct)
    # The first trial is always accepted so that best_w is a real weight vector.
    if best_w is None or percent_correct > best_score:
        best_score = percent_correct
        best_w = w


print('----------')
print('----------')
print('----------')

print(best_w)
print(best_score)

# Blend with the best weights found, not with `w` (the last random draw of the loop).
comments['score'] = score_matrix @ best_w

comments_0 = comments[['comment_id', 'score']].copy()
comments_0

# Random search for the weights that combine the two second-level blends.
N_TRY = 200
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

comments['score1'] = comments_0['score']
comments['score2'] = comments_0b['score']

# Only the length of this list is used: one blend weight per score column.
my_list = ['a','b']

validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Attach comment ids to both sides of every validation pair by exact text match;
# pairs whose texts are not among the comments to score are dropped (inner joins).
validation_annotations = validation.set_index('more_toxic').join(
    comments[['comment_id', 'text']].set_index('text'), how='inner').set_index('less_toxic').join(
    comments[['comment_id', 'text']].set_index('text'),
    lsuffix='_more', rsuffix='_less', how='inner').reset_index(drop=True)

# The score columns sit right after `comment_id` and `text`, i.e. at positions 2 .. 2+k.
# Pulling them out once turns every trial into a single matrix-vector product.
score_matrix = comments.iloc[:, 2:(2 + len(my_list))].to_numpy(dtype=float)
more_ids = validation_annotations['comment_id_more']
less_ids = validation_annotations['comment_id_less']

best_score = 0
best_w = None
for _ in range(N_TRY):
    # Random non-negative weights, scaled to unit L2 norm.
    w = np.random.rand(len(my_list))
    w = w / np.linalg.norm(w)
    scores = dict(zip(comments['comment_id'], score_matrix @ w))
    # Percentage of validation pairs ranked in the annotated order.
    percent_correct = 100.0 * (
        more_ids.map(scores).gt(less_ids.map(scores)).sum()
        / len(validation_annotations))
    print(percent_correct)
    # The first trial is always accepted so that best_w is a real weight vector.
    if best_w is None or percent_correct > best_score:
        best_score = percent_correct
        best_w = w


print('----------')
print('----------')
print('----------')

print(best_w)
print(best_score)

# Blend with the best weights found, not with `w` (the last random draw of the loop).
comments['score'] = score_matrix @ best_w

ruddit_hate = comments[['comment_id', 'score']].copy()
ruddit_hate

In [None]:
# Random search for the weights that combine the ranked scores of four model outputs.
# NOTE(review): comments1, comments2 and comments3 are built outside this section; the
# labels in my_list suggest they hold the Detoxify model outputs -- confirm.
N_TRY = 400

comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

comments['score1'] = comments1['score'].rank()
comments['score2'] = comments2['score'].rank()
comments['score3'] = comments3['score'].rank()
comments['score4'] = ruddit_hate['score'].rank()

# Only the length of this list is used: one blend weight per score column.
my_list = ['original','unbiased','multilingual','ruddit']

validation = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

# Attach comment ids to both sides of every validation pair by exact text match;
# pairs whose texts are not among the comments to score are dropped (inner joins).
validation_annotations = validation.set_index('more_toxic').join(
    comments[['comment_id', 'text']].set_index('text'), how='inner').set_index('less_toxic').join(
    comments[['comment_id', 'text']].set_index('text'),
    lsuffix='_more', rsuffix='_less', how='inner').reset_index(drop=True)

# The score columns sit right after `comment_id` and `text`, i.e. at positions 2 .. 2+k.
# Pulling them out once turns every trial into a single matrix-vector product.
score_matrix = comments.iloc[:, 2:(2 + len(my_list))].to_numpy(dtype=float)
more_ids = validation_annotations['comment_id_more']
less_ids = validation_annotations['comment_id_less']

best_score = 0
best_w = None
for _ in range(N_TRY):
    # Random non-negative weights, scaled to unit L2 norm.
    w = np.random.rand(len(my_list))
    w = w / np.linalg.norm(w)
    scores = dict(zip(comments['comment_id'], score_matrix @ w))
    # Percentage of validation pairs ranked in the annotated order.
    percent_correct = 100.0 * (
        more_ids.map(scores).gt(less_ids.map(scores)).sum()
        / len(validation_annotations))
    print(percent_correct)
    # The first trial is always accepted so that best_w is a real weight vector.
    if best_w is None or percent_correct > best_score:
        best_score = percent_correct
        best_w = w


print('----------')
print('----------')
print('----------')

print(best_w)
print(best_score)

# Blend with the best weights found, not with `w` (the last random draw of the loop).
comments['score'] = score_matrix @ best_w


In [None]:
# Final score: 3/4 weight on the rank of the blend above, 1/4 on the rank of the
# `comments_luke` scores (a frame built elsewhere in the notebook).
blend_rank = comments['score'].rank()
luke_rank = comments_luke['score'].rank()
comments['score'] = 0.75 * blend_rank + 0.25 * luke_rank

In [None]:
# If any scores are tied, shuffle the rows with a fixed seed and re-rank with
# method='first', so ties are broken by the shuffled row order and every score is distinct.
scores_are_unique = len(comments) == comments['score'].nunique()
if not scores_are_unique:
    print(scores_are_unique)
    comments = comments.sample(n=len(comments),random_state=0)
    comments['score'] = comments['score'].rank(method='first')

# Report whether every comment now has a distinct score.
print(len(comments) == comments['score'].nunique())

In [None]:
# Write the two submission columns to disk, then show them as the cell output.
submission = comments[['comment_id', 'score']]
submission.to_csv('submission.csv', index=False)
submission