**Private Score 0.80048**

**Public Score 0.86644**

In [None]:
!pip install datasets

In [None]:
!pip install ../input/sklearn01/scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sb
import sentencepiece

In [None]:
# Build an in-memory {token: vector} lookup from the GloVe text file.
# The file is opened in binary mode, so each line is split as bytes and only
# the leading token is decoded to str.
encoding = "utf-8"
glove_big = {}
with open("../input/glove-840b-300d/glove.840B.300d.txt", "rb") as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        token = fields[0].decode(encoding)
        # Everything after the token is the embedding, stored as float32.
        glove_big[token] = np.array(fields[1:], dtype=np.float32)

class MeanEmbeddingVectorizer(object):
    """Represent each document as the mean of its tokens' embedding vectors.

    Parameters
    ----------
    word2vec : mapping
        Maps a token (str) to its embedding vector. All vectors are expected
        to share one length, which is read from the first entry.

    Attributes
    ----------
    dim : int
        Embedding length, or 0 when ``word2vec`` is empty.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        if len(word2vec) > 0:
            # Read the dimensionality from the mapping that was passed in.
            # (Previously this looked up a key of the module-level
            # ``glove_big`` dict, which fails for any other mapping.)
            self.dim = len(word2vec[next(iter(word2vec))])
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """No-op; present for scikit-learn style ``fit``/``transform`` usage.

        ``y`` is accepted but ignored. Returns ``self``.
        """
        return self

    def transform(self, X):
        """Return one mean embedding per document as a 2-D array.

        Each element of ``X`` is either an iterable of tokens or a plain
        string. Strings are split on whitespace first; iterating a raw string
        would otherwise yield single characters and average character
        vectors instead of word vectors. Tokens missing from ``word2vec`` are
        skipped, and a document with no known token maps to a zero vector of
        length ``dim``.
        """
        lookup = self.word2vec
        rows = []
        for words in X:
            if isinstance(words, str):
                words = words.split()
            known = [lookup[w] for w in words if w in lookup]
            # An empty list is falsy, so fall back to a single zero vector.
            rows.append(np.mean(known or [np.zeros(self.dim)], axis=0))
        return np.array(rows)

In [None]:
import numpy as np
import torch
import transformers

from transformers import RobertaTokenizer


class BertSequenceVectorizer:
    """Embed a sentence as the first-token hidden state of a RoBERTa encoder.

    Despite the class name and the default ``model_name``, both the tokenizer
    and the encoder are loaded through the RoBERTa classes, so ``model_name``
    should point at a RoBERTa-compatible checkpoint.

    Parameters
    ----------
    model_name : str
        Name or path handed to ``from_pretrained`` for tokenizer and model.
    max_len : int
        Fixed sequence length; shorter inputs are padded, longer ones cut.
    """

    def __init__(self, model_name="bert-base-uncased", max_len=128):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = model_name
        self.tokenizer = RobertaTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.RobertaModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.array:
        """Return the hidden state at sequence position 0 as a 1-D NumPy array.

        The sentence is encoded to ids, then truncated or right-padded to
        exactly ``max_len`` ids with a matching attention mask.
        """
        inp = self.tokenizer.encode(sentence)
        len_inp = len(inp)

        if len_inp >= self.max_len:
            # Plain truncation: whatever ids follow position max_len are dropped.
            inputs = inp[:self.max_len]
            masks = [1] * self.max_len
        else:
            pad = self.max_len - len_inp
            # Padded positions carry id 0 and mask 0.
            # NOTE(review): 0 may not be this tokenizer's pad id; the
            # positions are masked out either way - confirm if it matters.
            inputs = inp + [0] * pad
            masks = [1] * len_inp + [0] * pad

        inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)

        # Pure inference: skip autograd bookkeeping so no graph is built and
        # held for every sentence.
        with torch.no_grad():
            bert_out = self.bert_model(inputs_tensor, masks_tensor)
        seq_out = bert_out['last_hidden_state']

        # Position 0 is the [CLS]-style sentence token (768-dim sentence
        # feature per the original note). .cpu() is a no-op on CPU tensors,
        # so one return path serves both devices.
        return seq_out[0][0].detach().cpu().numpy()
        

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVR
from sklearn.feature_extraction.text import TfidfVectorizer,HashingVectorizer,CountVectorizer
import scipy
from scipy.stats import rankdata
import nltk

# --- Submission 1, model set A -------------------------------------------
# Features: transformer first-token embedding + character n-gram TF-IDF,
# trained on the regression-style Jigsaw data.

# English stop-word list; kept as a module-level name for any later code that
# still refers to it.
stop_words = nltk.corpus.stopwords.words('english')

jr = pd.read_csv("../input/jigsaw-regression-based-data/train_data_version2.csv")
df = jr[['text', 'y']]
df = df.drop_duplicates()

# One embedding per training text (each vectorize() result flattened to 1-D).
BSV = BertSequenceVectorizer(model_name="../input/unbiasedtoxicroberta/unbiased-toxic-roberta", max_len=512)
X1 = np.stack(df["text"].map(lambda x: BSV.vectorize(x).reshape(-1)).values)

# Character n-grams (2-5) inside word boundaries. The former
# `stop_words=stop_words` argument is dropped: scikit-learn only applies stop
# words when analyzer == 'word', so with 'char_wb' it had no effect on the
# features and only triggered an "unused parameter" warning.
vec2 = TfidfVectorizer(analyzer='char_wb', max_df=0.7, min_df=1, ngram_range=(2, 5))
X2 = vec2.fit_transform(df['text'])

# Dense embedding columns first, sparse TF-IDF columns after.
X = scipy.sparse.hstack([X1, X2])
del X2

# Targets rounded to two decimals before fitting.
z = df["y"].values
y = np.around(z, decimals=2)

model1 = LinearSVR(max_iter=1000000)
model1.fit(X, y)
model2 = Ridge(alpha=0.5)
model2.fit(X, y)
model3 = Ridge(alpha=1.0)
model3.fit(X, y)

# Pairwise validation of model set A. Each row of the validation file has a
# 'less_toxic' and a 'more_toxic' text; a pair counts as correct when the
# model scores the less_toxic text strictly lower than the more_toxic one.
VALID_DATA_PATH = "/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv"
df_valid2 = pd.read_csv(VALID_DATA_PATH)
# Transformer embeddings for both sides. less_toxic1 / more_toxic1 are kept
# (not deleted below) because the second validation pass reuses them.
less_toxic1 = np.stack(df_valid2['less_toxic'].map(lambda x: BSV.vectorize(x).reshape(-1)).values)
more_toxic1 = np.stack(df_valid2['more_toxic'].map(lambda x: BSV.vectorize(x).reshape(-1)).values)
# TF-IDF features from the vectorizer currently bound to vec2.
less_toxic2 = vec2.transform(df_valid2['less_toxic'])
more_toxic2 = vec2.transform(df_valid2['more_toxic'])
# Same column order as the training matrix: embedding, then TF-IDF.
less_toxic = scipy.sparse.hstack([less_toxic1, less_toxic2])
more_toxic = scipy.sparse.hstack([more_toxic1, more_toxic2])
del less_toxic2,more_toxic2
# Fraction of correctly ordered pairs, printed once per model.
y_pred_less = model1.predict(less_toxic)
y_pred_more = model1.predict(more_toxic)
print(f'val : {(y_pred_less < y_pred_more).mean()}')
y_pred_less2 = model2.predict(less_toxic)
y_pred_more2 = model2.predict(more_toxic)
print(f'val : {(y_pred_less2 < y_pred_more2).mean()}')
y_pred_less3 = model3.predict(less_toxic)
y_pred_more3 = model3.predict(more_toxic)
print(f'val : {(y_pred_less3 < y_pred_more3).mean()}')

# Score the competition comments with model set A.
df_test = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
# test1 (transformer embeddings) is kept after this section; the later
# Ruddit-based scoring stacks it with a different TF-IDF matrix.
test1 = np.stack(df_test['text'].map(lambda x: BSV.vectorize(x).reshape(-1)).values)
test2 = vec2.transform(df_test['text'])
test = scipy.sparse.hstack([test1, test2])
del test2
# Each model's predictions are converted to ordinal ranks (ties broken by
# position) so the three models are averaged on a common scale.
jr_preds=model1.predict(test)
df_test['score1']=rankdata( jr_preds, method='ordinal') 
jr_preds11=model2.predict(test)
df_test['score11']=rankdata( jr_preds11, method='ordinal') 
jr_preds111=model3.predict(test)
df_test['score111']=rankdata( jr_preds111, method='ordinal') 
# score1 is overwritten with the mean rank of the three models.
df_test['score1']=(df_test['score1']+df_test['score11']+df_test['score111'])/3

# --- Submission 1, model set B -------------------------------------------
# Same feature recipe (transformer embedding + char TF-IDF), trained on the
# Ruddit offensiveness scores. Rebinds df, X, vec2, y and model1..model3.
rud_df = pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")

# Mark placeholder bodies as missing so the dropna below discards them
# (dropna(how="any") also drops rows with a NaN in any other column).
rud_df.loc[((rud_df["txt"]=="[deleted]")|(rud_df["txt"]=="[removed]")),"txt"] = np.nan
rud_df = rud_df.dropna(how="any")
rud_df['y'] = rud_df["offensiveness_score"]
df = rud_df[['txt', 'y']].rename(columns={'txt': 'text'})

X = np.stack(df["text"].map(lambda x: BSV.vectorize(x).reshape(-1)).values)

# Character n-grams (3-4), keeping only n-grams seen in at least 3 documents.
# The former `stop_words=stop_words` argument is dropped: scikit-learn only
# applies stop words when analyzer == 'word', so with 'char_wb' it had no
# effect on the features and only triggered an "unused parameter" warning.
vec2 = TfidfVectorizer(analyzer='char_wb', max_df=0.7, min_df=3, ngram_range=(3, 4))
X2 = vec2.fit_transform(df['text'])
X = scipy.sparse.hstack([X, X2])
del X2

# Targets rounded to one decimal before fitting.
z = df["y"].values
y = np.around(z, decimals=1)

model1 = Ridge(alpha=0.1)
model1.fit(X, y)
model2 = Ridge(alpha=0.5)
model2.fit(X, y)
model3 = LinearSVR(max_iter=1000000)
model3.fit(X, y)

# Pairwise validation of model set B (the models and vec2 bound at this point).
VALID_DATA_PATH = "/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv"
df_valid2 = pd.read_csv(VALID_DATA_PATH)
# Only the TF-IDF part is recomputed; the transformer embeddings
# less_toxic1 / more_toxic1 computed in the first validation pass are reused.
less_toxic2 = vec2.transform(df_valid2['less_toxic'])
more_toxic2 = vec2.transform(df_valid2['more_toxic'])
less_toxic = scipy.sparse.hstack([less_toxic1, less_toxic2])
more_toxic = scipy.sparse.hstack([more_toxic1, more_toxic2])
del less_toxic2,more_toxic2
# Fraction of pairs where less_toxic is scored strictly below more_toxic.
y_pred_less = model1.predict(less_toxic)
y_pred_more = model1.predict(more_toxic)
print(f'val : {(y_pred_less < y_pred_more).mean()}')
y_pred_less2 = model2.predict(less_toxic)
y_pred_more2 = model2.predict(more_toxic)
print(f'val : {(y_pred_less2 < y_pred_more2).mean()}')
y_pred_less3 = model3.predict(less_toxic)
y_pred_more3 = model3.predict(more_toxic)
print(f'val : {(y_pred_less3 < y_pred_more3).mean()}')

# Score the competition comments with model set B, reusing the test1
# embeddings computed earlier and the currently bound vec2 / models.
test2 = vec2.transform(df_test['text'])
test = scipy.sparse.hstack([test1, test2])
del test2
# Ordinal rank per model, then the mean rank of the three models in score2.
rud_preds=model1.predict(test)
df_test['score2']=rankdata( rud_preds, method='ordinal')
rud_preds22=model2.predict(test)
df_test['score22']=rankdata( rud_preds22, method='ordinal')
rud_preds222=model3.predict(test)
df_test['score222']=rankdata( rud_preds222, method='ordinal')
df_test['score2']=(df_test['score2']+df_test['score22']+df_test['score222'])/3
# Combine both model sets (equal weight on their mean ranks), re-rank the sum
# and write the first intermediate submission.
df_test['score']=df_test['score1']+df_test['score2']
df_test['score']=rankdata( df_test['score'], method='ordinal')
df_test[['comment_id', 'score']].to_csv("submission1.csv", index=False)

[Sub2 : 0.860](https://www.kaggle.com/coldfir3/tokenizer-training-tfidf-ridge-lb-0-860)

Credit : [Tokenizer training + TFIDF + RIDGE [LB 0.860]](https://www.kaggle.com/coldfir3/tokenizer-training-tfidf-ridge-lb-0-860) by [Adriano Passos](https://www.kaggle.com/coldfir3)

In [None]:
import numpy as np
import pandas as pd
import nltk
import re
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

# --- Submission 2: data preparation ---------------------------------------
TRAIN_DATA_PATH = "/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv"
VALID_DATA_PATH = "/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv"
TEST_DATA_PATH = "/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv"

df_train2 = pd.read_csv(TRAIN_DATA_PATH)
df_valid2 = pd.read_csv(VALID_DATA_PATH)
df_test2 = pd.read_csv(TEST_DATA_PATH)
# Per-label multipliers used to turn the label columns into a severity score.
cat_mtpl = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5, 
            'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}

# Scale every label column in place by its multiplier.
for category in cat_mtpl:
    df_train2[category] = df_train2[category] * cat_mtpl[category]

# Target = row-wise mean over the columns from 'toxic' through 'identity_hate'
# (label-based slice, both ends included).
df_train2['score'] = df_train2.loc[:, 'toxic':'identity_hate'].mean(axis=1)
df_train2['y'] = df_train2['score']

# Balance the set: keep every row with y > 0 and sample the same number of
# y == 0 rows.
min_len = (df_train2['y'] > 0).sum()  # len of toxic comments
df_y0_undersample = df_train2[df_train2['y'] == 0].sample(n=min_len, random_state=41)  # take non toxic comments
df_train_new = pd.concat([df_train2[df_train2['y'] > 0], df_y0_undersample])  # make new df
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

# Untrained WordPiece tokenizer with a lower-casing BERT normalizer and the
# BERT pre-tokenizer; the vocabulary is learned later from the training comments.
raw_tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
raw_tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
raw_tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
# Trainer configured for a 25,000-entry vocabulary that includes the special tokens.
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)
from datasets import Dataset

# Wrap just the comment text column so it can be sliced in batches.
dataset = Dataset.from_pandas(df_train_new[['comment_text']])

def get_training_corpus():
    """Yield the ``comment_text`` values of ``dataset`` in batches of 1000.

    Reads the module-level ``dataset``; the last batch may be shorter.
    """
    batch_size = 1000
    start = 0
    total = len(dataset)
    while start < total:
        yield dataset[start : start + batch_size]["comment_text"]
        start += batch_size

# Learn the WordPiece vocabulary from the batched training comments.
raw_tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

from transformers import PreTrainedTokenizerFast

# Wrap the trained tokenizer so it can be called on a list of texts and
# return 'input_ids'; the special-token names match those given to the trainer.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

def dummy_fun(doc):
    """Return ``doc`` unchanged.

    Passed to ``TfidfVectorizer`` as both ``tokenizer`` and ``preprocessor``
    so that documents which are already token sequences go through as they are.
    """
    return doc

# --- Submission 2: TF-IDF over learned WordPiece ids + Ridge ---------------
labels = df_train_new['y']
comments = df_train_new['comment_text']
# Each comment becomes a list of token ids from the trained tokenizer.
tokenized_comments = tokenizer(comments.to_list())['input_ids']

# The documents are already tokenized, so tokenizer and preprocessor are the
# identity function and the default token_pattern is disabled.
vectorizer = TfidfVectorizer(
    analyzer = 'word',
    tokenizer = dummy_fun,
    preprocessor = dummy_fun,
    token_pattern = None)

comments_tr = vectorizer.fit_transform(tokenized_comments)

# Three Ridge regressors that differ only in regularization strength.
regressor = Ridge(random_state=42, alpha=0.8)
regressor.fit(comments_tr, labels)
regressor2 = Ridge(random_state=42, alpha=0.6)
regressor2.fit(comments_tr, labels)
regressor3 = Ridge(random_state=42, alpha=1.0)
regressor3.fit(comments_tr, labels)

# Pairwise validation: tokenize and vectorize both sides of every pair with
# the fitted tokenizer / vectorizer.
less_toxic_comments = df_valid2['less_toxic']
more_toxic_comments = df_valid2['more_toxic']

less_toxic_comments = tokenizer(less_toxic_comments.to_list())['input_ids']
more_toxic_comments = tokenizer(more_toxic_comments.to_list())['input_ids']

less_toxic = vectorizer.transform(less_toxic_comments)
more_toxic = vectorizer.transform(more_toxic_comments)

# make predictions
y_pred_less = regressor.predict(less_toxic)
y_pred_more = regressor.predict(more_toxic)
y_pred_less2 = regressor2.predict(less_toxic)
y_pred_more2 = regressor2.predict(more_toxic)
y_pred_less3 = regressor3.predict(less_toxic)
y_pred_more3 = regressor3.predict(more_toxic)

# Fraction of pairs where less_toxic is scored strictly below more_toxic,
# one line per regressor.
print(f'val : {(y_pred_less < y_pred_more).mean()}')
print(f'val : {(y_pred_less2 < y_pred_more2).mean()}')
print(f'val : {(y_pred_less3 < y_pred_more3).mean()}')
# Score the competition comments with the three Ridge regressors and write
# the mean raw prediction as the second intermediate submission.
texts = df_test2['text']
texts = tokenizer(texts.to_list())['input_ids']
texts = vectorizer.transform(texts)

df_test2['prediction'] = regressor.predict(texts)
df_test2['prediction2'] = regressor2.predict(texts)
df_test2['prediction3'] = regressor3.predict(texts)

# Take an explicit copy of the column subset: assigning a new column to the
# bare selection is the pattern pandas flags with SettingWithCopyWarning.
df_test2 = df_test2[['comment_id','prediction','prediction2','prediction3']].copy()

df_test2['score'] = (df_test2['prediction'] + df_test2['prediction2'] + df_test2['prediction3'])/3
df_test2 = df_test2[['comment_id','score']]

df_test2.to_csv('./submission2.csv', index=False)

[Sub3 : 0.858](https://www.kaggle.com/tenffe/rapids-tfidf-linear-model-ensemble/notebook)

Credit : [[RAPIDS] TFIDF_linear_model_ensemble](https://www.kaggle.com/tenffe/rapids-tfidf-linear-model-ensemble/notebook) by [zhangxin](https://www.kaggle.com/tenffe)

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer

import re 
import scipy
from scipy import sparse

from IPython.display import display
from pprint import pprint
from matplotlib import pyplot as plt 

import time
import scipy.optimize as optimize
import warnings
# Suppress all Python warnings from here on and widen the pandas display
# limits (column text width and number of columns shown).
warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100

from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Ridge, Lasso, BayesianRidge
from sklearn.svm import SVR

# --- Submission 3: data preparation ---------------------------------------
df_train = pd.read_csv("../input/jigsawtoxicdataset05/jigsaw-toxic-comment-train.csv")
# Alternative training file, currently disabled:
# df_train = pd.read_csv("/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv")
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

# Per-label multipliers used to turn the label columns into a severity score.
cat_mtpl = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5, 
            'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}

# Scale every label column in place by its multiplier.
for category in cat_mtpl:
    df_train[category] = df_train[category] * cat_mtpl[category]

# Target = row-wise SUM over the columns from 'toxic' through 'identity_hate'
# (label-based slice, both ends included).
df_train['score'] = df_train.loc[:, 'toxic':'identity_hate'].sum(axis=1)

df_train['y'] = df_train['score']

# Keep every row with y > 0 and draw the same number of y == 0 rows.
# The draw is WITH replacement (replace=True), so a zero-score row can
# appear more than once.
min_len = (df_train['y'] > 0).sum()  # len of toxic comments
df_y0_undersample = df_train[df_train['y'] == 0].sample(n=min_len, random_state=201, replace=True)  # take non toxic comments
df_train_new = pd.concat([df_train[df_train['y'] > 0], df_y0_undersample])  # make new df
# Downstream code expects the text column to be called 'text'.
df_train = df_train_new.rename(columns={'comment_text':'text'})

def text_cleaning(text):
    """Reduce a raw comment to space-separated ASCII letters and digits.

    Steps, in order:
    1. Delete URLs (``http(s)://...`` and ``www....``).
    2. Parse the remainder as HTML and keep only its text content.
    3. Delete characters in the listed emoji / symbol code-point ranges.
    4. Replace every character that is not ``a-z``, ``A-Z`` or a digit
       with a space.
    5. Collapse runs of spaces and trim both ends.

    text - Text piece to be cleaned.
    Returns the cleaned string.
    """
    # 1) URLs are removed outright (no space is left in their place).
    without_urls = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # 2) Let the HTML parser drop tags and return only the text nodes.
    plain = BeautifulSoup(without_urls, 'lxml').get_text()

    # 3) Emoji / pictograph ranges.
    # NOTE(review): the last range (U+24C2 to U+1F251) is very wide and also
    # covers many non-emoji characters.
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
    )
    plain = re.sub("[" + emoji_ranges + "]+", r'', plain, flags=re.UNICODE)

    # A long list of experimental normalization rules used to sit here as
    # commented-out code (respelling of obfuscated toxic words, contraction
    # expansion, unicode-escape stripping, "@user" replacement). None of it
    # was active, so it is summarized here rather than kept inline.

    # 4) Anything that is not a letter or digit becomes a space ...
    alnum_only = re.sub(r"[^a-zA-Z\d]", " ", plain)
    # 5) ... then repeated spaces are squeezed and the ends trimmed.
    return re.sub(' +', ' ', alnum_only).strip()

# --- Submission 3: char TF-IDF + mean GloVe embedding ----------------------
# Register progress_apply on pandas objects, then clean the training text.
tqdm.pandas()
df_train['text'] = df_train['text'].progress_apply(text_cleaning)
df = df_train.copy()
# Not the last expression of the cell, so the result is computed and discarded.
df['y'].value_counts(normalize=True)
# Second re-balancing: keep rows with y >= 0.1 and draw twice as many
# y == 0 rows, with replacement.
min_len = (df['y'] >= 0.1).sum()
df_y0_undersample = df[df['y'] == 0].sample(n=min_len * 2, random_state=402, replace=True)
df = pd.concat([df[df['y'] >= 0.1], df_y0_undersample])
# Character n-grams (2-5) inside word boundaries.
vec = TfidfVectorizer(analyzer='char_wb', max_df=0.7, min_df=1, ngram_range=(2, 5))
X = vec.fit_transform(df['text'])
# MeanEmbeddingVectorizer and glove_big are defined in an earlier cell.
# NOTE(review): df['text'] holds cleaned strings, so the vectorizer receives
# raw strings rather than pre-tokenized word lists.
vec2 = MeanEmbeddingVectorizer(glove_big)
vec2.fit(df['text'],df['y'])
X2 = vec2.transform(df['text'])
# Sparse TF-IDF columns first, dense embedding columns after.
X = scipy.sparse.hstack([X, X2])
del X2
# LinearSVR is not imported in this cell; it relies on the earlier
# `from sklearn.svm import LinearSVR`.
model = LinearSVR(max_iter=1000000)
model.fit(X, df['y'])
l_model = Ridge(alpha=1.0)
l_model.fit(X, df['y'])
s_model = Ridge(alpha=2.)
s_model.fit(X, df['y'])
# Pairwise validation: clean both sides of every pair exactly like the
# training text, then build the same [TF-IDF | embedding] feature layout.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
tqdm.pandas()
df_val['less_toxic'] = df_val['less_toxic'].progress_apply(text_cleaning)
df_val['more_toxic'] = df_val['more_toxic'].progress_apply(text_cleaning)
X_less_toxic = vec.transform(df_val['less_toxic'])
X_more_toxic = vec.transform(df_val['more_toxic'])
X_less_toxic2 = vec2.transform(df_val['less_toxic'])
X_more_toxic2 = vec2.transform(df_val['more_toxic'])
X_less_toxic = scipy.sparse.hstack([X_less_toxic, X_less_toxic2])
X_more_toxic = scipy.sparse.hstack([X_more_toxic, X_more_toxic2])
del X_less_toxic2,X_more_toxic2
# Fraction of pairs where less_toxic is scored strictly below more_toxic,
# printed for model, l_model and s_model in turn.
p1 = model.predict(X_less_toxic)
p2 = model.predict(X_more_toxic)
print(f'val : {(p1 < p2).mean()}')
p3 = l_model.predict(X_less_toxic)
p4 = l_model.predict(X_more_toxic)
print(f'val : {(p3 < p4).mean()}')
p5 = s_model.predict(X_less_toxic)
p6 = s_model.predict(X_more_toxic)
print(f'val : {(p5 < p6).mean()}')
# Score the competition comments: re-read the file, clean the text and build
# the [TF-IDF | embedding] features with the fitted vectorizers.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
tqdm.pandas()
df_sub['text'] = df_sub['text'].progress_apply(text_cleaning)
X_test = vec.transform(df_sub['text'])
X_test2 = vec2.transform(df_sub['text'])
X_test = scipy.sparse.hstack([X_test, X_test2])
del X_test2
# p3..p5 are rebound here to test predictions (they held validation
# predictions above); the final score is the plain mean of the three models.
p3 = model.predict(X_test)
p4 = l_model.predict(X_test)
p5 = s_model.predict(X_test)
df_sub['score'] = (p3 + p4 + p5) / 3.
# Self-assignment: leaves the column unchanged.
df_sub['score'] = df_sub['score']
df_sub[['comment_id', 'score']].to_csv("submission3.csv", index=False)

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import re 
import scipy
from scipy import sparse

from IPython.display import display
from pprint import pprint
from matplotlib import pyplot as plt 

import time
import scipy.optimize as optimize
import warnings
# Suppress all Python warnings from here on and widen the pandas display
# limits (column text width and number of columns shown).
warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100

from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Ridge, Lasso, BayesianRidge
from sklearn.svm import SVR

# --- Submission 33: data loading -------------------------------------------
# Training data already carries 'text' and a numeric 'y' target (both columns
# are used below); exact duplicate rows are removed.
df_train = pd.read_csv("../input/jigsaw-regression-based-data/train_data_version2.csv")
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df_train = df_train.drop_duplicates()
def text_cleaning(text):
    """Reduce a raw comment to space-separated ASCII letters and digits.

    Steps, in order:
    1. Delete URLs (``http(s)://...`` and ``www....``).
    2. Parse the remainder as HTML and keep only its text content.
    3. Delete characters in the listed emoji / symbol code-point ranges.
    4. Replace every character that is not ``a-z``, ``A-Z`` or a digit
       with a space.
    5. Collapse runs of spaces and trim both ends.

    text - Text piece to be cleaned.
    Returns the cleaned string.
    """
    # 1) URLs are removed outright (no space is left in their place).
    without_urls = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # 2) Let the HTML parser drop tags and return only the text nodes.
    plain = BeautifulSoup(without_urls, 'lxml').get_text()

    # 3) Emoji / pictograph ranges.
    # NOTE(review): the last range (U+24C2 to U+1F251) is very wide and also
    # covers many non-emoji characters.
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
    )
    plain = re.sub("[" + emoji_ranges + "]+", r'', plain, flags=re.UNICODE)

    # A long list of experimental normalization rules used to sit here as
    # commented-out code (respelling of obfuscated toxic words, contraction
    # expansion, unicode-escape stripping, "@user" replacement). None of it
    # was active, so it is summarized here rather than kept inline.

    # 4) Anything that is not a letter or digit becomes a space ...
    alnum_only = re.sub(r"[^a-zA-Z\d]", " ", plain)
    # 5) ... then repeated spaces are squeezed and the ends trimmed.
    return re.sub(' +', ' ', alnum_only).strip()

# --- Submission 33: char TF-IDF + char hashing features --------------------
# Register progress_apply on pandas objects, then clean the training text.
tqdm.pandas()
df_train['text'] = df_train['text'].progress_apply(text_cleaning)
df = df_train.copy()
# Not the last expression of the cell, so the result is computed and discarded.
df['y'].value_counts(normalize=True)
# Character n-grams (2-5) inside word boundaries, TF-IDF weighted.
vec = TfidfVectorizer(analyzer='char_wb', max_df=0.7, min_df=1, ngram_range=(2, 5))
X = vec.fit_transform(df['text'])
# Same n-gram definition, but hashed into a fixed feature space
# (vec2 is rebound here to a HashingVectorizer).
vec2 = HashingVectorizer(analyzer='char_wb', ngram_range=(2, 5))
X2 = vec2.fit_transform(df['text'])
X = scipy.sparse.hstack([X, X2])
del X2
# LinearSVR is not imported in this cell; it relies on the earlier
# `from sklearn.svm import LinearSVR`.
model = LinearSVR(max_iter=1000000)
model.fit(X, df['y'])
l_model = Ridge(alpha=1.0)
l_model.fit(X, df['y'])
s_model = Ridge(alpha=2.)
s_model.fit(X, df['y'])
# Pairwise validation: clean both sides of every pair exactly like the
# training text, then build the same [TF-IDF | hashing] feature layout.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
tqdm.pandas()
df_val['less_toxic'] = df_val['less_toxic'].progress_apply(text_cleaning)
df_val['more_toxic'] = df_val['more_toxic'].progress_apply(text_cleaning)
X_less_toxic = vec.transform(df_val['less_toxic'])
X_more_toxic = vec.transform(df_val['more_toxic'])
X_less_toxic2 = vec2.transform(df_val['less_toxic'])
X_more_toxic2 = vec2.transform(df_val['more_toxic'])
X_less_toxic = scipy.sparse.hstack([X_less_toxic, X_less_toxic2])
X_more_toxic = scipy.sparse.hstack([X_more_toxic, X_more_toxic2])
del X_less_toxic2,X_more_toxic2
# Fraction of pairs where less_toxic is scored strictly below more_toxic,
# printed for model, l_model and s_model in turn.
p1 = model.predict(X_less_toxic)
p2 = model.predict(X_more_toxic)
print(f'val : {(p1 < p2).mean()}')
p3 = l_model.predict(X_less_toxic)
p4 = l_model.predict(X_more_toxic)
print(f'val : {(p3 < p4).mean()}')
p5 = s_model.predict(X_less_toxic)
p6 = s_model.predict(X_more_toxic)
print(f'val : {(p5 < p6).mean()}')
# Score the competition comments: re-read the file, clean the text and build
# the [TF-IDF | hashing] features with the fitted vectorizers.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
tqdm.pandas()
df_sub['text'] = df_sub['text'].progress_apply(text_cleaning)
X_test = vec.transform(df_sub['text'])
X_test2 = vec2.transform(df_sub['text'])
X_test = scipy.sparse.hstack([X_test, X_test2])
del X_test2
# p3..p5 are rebound here to test predictions (they held validation
# predictions above); the final score is the plain mean of the three models.
p3 = model.predict(X_test)
p4 = l_model.predict(X_test)
p5 = s_model.predict(X_test)
df_sub['score'] = (p3 + p4 + p5) / 3.
# Self-assignment: leaves the column unchanged.
df_sub['score'] = df_sub['score']
df_sub[['comment_id', 'score']].to_csv("submission33.csv", index=False)

In [None]:
# Blend the four intermediate submissions on a common rank scale.
# submission1 is the base frame (indexed by comment_id); its score is used as
# written to file, the other three are converted to ordinal ranks first.
data = pd.read_csv("./submission1.csv", index_col="comment_id")
data["score1"] = data["score"]

for blend_col, csv_path in (
    ("score2", "./submission2.csv"),
    ("score3", "./submission3.csv"),
    ("score33", "./submission33.csv"),
):
    # Column assignment aligns on the comment_id index.
    data[blend_col] = pd.read_csv(csv_path, index_col="comment_id")["score"]
    data[blend_col] = rankdata(data[blend_col], method='ordinal')

# Weighted sum: 0.75 for submission1, 0.55 for submission2 and 0.75 for the
# average of submission3 / submission33.
data["score"] = (
    .75 * data["score1"]
    + .55 * data["score2"]
    + (0.5 * data["score3"] + 0.5 * data["score33"]) * .75
)

In [None]:
# Replace the blended score by its ordinal rank (ties broken by position),
# then show the first rows for a quick look.
data["score"] = rankdata( data["score"], method='ordinal')
data.head()

In [None]:
# Write only the final score column. `data` was read with
# index_col="comment_id", so the index is written alongside the score.
df_test = data
df_test["score"].to_csv('./submission.csv')

In [None]:
# Read the written file back so the notebook displays it for inspection.
pd.read_csv("./submission.csv")