In [None]:
import pandas as pd
import numpy as np
import re 
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from scipy.stats import rankdata

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


# Text cleaning 

In [None]:
# Patterns are compiled once here because preprocess_text is mapped over every comment.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<[^>]*>')
_NON_ASCII_RE = re.compile(r'[^\x00-\x7E]+')
# All ASCII control characters (tab, newline, ..., unit separator)
_CONTROL_CHARS_RE = re.compile(r'[\x00-\x1F]+')

# Order matters: specific contractions must be expanded before the generic suffixes.
_CONTRACTIONS = (
    (re.compile(r"what's"), "what is "),
    (re.compile(r"'s"), " "),
    (re.compile(r"'ve"), " have "),
    (re.compile(r"can't"), "can not "),
    (re.compile(r"won't"), "will not "),
    # Generic negation: "don't" -> "do not", "isn't" -> "is not"
    (re.compile(r"n't"), " not "),
    (re.compile(r"i'm"), "i am "),
    (re.compile(r"'re"), " are "),
    (re.compile(r"'d"), " would "),
    (re.compile(r"'ll"), " will "),
)

# The hyphen is escaped on purpose: an unescaped "+-/" is a character range
# that would also swallow "," and ".", which must survive for the
# punctuation-collapsing step below.
_SPECIAL_CHARS_RE = re.compile(r"""["#$%&'()*+\-/:;<=>@\[\]\\^_`{|}~]""")
_REPEATED_END_PUNCT_RE = re.compile(r'[.!?]{2,}')
_REPEATED_COMMA_RE = re.compile(r',+')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_text(text, lower_case=True, clean_text=True):
    """Normalise a raw comment before vectorisation.

    Parameters
    ----------
    text : str
        Raw comment. ``None`` and float NaN are treated as an empty string;
        any other non-string value is converted with ``str()``.
    lower_case : bool, default True
        Lower-case the text first. The contraction patterns are lower-case,
        so they only match reliably when this is enabled.
    clean_text : bool, default True
        Additionally expand contractions, replace special characters and
        digits with spaces, collapse repeated punctuation and whitespace.

    Returns
    -------
    str
        The cleaned text with leading/trailing whitespace removed.
    """
    if not isinstance(text, str):
        is_missing = text is None or (isinstance(text, float) and text != text)
        text = "" if is_missing else str(text)

    if lower_case:
        text = text.lower()

    # Remove website links
    text = _URL_RE.sub('', text)

    # Remove HTML tags
    text = _HTML_TAG_RE.sub('', text)

    # Remove non-ASCII characters
    text = _NON_ASCII_RE.sub('', text)

    # Replace non-printable (control) characters with a space
    text = _CONTROL_CHARS_RE.sub(' ', text)

    if clean_text:
        # Expand contractions
        for pattern, replacement in _CONTRACTIONS:
            text = pattern.sub(replacement, text)
        # Replace special characters (sentence punctuation . , ! ? is kept)
        text = _SPECIAL_CHARS_RE.sub(' ', text)
        # Collapse repeated punctuation
        text = _REPEATED_END_PUNCT_RE.sub('.', text)
        text = _REPEATED_COMMA_RE.sub(',', text)
        # Remove numbers
        text = _DIGITS_RE.sub(' ', text)
        # Collapse runs of whitespace
        text = _WHITESPACE_RE.sub(' ', text)

    # Remove spaces at the beginning and at the end of the string
    return text.strip()

# Validation

In [None]:
valid_df = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv")

def validate_model(model, valid_df=valid_df, clean_text=True):
    """Score a model on the pairwise validation data.

    Parameters
    ----------
    model : object with a ``predict`` method
        Fitted estimator (e.g. a scikit-learn ``Pipeline``) that maps an
        iterable of raw text to numeric toxicity scores.
    valid_df : pandas.DataFrame
        Frame with ``less_toxic`` and ``more_toxic`` text columns. It is not
        modified; the cleaned text is kept in local Series.
    clean_text : bool, default True
        Forwarded to ``preprocess_text``.

    Returns
    -------
    float
        Fraction of pairs where the ``more_toxic`` comment receives a
        strictly higher score than the ``less_toxic`` one.

    Raises
    ------
    TypeError
        If ``model`` has no ``predict`` method.
    """
    if not hasattr(model, "predict"):
        raise TypeError("model must provide a predict() method, got %s" % type(model).__name__)

    # Clean into local Series so repeated calls never re-clean (or overwrite)
    # the shared validation frame.
    less_toxic = valid_df["less_toxic"].map(lambda com: preprocess_text(com, clean_text=clean_text))
    more_toxic = valid_df["more_toxic"].map(lambda com: preprocess_text(com, clean_text=clean_text))

    # predict
    pred_less = np.asarray(model.predict(less_toxic))
    pred_more = np.asarray(model.predict(more_toxic))

    # compare
    return np.mean(pred_less < pred_more)

# Load first Kaggle competition dataset

In [None]:
# Training split of the toxic-comment classification challenge
train1_df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
print("Number of training exaples:", train1_df.shape[0])

# Test split: the labels live in a separate file and are merged onto the comments
test_lbl_df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv")
test_df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test.csv").merge(test_lbl_df)
print("Number of testing exaples:", test_df.shape[0])

# Stack both splits, drop the id column and keep only rows with toxic >= 0
train1_df = (
    pd.concat([test_df, train1_df], ignore_index=True)
    .drop(columns=["id"])
    .loc[lambda frame: frame.toxic >= 0]
    .reset_index(drop=True)
)
print("Final number of training exaples:", train1_df.shape[0])

# First three rows of the summary statistics (count, mean, std)
train1_df.describe().head(3)

## Make target value 

In [None]:
# Distribution of toxic and severe-toxic comments: group size and share in percent
comments_per_group = train1_df.groupby(["toxic", "severe_toxic"])["comment_text"].count()
toxic_count = pd.DataFrame({
    "count": comments_per_group,
    "precentage": np.round(comments_per_group / comments_per_group.sum() * 100, 2),
})
toxic_count

In [None]:
# Number of additional toxicity flags (obscene, threat, insult, identity_hate) per comment
train1_df['toxic_ind'] = (
    train1_df.obscene
    .add(train1_df.threat)
    .add(train1_df.insult)
    .add(train1_df.identity_hate)
)

# Group size and share in percent for every (toxic, severe_toxic, toxic_ind) combination
comments_per_group = (
    train1_df.groupby(["toxic", "severe_toxic", "toxic_ind"])["comment_text"].count()
)
toxic_count = pd.DataFrame({
    "count": comments_per_group,
    "precentage": np.round(comments_per_group / comments_per_group.sum() * 100, 2),
})
toxic_count

In [None]:
# Final toxicity indicator: number of additional flags raised, capped at 3
extra_flags = (
    train1_df.obscene
    .add(train1_df.threat)
    .add(train1_df.insult)
    .add(train1_df.identity_hate)
)
train1_df['toxic_ind'] = extra_flags.clip(upper=3)


In [None]:
# Weights applied to "toxic", "severe_toxic" and "toxic_ind" (in that order)
w_toxic = [1, 1, 0.25]

# Toxicity score: weighted sum of the three indicators, scaled so the largest value is 1
weighted_score = sum(
    weight * train1_df[column]
    for weight, column in zip(w_toxic, ["toxic", "severe_toxic", "toxic_ind"])
)
train1_df = train1_df.assign(y=weighted_score / weighted_score.max())

# Histogram of the strictly positive scores
positive_scores = train1_df.loc[train1_df["y"] > 0, "y"]
positive_scores.hist(bins=50)

# Define tf-idf + regression model

In [None]:
def run_tfidf_model(train_df, max_features=5000, alphas=(0.5, 1, 2)):
    """Fit and validate a TF-IDF + RidgeCV pipeline.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data with a ``comment_text`` column (raw text) and a ``y``
        column (target score). The frame is not modified.
    max_features : int, default 5000
        Vocabulary size passed to ``TfidfVectorizer``.
    alphas : sequence of float, default (0.5, 1, 2)
        Candidate regularisation strengths passed to ``RidgeCV``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline with steps ``"vectorizer"`` and ``"regr"``.
    """
    # Clean into a local Series: overwriting train_df['comment_text'] would
    # make every re-run of this function clean already-cleaned text and would
    # silently change the caller's frame.
    clean_comments = train_df['comment_text'].map(preprocess_text)

    # make model
    print('Vectorization\n')
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
    regr = RidgeCV(alphas=alphas)
    model = Pipeline(
        [
            ("vectorizer", vectorizer),
            ("regr", regr),
        ])

    # Fit the model
    print('Fit Model\n')
    model.fit(clean_comments, train_df['y'])

    # Validate the model on the pairwise validation data
    print('Validate Model')
    right_order_pred = validate_model(model)
    print('Correctly ordered sentences in the validation data:', np.round(right_order_pred*100, 3), '%\n' )

    return model

In [None]:
# Fit the TF-IDF + RidgeCV pipeline on train1_df; %time reports how long the call takes
%time tfidf_model = run_tfidf_model(train1_df)

In [None]:
# Show the alpha_ attribute of the fitted "regr" step (the RidgeCV regressor) of the pipeline
tfidf_model["regr"].alpha_

# Make submission

In [None]:
comments_to_score = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv")

# The model was fitted and validated on text cleaned by preprocess_text, so the
# comments to score must go through the same cleaning before prediction.
clean_comments = comments_to_score["text"].map(preprocess_text)
test_score = tfidf_model.predict(clean_comments)

# The submission is written with ordinal ranks of the predicted scores
comments_to_score["score"] = rankdata(test_score, method="ordinal")
comments_to_score[["comment_id","score"]].to_csv("submission.csv", index=False)