In [None]:
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import rankdata
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Load the labelled training comments, the test comments and the test labels.
train_data = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
test_data = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test.csv")
test_label = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv")

# Per the dataset description, -1 in test_labels.csv marks a comment that was not
# used for scoring, i.e. it carries no ground truth. Rewriting -1 to 0 would train
# on those comments as if they were confirmed clean, so they are dropped instead.
is_labelled = (test_label.drop(columns='id') != -1).all(axis=1)
test_label = test_label[is_labelled]
# Keep only the test comments that still have a label row to merge with.
test_data = test_data[test_data['id'].isin(test_label['id'])]

In [None]:
# Attach the label columns to the test comments. Selecting only the raw comment
# columns first keeps this cell safe to re-run: merging an already-merged frame
# yields suffixed columns (toxic_x / toxic_y) instead of the label columns that
# the target computation reads.
test_data = pd.merge(test_data[['id', 'comment_text']], test_label, how='left', on='id')
# Both inputs carry their own 0-based index; ignore_index gives every row of the
# combined frame a unique label.
total_data = pd.concat([train_data, test_data], ignore_index=True)
total_data.sample(10)

In [None]:
# Collapse the six label columns into one target in [0, 1]:
# severe_toxic counts double, the weighted labels are summed per comment,
# and the sum is scaled by the largest value observed.
total_data = (
    total_data
    .assign(severe_toxic=lambda frame: frame['severe_toxic'] * 2)
    .assign(
        y=lambda frame: frame[
            ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
        ].sum(axis=1).astype(int)
    )
    .assign(y=lambda frame: frame['y'] / frame['y'].max())
    .loc[:, ['comment_text', 'y']]
    .rename(columns={'comment_text': 'text'})
)

In [None]:
# Spot-check ten random rows of the (text, y) training frame.
total_data.sample(n=10)

In [None]:
# How many comments fall on each distinct target value.
total_data.loc[:, 'y'].value_counts()

In [None]:
# Optional class balancing, currently disabled: the three commented lines keep every
# row with y > 0 and draw an equally sized random sample (random_state=101) of the
# y == 0 rows. Un-comment them and remove the plain assignment below to enable it.
# sample = len(total_data[total_data.y>0])
# total_data_undersample = total_data[total_data['y'] == 0].sample(n=sample, random_state=101)
# comment_df = pd.concat([total_data[total_data['y'] > 0], total_data_undersample])
comment_df = total_data  # no copy: comment_df is another name for the same frame
comment_df

In [None]:
# Features are the raw comment strings; the target is the scaled toxicity score.
X, y = comment_df['text'], comment_df['y']

In [None]:
# Matches every character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
# Normalised NLTK English stop words; filled lazily on the first default call.
_DEFAULT_STOPWORDS = None


def _normalise_stopwords(words):
    """Lower-case `words` and strip punctuation so they compare equal to cleaned tokens."""
    cleaned = (_PUNCTUATION_RE.sub('', word.lower()) for word in words)
    return frozenset(word for word in cleaned if word)


def text_process(text, stop_words=None):
    """
        1. lower-case the message
        2. remove punctuation from message
        3. remove stopwords from message
        4. return clean message as a list of tokens

    Parameters
    ----------
    text : str
        The raw message.
    stop_words : iterable of str, optional
        Words to discard. When omitted, the NLTK English stop-word list is used
        and its normalised form is cached, so it is loaded once rather than once
        per message.

    Returns
    -------
    list of str
        Lower-cased, punctuation-free tokens with stop words removed, in their
        original order.
    """
    global _DEFAULT_STOPWORDS

    if stop_words is None:
        if _DEFAULT_STOPWORDS is None:
            _DEFAULT_STOPWORDS = _normalise_stopwords(stopwords.words('english'))
        stop_set = _DEFAULT_STOPWORDS
    else:
        stop_set = _normalise_stopwords(stop_words)

    # Lower-case BEFORE filtering: the stop list is lower-case, so filtering the
    # original casing would let "The" or "I" through.
    # The stop list goes through the same punctuation stripping as the text, so
    # "don't" in the list matches the token "dont" left after cleaning.
    cleaned = _PUNCTUATION_RE.sub('', text.lower())

    # str.split() with no argument splits on any whitespace run, so newlines and
    # tabs need no separate replacement.
    return [token for token in cleaned.split() if token not in stop_set]

In [None]:
# Text -> token counts -> TF-IDF weights -> ridge regression on the toxicity score.
# The 'bow' step must emit raw counts: TfidfVectorizer already applies TF-IDF, so
# chaining it into TfidfTransformer would apply the IDF weighting a second time.
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    ('regressor', Ridge(alpha=1.0))
])

In [None]:
# Train the whole pipeline on the comment texts and their target scores.
pipeline.fit(X=X, y=y)

In [None]:
# Validation pairs; the less_toxic / more_toxic columns are scored in the next cell.
comment_val = pd.read_csv(
    "../input/jigsaw-toxic-severity-rating/validation_data.csv"
)
comment_val.sample(n=10)

In [None]:
# Predict a toxicity score for each side of every validation pair.
comment_val = comment_val.assign(
    less_toxic_score=lambda pairs: pipeline.predict(pairs['less_toxic']),
    more_toxic_score=lambda pairs: pipeline.predict(pairs['more_toxic']),
)
comment_val.sample(n=10)

In [None]:
# Pairs the model ranks the wrong way round (the "less toxic" comment scored higher);
# .count() reports the number of non-null entries per column for those rows.
comment_val.loc[
    comment_val['more_toxic_score'] < comment_val['less_toxic_score']
].count()

In [None]:
# Comments that have to be scored for the submission file.
submission_df = pd.read_csv(
    "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
)
submission_df.sample(n=10)

In [None]:
# Predict raw scores, then replace them by their ordinal rank so that every
# comment receives a distinct value (ties are broken by order of appearance).
score = pipeline.predict(submission_df['text'])
submission_df = submission_df.assign(score=rankdata(score, method='ordinal'))
submission_df.sample(n=10)

In [None]:
# Write only the id and rank columns, without the DataFrame index.
submission_df.loc[:, ['comment_id', 'score']].to_csv("submission.csv", index=False)