In [None]:
# Notebook Imports & Setup
from collections import Counter, defaultdict
from functools import partial
from tqdm.auto import tqdm
from pathlib import Path
from time import time
import pandas as pd
import numpy as np
import sklearn
import joblib
import re

from gensim.parsing.preprocessing import remove_stopwords

from sklearn.linear_model import Ridge, RidgeCV, RidgeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
import sklearn.pipeline
from sklearn.metrics import f1_score, jaccard_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier, Perceptron
from nltk.stem import LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
from bs4 import BeautifulSoup
# Shared stemmer instance, created once and reused by every text_cleaning call.
lancaster = LancasterStemmer()


def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP. Operations, in order:
    1. Removes embedded URL links (http://, https:// and www. prefixes)
    2. Removes HTML tags (keeps only the text BeautifulSoup extracts)
    3. Replaces every character that is not an ASCII letter or a digit with a
       space (this is what drops special characters like &, # and emojis)
    4. Collapses runs of spaces and strips leading/trailing spaces
    5. Removes stopwords (gensim remove_stopwords)
    6. Tokenizes the text and stems every token with the Lancaster stemmer

    text - Text piece to be cleaned.

    Returns the stemmed tokens joined by single spaces, with no leading or
    trailing space (an empty string when no token survives the cleaning).
    '''
    # URLs go first so their pieces are not left behind as ordinary words.
    text = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # Let the HTML parser strip markup and keep only the visible text.
    text = BeautifulSoup(text, 'lxml').get_text()

    text = re.sub(r"[^a-zA-Z\d]", " ", text)  # special characters -> space
    text = re.sub(' +', ' ', text).strip()    # collapse and trim spaces
    text = remove_stopwords(text)

    # Join with a separator instead of appending " " after every stem, so the
    # result carries no trailing space.
    return " ".join(lancaster.stem(word) for word in word_tokenize(text))
    

In [None]:
# Load the labelled training comments of the Jigsaw toxic-comment challenge.
old_train = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv')

# y is the row-wise sum over the label columns 'toxic' .. 'identity_hate';
# it is used below to separate rows with y > 0 from rows with y == 0.
old_train['y'] = old_train.loc[:, 'toxic':'identity_hate'].sum(axis=1)

# Keep every row with y > 0 plus a seeded random sample of y == 0 rows that is
# half as large (integer division).
pos = old_train.loc[old_train['y'] > 0]
neg = old_train.loc[old_train['y'] == 0].sample(n=len(pos) // 2, random_state=201)
old_train = pd.concat([pos, neg])
old_train

In [None]:
def read_old_test():
    """Load the old challenge's test comments joined with their labels.

    Reads test.csv and test_labels.csv, left-joins the labels onto the
    comments by 'id', turns every -1 value into NaN and drops all rows that
    contain a NaN afterwards.

    Returns the filtered, merged DataFrame.
    """
    comments = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv')
    labels = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv')
    merged = comments.merge(labels, how='left', on='id')
    # -1 is rewritten as NaN so that a single dropna() removes those rows.
    return merged.replace(-1, np.nan).dropna()

old_test = read_old_test()

# y is the row-wise sum over the label columns 'toxic' .. 'identity_hate'.
old_test['y'] = old_test.loc[:, 'toxic':'identity_hate'].sum(axis=1)

# Only the old test rows with y > 0 are appended to the training data.
old_test_pos = old_test.loc[old_test['y'] > 0]

train = pd.concat([old_train, old_test_pos])

In [None]:
# Remove the helper column 'y' from the training frame.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because 'y' has already been dropped.
train = train.drop(columns='y', errors='ignore')
train

In [None]:
# Register tqdm's progress_apply on pandas objects, then clean every comment.
tqdm.pandas()
train['comment_text'] = train['comment_text'].progress_apply(text_cleaning)
train

In [None]:
import sklearn.linear_model
import sklearn.pipeline

In [None]:
# TF-IDF over character n-grams (3 to 5 characters, built inside word
# boundaries), limited to the 50000 most frequent features.
tfidf_options = dict(
    analyzer='char_wb',
    ngram_range=(3, 5),
    min_df=3,        # ignore n-grams found in fewer than 3 documents
    max_df=0.5,      # ignore n-grams found in more than half of the documents
    max_features=50000,
    lowercase=True,
)
vec = TfidfVectorizer(**tfidf_options)

In [None]:
# Targets are the label columns 'toxic' .. 'identity_hate'; the cleaned
# comment text is the only input feature.
label_frame = train.loc[:, 'toxic':'identity_hate']

# Hold out 20% of the rows with a fixed seed.
text_train, text_test, y_train, y_test = sklearn.model_selection.train_test_split(
    train['comment_text'],
    label_frame,
    test_size=0.20,
    random_state=0,
)

# Fit the vectorizer on the training text only, then reuse it for the held-out text.
X_train = vec.fit_transform(text_train)
X_test = vec.transform(text_test)


In [None]:
def fit_model(model, params, cv=5, scoring='f1_weighted'):
    """Grid-search `model` and score the refit search on the held-out split.

    The search is fitted on the notebook-level X_train / y_train and then used
    to predict X_test; the predictions are compared with y_test.

    model   - estimator handed to GridSearchCV.
    params  - parameter grid handed to GridSearchCV.
    cv      - cross-validation setting forwarded to GridSearchCV.
    scoring - scoring setting forwarded to GridSearchCV.

    Returns (search, scores, y_pred) where `search` is the fitted GridSearchCV,
    `scores` is [accuracy, f1, jaccard, recall, precision] (all but accuracy
    computed with average='weighted') and `y_pred` are the X_test predictions.
    """
    search = GridSearchCV(model, params, cv=cv, scoring=scoring)
    search.fit(X_train, y_train)

    predictions = search.predict(X_test)

    scores = [accuracy_score(y_test, predictions)]
    for metric in (f1_score, jaccard_score, recall_score, precision_score):
        scores.append(metric(y_test, predictions, average='weighted'))

    return search, scores, predictions

In [None]:
# FAST=True evaluates a single parameter combination; FAST=False runs the full grid.
FAST = True
if FAST:
    params = {
        'estimator__penalty': ["l2"],
        'estimator__loss': ['hinge'],
        'estimator__class_weight': [None],
        'estimator__n_jobs': [-1],
    }
else:
    params = {
        'estimator__penalty': ["l1", "l2", "elasticnet"],
        # NOTE(review): newer scikit-learn releases spell the 'log' loss
        # 'log_loss' - confirm against the installed version before running.
        'estimator__loss': ['squared_hinge', 'log', 'hinge'],
        # class_weight accepts None, 'balanced' or an actual dict; the former
        # string "dict" was not a valid value and made those candidates fail.
        'estimator__class_weight': [None, "balanced"],
        'estimator__n_jobs': [-1],
    }

# A fixed random_state makes the SGD fit (and thus the scores) reproducible.
model = SGDClassifier(random_state=0)
model1, scores, y_pred = fit_model(OneVsRestClassifier(model), params)
scores, model1.best_params_

In [None]:
# FAST=True tries a single alpha; FAST=False searches the whole range.
FAST = True
alpha_grid = [4] if FAST else [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5]
params = {'estimator__alpha': alpha_grid}

# One-vs-rest ridge classifier, selected by accuracy with 5-fold CV.
model = RidgeClassifier()
model2, scores, y_pred = fit_model(
    OneVsRestClassifier(model),
    params,
    cv=5,
    scoring='accuracy',
)
scores, model2.best_params_

In [None]:
# Comments to be scored for the submission, cleaned with the same routine as
# the training text.
sub = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
sub['text'] = sub['text'].progress_apply(text_cleaning)
sub


In [None]:
# Weight applied to each label's decision value when building the final score.
# The insertion order of this dict defines the order of the weight vector `f`.
FEATURE_WTS = {
    'toxic': 0.32,
    'severe_toxic': 1.5,
    'obscene': 0.16,
    'threat': 1.5,
    'insult': 0.64,
    'identity_hate': 1.5,
}

# Weights as a 1-D array, in the dict's insertion order.
f = np.array([weight for weight in FEATURE_WTS.values()])
f

In [None]:
# Vectorise the cleaned submission text once and reuse it for both models
# (the same transform was previously computed twice).
sub_features = vec.transform(sub.text)
p1 = model1.decision_function(sub_features)
p2 = model2.decision_function(sub_features)

# Weight each column of the decision values by f, sum per row, then average
# the two models.
# NOTE(review): assumes the decision-value columns are in the same order as
# FEATURE_WTS - confirm against the label column order of the training data.
sub['score'] = ((f * p1).sum(axis=1) + (f * p2).sum(axis=1)) / 2
sub

In [None]:
# Write only the comment id and its score, without the index column.
sub.to_csv('submission.csv', columns=['comment_id', 'score'], index=False)