# Using Multi-label Classification probabilities  (Logistic Regression + NB-SVM)

 This notebook uses the dataset from Jigsaw_toxic_comment_classification to train multi-label classifiers using techniques such as Binary Relevance & Classifier Chains.
 For validation and test set predictions, the probability outputs of different classes are summed up to get the overall toxicity score which is used for submission.

The following two awesome notebooks from the Jigsaw_toxic_comment_classification competition were used as references for this work, with minor adjustments for this competition.
- _Classifying multi-label comments_, https://www.kaggle.com/rhodiumbeng/classifying-multi-label-comments-0-9741-lb
- _NB-SVM strong linear baseline_, https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline/comments
            

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import re

from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from collections import defaultdict
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import gc




# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Labelled training comments from the toxic-comment-classification challenge;
# the label columns listed in `target_labels` are read from this frame.
train_df = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv')
# Validation pairs for the severity-rating competition; the code below reads
# its "less_toxic" and "more_toxic" text columns.
val_severity_df = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv')
# Comments to score for the submission; the code below reads its "text" and
# "comment_id" columns.
test_df = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv")

In [None]:
# Keep every column after the first two (presumably the six label columns —
# confirm against train.csv) and plot their pairwise correlation.
data = train_df.iloc[:, 2:]

colormap = plt.cm.plasma
plt.figure(figsize=(7, 7))
plt.title('Correlation of features & targets', y=1.05, size=14)
# The heatmap call stays the last expression so the notebook renders it.
sns.heatmap(
    data.astype(float).corr(),
    cmap=colormap,
    annot=True,
    square=True,
    vmax=1.0,
    linewidths=0.1,
    linecolor='white',
)

In [None]:
def clean_text(text):
    """Normalise a raw comment into lowercase words separated by single spaces.

    The text is lowercased, a handful of English contractions are expanded,
    every non-word character is replaced by a space, runs of whitespace are
    collapsed and leading/trailing spaces are removed.

    Args:
        text: The raw comment as a ``str``.

    Returns:
        The cleaned string. It contains only word characters and single
        spaces, so cleaning an already-cleaned string returns it unchanged.
    """
    text = text.lower()

    # Order matters: specific rules must run before the generic ones that
    # would otherwise consume part of their match.
    contraction_rules = (
        (r"what's", "what is "),
        # Must precede the generic 's rule; otherwise "'scuse" is rewritten to
        # " cuse" first and this rule can never match.
        (r"'scuse", " excuse "),
        (r"'s", " "),
        (r"'ve", " have "),
        # Must precede the generic n't rule so "can't" becomes "cannot".
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"'re", " are "),
        (r"'d", " would "),
        (r"'ll", " will "),
    )
    for pattern, replacement in contraction_rules:
        text = re.sub(pattern, replacement, text)

    # Raw strings avoid the invalid-escape warnings that '\W' and '\s' trigger.
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip(" ")

In [None]:
# Replace the raw training comments with their cleaned form, in place.
train_df["comment_text"] = train_df["comment_text"].apply(clean_text)

In [None]:
import re, string


# import and instantiate TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over unigrams and bigrams, capped at 5000 features, with the built-in
# English stop-word list.
vect = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=5000,
)
vect

In [None]:
# Fit the TF-IDF vectorizer on the training comments and encode them; this
# matrix is what gets passed to `combined_score` as the training features.
X_vec = vect.fit_transform(train_df["comment_text"])

In [None]:
# Sanity check on the size of the encoded training matrix
# (rows are comments, columns are TF-IDF features).
X_vec.shape

In [None]:
# Names of the binary label columns read from the training frame.
target_labels = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [None]:
def predict_val_set(model, vectorizer, df_val, process_text=clean_text):
    """Score both sides of every validation pair with ``model``.

    Unlike the previous version, the processed text is kept in local
    variables instead of being written back into ``df_val``. This function is
    called repeatedly on the same frame, and writing back meant every call
    re-applied ``process_text`` to already-processed text.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        df_val: DataFrame with "less_toxic" and "more_toxic" text columns.
            It is not modified.
        process_text: Callable applied to each comment before vectorising.

    Returns:
        A ``(less_toxic_scores, more_toxic_scores)`` tuple holding whatever
        ``model.predict_proba`` returns for each column.
    """
    less_toxic_text = df_val["less_toxic"].apply(process_text)
    more_toxic_text = df_val["more_toxic"].apply(process_text)

    less_toxic_scores = model.predict_proba(vectorizer.transform(less_toxic_text))
    more_toxic_scores = model.predict_proba(vectorizer.transform(more_toxic_text))

    return (less_toxic_scores, more_toxic_scores)


def predict_test_set(model, vectorizer, df_test, process_text=clean_text):
    """Score every comment of the test frame with ``model``.

    The processed text is kept local rather than written back into
    ``df_test``, so repeated calls on the same frame always start from the
    caller's original text.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        df_test: DataFrame with a "text" column. It is not modified.
        process_text: Callable applied to each comment before vectorising.

    Returns:
        Whatever ``model.predict_proba`` returns for the vectorised text.
    """
    processed_text = df_test["text"].apply(process_text)
    toxic_vec = vectorizer.transform(processed_text)
    return model.predict_proba(toxic_vec)

### Binary Relevance

In [None]:


def binary_relevance(model, model_name, X_vec, train_df, val_df, df_test, vectorizer):
    """Fit one independent binary classifier per label and sum their scores.

    For each label, ``model`` is refitted on ``X_vec`` against that label's
    column, and column 1 of ``predict_proba`` (presumably the positive-class
    probability — confirm for non-scikit-learn models) is stored. The
    per-label scores are then summed row-wise into a single toxicity score.

    The validation and test text is cleaned and vectorised once, up front:
    that work does not depend on the label, so repeating it inside the loop
    only multiplied the cost by the number of labels. The input frames are
    not modified.

    Args:
        model: Estimator exposing ``fit`` and ``predict_proba``; it is
            refitted in place once per label.
        model_name: Display name used in the printed accuracy message.
        X_vec: Training feature matrix produced by ``vectorizer``.
        train_df: Training frame containing one column per target label.
        val_df: Frame with "less_toxic" and "more_toxic" text columns.
        df_test: Frame with a "text" column.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        Three pandas Series of summed scores, in this order: less-toxic
        validation comments, more-toxic validation comments, test comments.
    """
    target_labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    # Label-independent work, hoisted out of the per-label loop.
    less_toxic_vec = vectorizer.transform(val_df["less_toxic"].apply(clean_text))
    more_toxic_vec = vectorizer.transform(val_df["more_toxic"].apply(clean_text))
    test_toxic_vec = vectorizer.transform(df_test["text"].apply(clean_text))

    less_toxic_scores = {}
    more_toxic_scores = {}
    test_toxic_scores = {}
    for label in target_labels:
        model.fit(X_vec, train_df[label].values)
        less_toxic_scores[label] = model.predict_proba(less_toxic_vec)[:, 1]
        more_toxic_scores[label] = model.predict_proba(more_toxic_vec)[:, 1]
        test_toxic_scores[label] = model.predict_proba(test_toxic_vec)[:, 1]

    br_less_toxic_final_score = pd.DataFrame(less_toxic_scores).sum(axis=1)
    br_more_toxic_final_score = pd.DataFrame(more_toxic_scores).sum(axis=1)
    br_test_toxic_final_score = pd.DataFrame(test_toxic_scores).sum(axis=1)

    # Share of validation pairs where the "more toxic" comment scores higher.
    val_accuracy = (br_less_toxic_final_score < br_more_toxic_final_score).mean() * 100
    print("BR validation accuracy for model {}  is {}".format(model_name, val_accuracy))

    # Drop the large intermediates before asking the collector to run.
    del less_toxic_scores, more_toxic_scores, test_toxic_scores
    del less_toxic_vec, more_toxic_vec, test_toxic_vec
    gc.collect()
    return br_less_toxic_final_score, br_more_toxic_final_score, br_test_toxic_final_score



    

### Classifier Chains

In [None]:
def classifier_chains(model,model_name, X_vec, train_df, val_df,df_test,  vectorizer):
    """Fit a ``ClassifierChain`` over all labels and sum its per-label scores.

    Prints the chain's accuracy on the training data and the share of
    validation pairs ranked correctly, then returns the row-wise score sums.

    Args:
        model: Base estimator wrapped by ``ClassifierChain``.
        model_name: Display name used in the printed messages.
        X_vec: Training feature matrix produced by ``vectorizer``.
        train_df: Training frame containing one column per target label.
        val_df: Frame with "less_toxic" and "more_toxic" text columns.
        df_test: Frame with a "text" column.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        Three pandas Series of summed scores, in this order: less-toxic
        validation comments, more-toxic validation comments, test comments.
    """
    print("running_classifier_chain")

    label_columns = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
    targets = train_df[label_columns]

    chain = ClassifierChain(model)
    chain.fit(X_vec, targets)

    train_predictions = chain.predict(X_vec)
    train_accuracy = accuracy_score(targets, train_predictions)
    print('CC Training accuracy for model {} is {}'.format(model_name, train_accuracy))

    val_less_probs, val_more_probs = predict_val_set(
        model=chain, vectorizer=vectorizer, df_val=val_df, process_text=clean_text
    )
    test_probs = predict_test_set(
        model=chain, vectorizer=vectorizer, df_test=df_test, process_text=clean_text
    )

    def _row_totals(probabilities):
        # The chain's probabilities are densified with .toarray() before summing.
        return pd.DataFrame(probabilities.toarray()).sum(axis=1)

    less_total = _row_totals(val_less_probs)
    more_total = _row_totals(val_more_probs)
    test_total = _row_totals(test_probs)

    # Share of validation pairs where the "more toxic" comment scores higher.
    val_accuracy = (less_total < more_total).mean() * 100
    print("CC validation accuracy for model {} is {}".format(model_name, val_accuracy))

    del val_less_probs, val_more_probs, val_accuracy, chain
    gc.collect()

    return less_total, more_total, test_total

## Combined_score

In [None]:
def combined_score(model,model_name, X_vec, train_df, val_df,df_test, vectorizer):
    """Add the binary-relevance and classifier-chain scores for one model.

    Runs ``binary_relevance`` and then ``classifier_chains`` with the same
    arguments, adds their three score Series element-wise and prints the
    share of validation pairs the combined score ranks correctly.

    Returns:
        Three pandas Series of combined scores, in this order: less-toxic
        validation comments, more-toxic validation comments, test comments.
    """
    br_scores = binary_relevance(model, model_name, X_vec, train_df, val_df, df_test, vectorizer)
    cc_scores = classifier_chains(model, model_name, X_vec, train_df, val_df, df_test, vectorizer)

    # Both helpers return (less_toxic, more_toxic, test), so pair them up.
    combined_less_toxic_score, combined_more_toxic_score, combined_test_toxic_score = (
        br_part + cc_part for br_part, cc_part in zip(br_scores, cc_scores)
    )

    ranked_correctly = combined_less_toxic_score < combined_more_toxic_score
    val_accuracy = ranked_correctly.mean() * 100
    print("Combined validation accuracy for model{} is {}".format(model_name, val_accuracy))

    del br_scores, cc_scores
    gc.collect()

    return combined_less_toxic_score, combined_more_toxic_score, combined_test_toxic_score

In [None]:
# Per-model validation scores (one less/more column pair per model) and
# per-model test scores.
results = pd.DataFrame()
results_test = pd.DataFrame()

# Estimators to train, keyed by the name used in printed messages and in the
# result column names.
models_to_train = {
    # "SVM": SVC(probability=True),
    # "GaussianNB": GaussianNB(),
    "LogisticRegression": LogisticRegression(C=0.1, solver="liblinear"),
}

for model_name, model in models_to_train.items():
    print("---Training " + model_name + "-----")
    combined_less_toxic_score, combined_more_toxic_score, combined_test_toxic_score = combined_score(
        model,
        model_name,
        X_vec,
        train_df=train_df,
        val_df=val_severity_df,
        df_test=test_df,
        vectorizer=vect,
    )
    results[model_name + "_less_toxic_score"] = combined_less_toxic_score
    results[model_name + "_more_toxic_score"] = combined_more_toxic_score
    results_test[model_name + "_test_toxic_score"] = combined_test_toxic_score


### Naive Bayes-SVM (NB-SVM) Model

In [None]:
def nbsvm_model(train_df, val_df, df_test, process_text= clean_text):
    """Train one NB-SVM style model per label and sum the per-label scores.

    A TF-IDF matrix is built from ``train_df["comment_text"]``, which is used
    as-is: ``process_text`` is applied only to the validation and test text.
    For each label, a log-count ratio ``r`` is computed from the training
    matrix, the features are scaled by ``r`` and a logistic regression is
    fitted on the scaled features. Column 1 of ``predict_proba`` is kept for
    each label and summed row-wise.

    Changes from the previous version:
      * The TF-IDF boolean options are real booleans; recent scikit-learn
        releases validate parameter types and reject the integer ``1``.
      * ``string.punctuation`` is escaped before being placed in the character
        class, so the backslash is matched as punctuation instead of escaping
        the following ``]``.
      * The validation/test frames are no longer overwritten with processed
        text.

    Args:
        train_df: Training frame with "comment_text" and one column per label.
        val_df: Frame with "less_toxic" and "more_toxic" text columns.
        df_test: Frame with a "text" column.
        process_text: Callable applied to each validation/test comment.

    Returns:
        Three pandas Series of summed scores, in this order: less-toxic
        validation comments, more-toxic validation comments, test comments.
    """
    re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

    def tokenize(s):
        # Surround each punctuation mark with spaces so it becomes its own token.
        return re_tok.sub(r' \1 ', s).split()

    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        tokenizer=tokenize,
        min_df=3,
        max_df=0.9,
        strip_accents='unicode',
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
    )
    X_vec = vectorizer.fit_transform(train_df["comment_text"])

    def pr(y_i, y, x):
        # Add-one smoothed feature sum over the rows whose label equals y_i.
        p = x[y == y_i].sum(0)
        return (p + 1) / ((y == y_i).sum() + 1)

    def get_mdl(y, x):
        # Fit a logistic regression on features scaled by the log-count ratio.
        y = y.values
        r = np.log(pr(1, y, x) / pr(0, y, x))
        m = LogisticRegression(C=4, dual=True, solver="liblinear")
        x_nb = x.multiply(r)
        return m.fit(x_nb, y), r

    target_labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    # Processed text stays local so the caller's frames are left untouched.
    less_toxic_vec = vectorizer.transform(val_df["less_toxic"].apply(process_text))
    more_toxic_vec = vectorizer.transform(val_df["more_toxic"].apply(process_text))
    test_toxic_vec = vectorizer.transform(df_test["text"].apply(process_text))

    less_toxic_scores = {}
    more_toxic_scores = {}
    test_toxic_scores = {}

    for label in target_labels:
        print('fit', label)
        m, r = get_mdl(train_df[label], X_vec)

        # Apply the same scaling by r that was used at fit time.
        less_toxic_scores[label] = m.predict_proba(less_toxic_vec.multiply(r))[:, 1]
        more_toxic_scores[label] = m.predict_proba(more_toxic_vec.multiply(r))[:, 1]
        test_toxic_scores[label] = m.predict_proba(test_toxic_vec.multiply(r))[:, 1]

    nbsvm_less_toxic_final_score = pd.DataFrame(less_toxic_scores).sum(axis=1)
    nbsvm_more_toxic_final_score = pd.DataFrame(more_toxic_scores).sum(axis=1)
    nbsvm_test_toxic_final_score = pd.DataFrame(test_toxic_scores).sum(axis=1)

    # Share of validation pairs where the "more toxic" comment scores higher.
    val_accuracy = (nbsvm_less_toxic_final_score < nbsvm_more_toxic_final_score).mean() * 100
    print("BR validation accuracy for NBSVM is {}".format(val_accuracy))

    del less_toxic_scores, more_toxic_scores, test_toxic_scores
    gc.collect()
    return nbsvm_less_toxic_final_score, nbsvm_more_toxic_final_score, nbsvm_test_toxic_final_score
    
    
    
    

In [None]:
# Train the NB-SVM model and store its validation and test scores next to the
# scores of the other models.
nbsvm_less, nbsvm_more, nbsvm_test = nbsvm_model(train_df, val_severity_df, test_df)
results["NBsvm_less_toxic_score"] = nbsvm_less
results["NBsvm_more_toxic_score"] = nbsvm_more
results_test["NBsvm_test_toxic_score"] = nbsvm_test



In [None]:
# Preview the first rows of the per-model validation scores.
results.head()

In [None]:
### Combined Validation Score
# Pick the score columns by name suffix rather than by position, so the
# result stays correct when more models add columns to `results`.
less_toxic_total = results.filter(regex=r"_less_toxic_score$").sum(axis=1)
more_toxic_total = results.filter(regex=r"_more_toxic_score$").sum(axis=1)
# Fraction of validation pairs where the summed "more toxic" score is higher.
(less_toxic_total < more_toxic_total).mean()

### Submit Predictions

In [None]:
# results_test["score"] = results_test.sum(axis=1)
# results_test

In [None]:
# submission = pd.DataFrame(zip(test_df["comment_id"], results_test["score"]), columns=["comment_id","score"])
# submission.to_csv("submission.csv", index=False)

In [None]:
### only nb-svm
# Build the submission from the NB-SVM test scores alone, pairing each
# comment id with its score by position, and write it without the index.
score = results_test["NBsvm_test_toxic_score"]
submission_rows = zip(test_df["comment_id"], score)
submission = pd.DataFrame(submission_rows, columns=["comment_id", "score"])
submission.to_csv("submission.csv", index=False)