# Set Up

In [15]:
import numpy as np
import pandas as pd
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [16]:
# Label columns to model; each one is treated as its own binary target
classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [17]:
# Stage switches: each flag gates the `if <flag>:` cell of the same name.
# Enabled stages
train = validate = True
# Disabled stages (`predict` is not referenced by the cells visible here)
vectorize = cross_validate = predict = False

## Read Data

In [18]:
# Locations of the labelled CSV files, relative to this notebook
train_csv = '../data/train[1].csv'
val_csv = '../data/labeled data/combined.csv'

# Training set: the comment text is read from the 'comment_text' column
data_train = pd.read_csv(train_csv)
x_train_text = data_train['comment_text']

# Validation set: the comment text is read from the 'text' column
data_val = pd.read_csv(val_csv)
x_val_text = data_val['text']

# Test set: not loaded yet (the path below is still a placeholder)
# data_test = pd.read_csv('../')
# x_test_text = data_test['comment']

# Data Preprocessing

## Vectorize into n-grams

In [19]:
# The fitted TF-IDF vectorizer is cached on disk so later runs can skip fitting
filename = 'log_reg_SVM_vectorizer.sav'
vectorizer_path = '../output/' + filename

if vectorize:
    # Word-level TF-IDF over unigrams and bigrams: accents stripped, English
    # stop words removed, sublinear term-frequency scaling, and the vocabulary
    # capped at 10,000 features
    word_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w+',
        stop_words='english',
        ngram_range=(1, 2),
        max_features=10000)

    # Fit the vectorizer on the training text only
    word_vectorizer.fit(x_train_text)

    # Persist the fitted vectorizer; the `with` block guarantees the file is
    # flushed and closed before any later cell (or process) reads it
    with open(vectorizer_path, 'wb') as vectorizer_file:
        pickle.dump(word_vectorizer, vectorizer_file)
else:
    # Reuse the vectorizer written by an earlier run.
    # NOTE: pickle.load can execute arbitrary code embedded in the file, so
    # only load artifacts produced by this project.
    with open(vectorizer_path, 'rb') as vectorizer_file:
        word_vectorizer = pickle.load(vectorizer_file)



# Model

## Setup

## Cross Validate

In [20]:
if cross_validate:
    # Build the feature matrix here so this cell also works on a fresh kernel;
    # it must not rely on the training cell further down having been run first
    x_train = word_vectorizer.transform(x_train_text)

    # (banner, factory) pairs; a factory is used so that every class gets a
    # fresh, unfitted estimator
    cv_specs = [
        ("Calculating CV Scores for Logistic Regression",
         lambda: LogisticRegression(C=0.1, solver='sag')),
        ("Calculating CV Scores for SVM",
         lambda: CalibratedClassifierCV(LinearSVC())),
    ]

    cv_results = []
    for banner, make_classifier in cv_specs:
        print(banner)
        scores = []
        for class_name in classes:
            train_label = data_train[class_name]
            classifier = make_classifier()

            # Mean ROC AUC over 3 cross-validation folds for this class
            cv_score = np.mean(cross_val_score(classifier, x_train, train_label, cv=3, scoring='roc_auc'))
            scores.append(cv_score)
            print(f"CV Score for {class_name} is {cv_score}")

        # Average of the per-class scores for this model family
        print(f"Total CV Score is {np.mean(scores)}")
        cv_results.append(scores)

    # Keep the original result names (`lin_reg_scores` holds the logistic
    # regression scores) so anything that inspects them still works
    lin_reg_scores, svm_scores = cv_results

## Train

In [21]:
def get_optimal_threshold(model, x_train, train_label):
    """Return the ROC threshold at which ``tpr - fpr`` is largest.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    x_train : feature matrix passed to ``model.predict_proba``.
    train_label : true labels for the rows of ``x_train``.

    Returns
    -------
    The element of the ``thresholds`` array returned by ``roc_curve`` at the
    index where ``tpr - fpr`` is maximal (the first such index on ties).
    """
    # Column 1 of predict_proba is used as the score for each row
    positive_scores = model.predict_proba(x_train)[:, 1]

    # False/true positive rates and the threshold that produces each pair
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(train_label, positive_scores)

    # Pick the threshold with the largest gap between TPR and FPR
    best_index = np.argmax(true_pos_rate - false_pos_rate)
    return cutoffs[best_index]

In [22]:
if train:
    # Transform the training text into TF-IDF features once, shared by both model families
    x_train = word_vectorizer.transform(x_train_text)

    # (banner, output-file prefix, factory) for each model family; a factory is
    # used so that every class gets a fresh, unfitted estimator
    model_specs = [
        ("Training Log Reg!", 'log_reg_',
         lambda: LogisticRegression(C=0.1, solver='sag')),
        ("Training SVM!", 'SVM_',
         lambda: CalibratedClassifierCV(LinearSVC())),
    ]

    for banner, file_prefix, make_classifier in model_specs:
        print(banner)
        for class_name in classes:
            print(f"Training on class {class_name}")
            train_label = data_train[class_name]
            classifier = make_classifier()

            # Train one binary model for this class
            classifier.fit(x_train, train_label)

            # Threshold selection is currently disabled; only the model is saved
            #threshold = get_optimal_threshold(classifier, x_train, train_label)

            # Pickle the fitted model; the `with` block guarantees the file is
            # flushed and closed before the validation cell reads it back
            filename = file_prefix + class_name + '.sav'
            with open('../output/' + filename, 'wb') as model_file:
                pickle.dump(classifier, model_file)
    

Training SVM!
Training on class toxic
Training on class severe_toxic
Training on class obscene
Training on class threat
Training on class insult
Training on class identity_hate


# Get Validation Score

In [23]:
def get_auc(model, x_val, y_val, class_name):
    """Print and return the ROC AUC of ``model`` on the given validation data.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    x_val : feature matrix passed to ``model.predict_proba``.
    y_val : true labels for the rows of ``x_val``.
    class_name : label name, used only in the printed message.

    Returns
    -------
    The value returned by ``roc_auc_score`` for ``y_val`` against column 1 of
    the predicted probabilities.
    """
    # Column 1 of predict_proba is used as the score for each row
    positive_scores = model.predict_proba(x_val)[:, 1]
    val_auc = roc_auc_score(y_val, positive_scores)

    print(f"The validation auc for {class_name} is {val_auc}")
    return val_auc

In [24]:
if validate:
    # The text column is cast to unicode strings before vectorizing
    # (presumably so non-string entries such as NaN do not break the vectorizer)
    x_val = word_vectorizer.transform(x_val_text.values.astype('U'))

    # NOTE: pickle.load can execute arbitrary code embedded in the file, so
    # only load model files produced by this project.

    print("Getting validation auc score for Logistic Regression")
    lr_aucs = []
    for class_name in classes:
        filename = 'log_reg_' + class_name + '.sav'
        with open('../output/' + filename, 'rb') as model_file:
            model = pickle.load(model_file)

        val_auc = get_auc(model, x_val, data_val[class_name], class_name)
        lr_aucs.append(val_auc)

    print("Getting validation auc score for SVM")
    svm_aucs = []
    for class_name in classes:
        # The prefix must match the 'SVM_' used when the models are pickled;
        # a lowercase 'svm_' only resolves on case-insensitive filesystems
        filename = 'SVM_' + class_name + '.sav'
        with open('../output/' + filename, 'rb') as model_file:
            model = pickle.load(model_file)

        val_auc = get_auc(model, x_val, data_val[class_name], class_name)
        svm_aucs.append(val_auc)
            

Getting validation auc score for Logistic Regression
The validation auc for toxic is 0.9352673071265474
The validation auc for severe_toxic is 0.47568854131247873
The validation auc for obscene is 0.9275733840152728
The validation auc for threat is 0.8198482932996207
The validation auc for insult is 0.916567887370807
The validation auc for identity_hate is 0.8435272975150276
Getting validation auc score for SVM
The validation auc for toxic is 0.9328667026072881
The validation auc for severe_toxic is 0.8054233253995239
The validation auc for obscene is 0.9241321523986196
The validation auc for threat is 0.7508022950500827
The validation auc for insult is 0.917321729365525
The validation auc for identity_hate is 0.8360290016731735


In [26]:
# Append this run's mean validation AUCs to the shared results table.
# Requires `lr_aucs` and `svm_aucs` from the validation cell.
results_path = './model_results.csv'

# Build both new rows in one frame instead of concatenating row by row
new_rows = pd.DataFrame({
    'Model': ['Logistic Regression', 'SVM'],
    'val_auc_score': [np.mean(lr_aucs), np.mean(svm_aucs)],
})

try:
    # The saved index comes back as an 'Unnamed: 0' column; use it as the index again
    previous_results = pd.read_csv(results_path, index_col='Unnamed: 0')
    frames = [previous_results, new_rows]
except FileNotFoundError:
    # First run: there is no results file yet, so start the table with this run's rows
    frames = [new_rows]

# ignore_index renumbers the rows 0..n-1; without it every appended row would
# carry index label 0 and the saved index column would contain duplicates
model_results = pd.concat(frames, ignore_index=True)
model_results.to_csv(results_path)