# Set Up

In [3]:
import numpy as np
import pandas as pd
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve

In [6]:
# Target label columns: one binary classifier is trained per entry below
classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [1]:
# Pipeline stage switches: each later cell is guarded by one of these flags
vectorize = False       # refit the TF-IDF vectorizer and re-pickle it
cross_validate = False  # print the 3-fold ROC-AUC score for every class
train = True            # fit and pickle one classifier (plus threshold) per class
predict = False         # vectorize the test comments

# The data-loading switches are derived from the stages above, so a fresh
# "Restart Kernel -> Run All" never reaches a stage whose input text was not read.
read_train = vectorize or cross_validate or train
read_test = predict

## Read Data

In [4]:
# Read Training Data
if read_train:
    data_train = pd.read_csv('../data/train[1].csv')
    x_train_text = data_train['comment_text']

# Read Testing Data
if read_test:
    # NOTE(review): this previously pointed at '../', which is a directory and
    # cannot be parsed by read_csv. '../data/test.csv' is an assumed location
    # next to the training file -- TODO confirm the actual file name.
    data_test = pd.read_csv('../data/test.csv')
    # The vectorizer is fit on the 'comment_text' column of the training data;
    # this assumes the test file uses the same column name (it previously read
    # 'comment') -- TODO confirm against the test CSV header.
    x_test_text = data_test['comment_text']

# Data Preprocessing

## Vectorize into n-grams

In [28]:
filename = 'log_reg_vectorizer.sav'
if vectorize:
    # Create a word-level TF-IDF vectorizer over unigrams and bigrams,
    # with English stop words removed and the vocabulary capped by max_features
    word_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w+',
        stop_words='english',
        ngram_range=(1, 2),
        max_features=10000)

    # Fit Vectorizer on Training Data
    word_vectorizer.fit(x_train_text)

    # Pickle vectorizer; the context manager closes (and flushes) the file
    # handle even if pickling raises
    with open('../output/' + filename, 'wb') as vectorizer_file:
        pickle.dump(word_vectorizer, vectorizer_file)

# Read Vectorizer
# NOTE: pickle.load can execute arbitrary code, so only load files that were
# written by this notebook.
with open('../output/' + filename, 'rb') as vectorizer_file:
    word_vectorizer = pickle.load(vectorizer_file)

# Model

## Setup

In [None]:
# Transform Training Data for the cross-validation cell.
# Guarded so that a run with cross-validation disabled does not require the
# training text to be loaded; the training cell performs its own transform.
if cross_validate:
    x_train = word_vectorizer.transform(x_train_text)

## Cross Validate

In [37]:
if cross_validate:

    scores = []
    for class_name in classes:
        # Labels are read from the training frame loaded by the read cell
        # (`x_data` was never defined anywhere in this notebook).
        train_label = data_train[class_name]
        classifier = LogisticRegression(C=0.1, solver='sag')

        # Get Cross validation Score: mean ROC-AUC over 3 folds
        cv_score = np.mean(cross_val_score(classifier, x_train, train_label, cv=3, scoring='roc_auc'))
        scores.append(cv_score)
        print(f"CV Score for {class_name} is {cv_score}")

    # Unweighted mean of the per-class scores
    print(f"Total CV Score is {np.mean(scores)}")

CV Score for toxic is 0.9573954678103237
CV Score for severe_toxic is 0.9839209820637387
CV Score for obscene is 0.9801799483619518
CV Score for threat is 0.9767850278536501
CV Score for insult is 0.9693399811512836
CV Score for identity_hate is 0.9672932669757538
Total CV Score is 0.9724857790361171


## Train

In [42]:
def get_optimal_threshold(model, x_train, train_label):
    """Return the ROC threshold at which TPR - FPR is largest.

    Args:
        model: fitted classifier exposing ``predict_proba``; column 1 of its
            output is used as the score.
        x_train: feature matrix passed to ``model.predict_proba``.
        train_label: true binary labels passed to ``roc_curve``.

    Returns:
        The entry of the ``roc_curve`` thresholds array at the first index
        where ``tpr - fpr`` (Youden's J statistic) attains its maximum.
    """
    positive_scores = model.predict_proba(x_train)[:, 1]
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(train_label, positive_scores)

    # np.argmax keeps the first index on ties
    youden_j = true_pos_rate - false_pos_rate
    best_index = np.argmax(youden_j)
    return cutoffs[best_index]

In [43]:
if train:
    # Transform Training Data
    x_train = word_vectorizer.transform(x_train_text)

    for class_name in classes:
        # Labels are read from the training frame loaded by the read cell
        # (`x_data` was never defined anywhere in this notebook).
        train_label = data_train[class_name]
        classifier = LogisticRegression(C=0.1, solver='sag')

        # Train Model
        classifier.fit(x_train, train_label)

        # Get Optimal Threshold based on False Positive Rate and True Positive Rate.
        # Note that it is computed on the same data the model was fitted on.
        threshold = get_optimal_threshold(classifier, x_train, train_label)

        # Pickle the (classifier, threshold) pair; the context manager closes
        # (and flushes) the file handle even if pickling raises
        filename = 'log_reg_' + class_name + '.sav'
        with open('../output/' + filename, 'wb') as model_file:
            pickle.dump((classifier, threshold), model_file)
    

# Generate Output

## Transform Test Data with Vectorizer

In [None]:
def get_predictions(model, x_test, class_name, threshold=0.5):
    """Write 0/1 predictions for one class into the ``data_test`` frame.

    The threshold cannot be derived from the test data: ``roc_curve`` needs
    true labels, and only the feature matrix is available here. It is therefore
    taken as a parameter; the training cell pickles a threshold next to each
    classifier that can be passed in.

    Args:
        model: fitted classifier exposing ``predict_proba``; column 1 of its
            output is used as the score.
        x_test: feature matrix passed to ``model.predict_proba``.
        class_name: name of the column to create or overwrite in ``data_test``.
        threshold: scores strictly greater than this value are labelled 1,
            all others 0. Defaults to 0.5.

    Returns:
        None. The result is stored in the module-level ``data_test`` object.
    """
    predictions = model.predict_proba(x_test)[:, 1]

    # Add Column with Predictions
    data_test[class_name] = np.where(predictions > threshold, 1, 0)
    

In [None]:
if predict:
    # Vectorize the raw test comments with the loaded TF-IDF vectorizer.
    # x_test_text is only assigned by the read cell when read_test is enabled.
    x_test = word_vectorizer.transform(x_test_text)