# Set Up

In [5]:
import numpy as np
import pandas as pd
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve

In [6]:
# Target label columns; the modelling cell iterates over these names,
# one binary classifier per entry.
classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [24]:
# Stage switches: set a flag to True to execute the matching stage below.
# `vectorize` and `train` consume x_train_text, which only exists when
# `read_train` is enabled; `predict` consumes x_test_text from `read_test`.
read_train, read_test = False, False  # load the training / testing CSV
vectorize, train, predict = False, False, False  # fit TF-IDF / fit models / transform test data

## Read Data

In [27]:
# Read Training Data
# Loads the full training frame (kept as `data_train`) and pulls out the raw
# comment text as `x_train_text`. Skipped entirely when `read_train` is False,
# in which case neither name is defined by this cell.
if read_train:
    data_train = pd.read_csv('../data/train[1].csv')
    x_train_text = data_train['comment_text']

# Read Testing Data
# NOTE(review): '../' is a directory path, not a CSV file — this looks like an
# unfinished placeholder; TODO point it at the actual test CSV.
# NOTE(review): the text column is 'comment' here but 'comment_text' for the
# training data — confirm which column name the test file really uses.
if read_test:
    data_test = pd.read_csv('../')
    x_test_text = data_test['comment']

# Data Preprocessing

## Vectorize into n-grams

In [28]:
# File name (under ../output/) of the pickled TF-IDF vectorizer
filename = 'log_reg_vectorizer.sav'
if vectorize:
    # Create Word Vectorizer which builds sublinear-TF TF-IDF features over
    # word unigrams and bigrams, with accents stripped and English stop words
    # removed, keeping at most 10,000 features.
    word_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w+',
        stop_words='english',
        ngram_range=(1, 2),
        max_features=10000)

    # Fit Vectorizer on Training Data (x_train_text requires read_train=True)
    word_vectorizer.fit(x_train_text)

    # Pickle vectorizer; the context manager guarantees the file is flushed
    # and closed before it is re-opened for reading below.
    with open('../output/' + filename, 'wb') as vectorizer_file:
        pickle.dump(word_vectorizer, vectorizer_file)

# Read Vectorizer (always reloaded from disk so both paths use the same object)
# NOTE: pickle.load can execute arbitrary code; only load files you produced.
with open('../output/' + filename, 'rb') as vectorizer_file:
    word_vectorizer = pickle.load(vectorizer_file)

# Model

In [12]:
# Classifiers keyed by class name, populated by whichever branch runs below,
# so every per-class model stays available to later cells.
models = {}

if train:
    # Transform Training Data into the TF-IDF feature matrix
    x_train = word_vectorizer.transform(x_train_text)

    scores = []
    for class_name in classes:
        # Labels are the per-class columns of the training DataFrame
        train_label = data_train[class_name]
        classifier = LogisticRegression(C=0.1, solver='sag')

        # Get Cross validation Score (mean ROC AUC over 3 folds)
        cv_score = np.mean(cross_val_score(classifier, x_train, train_label, cv=3, scoring='roc_auc'))
        scores.append(cv_score)
        print(f"CV Score for {class_name} is {cv_score}")

        # Train Model on the full training set and Pickle it
        classifier.fit(x_train, train_label)
        filename = 'log_reg_' + class_name + '.sav'
        with open('../output/' + filename, 'wb') as model_file:
            pickle.dump(classifier, model_file)
        models[class_name] = classifier

    print(f"Total CV Score is {np.mean(scores)}")
else:
    # Reload previously pickled classifiers instead of retraining.
    # NOTE: pickle.load can execute arbitrary code; only load files you produced.
    for class_name in classes:
        filename = 'log_reg_' + class_name + '.sav'
        with open('../output/' + filename, 'rb') as model_file:
            # `model` is still assigned for compatibility; after the loop it
            # holds only the classifier for the last class.
            model = pickle.load(model_file)
        models[class_name] = model
    

CV Score for toxic is 0.9573956404770797
CV Score for severe_toxic is 0.9839211903778793
CV Score for obscene is 0.9801799624643728
CV Score for threat is 0.976784987827222
CV Score for insult is 0.9693397777962905
CV Score for identity_hate is 0.967292834497192
Total CV Score is 0.9724857322400061


# Generate Output

## Transform Test Data with Vectorizer

In [None]:
def get_predictions(model, x_test, class_name, y_true=None, threshold=0.5):
    """Write binary predictions for one class into the global `data_test`.

    Parameters
    ----------
    model : fitted classifier exposing `predict_proba`.
    x_test : feature matrix passed to `model.predict_proba`.
    class_name : name of the column created/overwritten in `data_test`.
    y_true : optional ground-truth labels for the rows of `x_test`
        (assumed binary and row-aligned — confirm at the call site). When
        given, the cut-off is chosen from the ROC curve as the threshold
        maximising `tpr - fpr`, and `threshold` is ignored.
    threshold : cut-off used when `y_true` is not supplied (default 0.5).

    Side effects
    ------------
    Assigns a 0/1 column named `class_name` on the module-level `data_test`
    DataFrame. Returns None.
    """
    # Probability of the positive class (second column of predict_proba)
    predictions = model.predict_proba(x_test)[:, 1]

    if y_true is not None:
        # roc_curve needs the true labels as its first argument, not the
        # feature matrix; an unlabelled test set cannot provide them, so this
        # step only runs when labels are passed in explicitly.
        fpr, tpr, thresholds = roc_curve(y_true, predictions)

        # Get Optimal Threshold (largest tpr - fpr)
        threshold = thresholds[np.argmax(tpr - fpr)]

    # Add Column with Predictions. ROC operating points correspond to
    # `score >= threshold`, so the comparison is inclusive.
    data_test[class_name] = np.where(predictions >= threshold, 1, 0)
    

In [None]:
if predict:
    # Vectorize the raw test comments with the loaded TF-IDF vectorizer.
    # x_test_text is only defined when the read_test stage has been run.
    x_test = word_vectorizer.transform(x_test_text)