# Set Up

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [36]:
# Names of the label columns in the training data; each one is modelled separately.
classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

## Read Data

In [40]:
# Load the training CSV into a DataFrame, then keep a direct handle on the
# comment text column, which is the only model input used below.
x_data = pd.read_csv(filepath_or_buffer='./data/train[1].csv')
x_text = x_data['comment_text']

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Data Preprocessing

## Vectorize into n-grams

In [39]:
# Show the raw comment column (pandas elides the middle rows in the repr).
x_data.loc[:, 'comment_text']

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [41]:
# Word-level TF-IDF features over unigrams and bigrams of the comment text.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,        # scale term frequency as 1 + log(tf)
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w+',     # any run of word characters is a token (single characters included)
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000)       # cap the vocabulary at the 10,000 most frequent terms

# Learn the vocabulary / IDF weights and build the document-term matrix in a
# single pass; fit() followed by transform() on the same data would tokenize
# the whole corpus twice for the same result.
x_train = word_vectorizer.fit_transform(x_text)

# Model

In [42]:
# Cross-validate one logistic-regression model per label column and report the
# mean ROC AUC for each label, followed by the average across all labels.
scores = []
for class_name in classes:
    train_label = x_data[class_name]

    # 'sag' is a stochastic solver, so it is seeded here to make the reported
    # scores repeatable across kernel restarts. Scores can differ in the
    # trailing digits from runs made without a seed.
    classifier = LogisticRegression(C=0.1, solver='sag', random_state=42)

    # Average the ROC AUC over the 3 cross-validation folds for this label.
    cv_score = np.mean(cross_val_score(classifier, x_train, train_label, cv=3, scoring='roc_auc'))
    scores.append(cv_score)

    print(f"CV Score for {class_name} is {cv_score}")

print(f"Total CV Score is {np.mean(scores)}")
    

CV Score for toxic is 0.9573953644828087
CV Score for severe_toxic is 0.9839236949600361
CV Score for obscene is 0.9801797650788608
CV Score for threat is 0.9767842372081564
CV Score for insult is 0.9693402750473571
CV Score for identity_hate is 0.9672928885930184
Total CV Score is 0.9724860375617063
