In [9]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from scipy.special import logit, expit

In [12]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('../Datasets/rawData/train.csv').fillna(' ')
test = pd.read_csv('../Datasets/rawData/test.csv').fillna(' ')


In [14]:
from sklearn.utils import resample

# Separate majority and minority classes
dataset_majority = train[train.toxic == 0]
dataset_minority = train[train.toxic == 1]
 
# # Upsample minority class
# dataset_minority_upsampled = resample(dataset_minority, 
#                                  replace = True,  # sample with replacement
#                                  n_samples = 144277,   # to match majority class
#                                  random_state = 123) # reproducible results
 
# # Combine majority class with upsampled minority class
# dataset_upsampled = pd.concat([dataset_majority, dataset_minority_upsampled])
 
# # Display new class counts
# dataset_upsampled.toxic.value_counts()

# Downsample majority class
dataset_majority_downsampled = resample(dataset_majority, replace = True, n_samples = len(dataset_minority),
                                        random_state = 123)

# Combine two classes
dataset_ds = pd.concat([dataset_minority, dataset_majority_downsampled])

# Shuffle dataset
dataset_ds = dataset_ds.sample(frac=1).reset_index(drop=True)

In [27]:
train_text = dataset_ds
train_lables = train_text
train_text = train_text['comment_text']
test_text = test['comment_text']

In [28]:
train_text.head()

0    Hi. Yes, I'd noticed. It's always gratifing to...
1    Bodish peoples?\r\nAre there Bodish PEOPLES? I...
2                REDIRECT Talk:George H. D. Gossip/GA1
3                       It's joke? What the hell test?
4    November 2006 (UTC)\r\n\r\n Now listen here yo...
Name: comment_text, dtype: object

In [29]:
all_text = pd.concat([train_text, test_text])

In [30]:
# word vectorizer
word_vectorizer = CountVectorizer(stop_words = 'english',analyzer='word')
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

In [31]:
# character vectorizer
char_vectorizer = CountVectorizer(stop_words = 'english',analyzer='char')
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

In [32]:
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

In [33]:
losses = []
predictions = {'id': test['id']}
for class_name in class_names:
    train_target = train_lables[class_name]
    classifier = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features=1000, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

    cv_loss = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='f1_micro'))
    losses.append(cv_loss)
    print('CV score for class {} is {}'.format(class_name, cv_loss))

    classifier.fit(train_features, train_target)
    predictions[class_name] = expit(logit(classifier.predict_proba(test_features)[:, 1]))

print('Total CV score is {}'.format(np.mean(losses)))

CV score for class toxic is 0.7725578658297372
CV score for class severe_toxic is 0.9482149731126538
CV score for class obscene is 0.812507870514537
CV score for class threat is 0.9851902706943899
CV score for class insult is 0.7869425918660912
CV score for class identity_hate is 0.9571727504145446
Total CV score is 0.8770977204053256
