# Importing Libraries

In [21]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_union
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from scipy.sparse import vstack



# Loading Data

In [7]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('train.csv').fillna(' ')
test = pd.read_csv('test.csv').fillna(' ')




# Data Preprocessing

In [8]:
# Remove Punctuation

def remove_punctuation(text):
    '''a function for removing punctuation'''
    import string
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

train['comment_text'] = train['comment_text'].apply(remove_punctuation)
test['comment_text'] = test['comment_text'].apply(remove_punctuation)

In [9]:
# Remove Stop Words

sw=stopwords.words('english')

def removesw(text):
    '''a function for removing the stopword'''
    # removing the stop words and lowercasing the selected words
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    # joining the list of words with space separator
    return " ".join(text)

train['comment_text'] = train['comment_text'].apply(removesw)
test['comment_text'] = test['comment_text'].apply(removesw)


In [10]:
# Applying Stemming

stemmer = SnowballStemmer("english")

def stemming(text):    
    '''a function which stems each word in the given text'''
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text) 
train['comment_text'] = train['comment_text'].apply(stemming)
test['comment_text'] = test['comment_text'].apply(stemming)

In [11]:
# Get preprocessed train and text data and concat them together

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])



In [12]:
# Generating word and character TF-ID Vectorizer

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=30000)
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 4),
    max_features=30000)
vectorizer = make_union(word_vectorizer, char_vectorizer, n_jobs=2)



# Feature Extraction

In [13]:
# Generating features form the vectorizer object

vectorizer.fit(all_text)
train_features = vectorizer.transform(train_text)
test_features = vectorizer.transform(test_text)



# Calculate score "liblinear"

In [22]:
# Calculating Scores

scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(solver='liblinear')

    cv_score = np.mean(cross_val_score(
        classifier, train_features, train_target, cv=3, scoring='accuracy'))
    scores.append(cv_score)
    print('Accuracy score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Accuracy score is {}'.format(np.mean(scores)))

Accuracy score for class toxic is 0.9599676650220644
Accuracy score for class severe_toxic is 0.9906123282075875
Accuracy score for class obscene is 0.9798584963751736
Accuracy score for class threat is 0.9971987395903144
Accuracy score for class insult is 0.9718119216134653
Accuracy score for class identity_hate is 0.9924046355158911
Accuracy score is 0.9819756310540827


# Calculate score "sag" with no penalty

In [24]:
# Calculating Scores

scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(solver='sag', penalty = "none")

    cv_score = np.mean(cross_val_score(
        classifier, train_features, train_target, cv=3, scoring='accuracy'))
    scores.append(cv_score)
    print('Accuracy score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Accuracy score is {}'.format(np.mean(scores)))



Accuracy score for class toxic is 0.9537509973254942




Accuracy score for class severe_toxic is 0.9890832308610801




Accuracy score for class obscene is 0.9775585787416569




Accuracy score for class threat is 0.9971298046802467




Accuracy score for class insult is 0.9664726048650043




Accuracy score for class identity_hate is 0.991715287829028
Accuracy score is 0.9792850840504183




# Calculate score "sag" with "l2" penalty

In [16]:
# Calculating Scores

scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(solver='sag', penalty = "l2")

    cv_score = np.mean(cross_val_score(
        classifier, train_features, train_target, cv=3, scoring='accuracy'))
    scores.append(cv_score)
    print('Accuracy score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Accuracy score is {}'.format(np.mean(scores)))

Accuracy score for class toxic is 0.9599551318090588
Accuracy score for class severe_toxic is 0.9905935276811727
Accuracy score for class obscene is 0.9798772970194062
Accuracy score for class threat is 0.9971924728659939
Accuracy score for class insult is 0.9718369889820182
Accuracy score for class identity_hate is 0.9924234360423059
Accuracy score is 0.9819798090666594


# Find max_iterations Hyperparameter

In [27]:
max_iterations = [8, 16, 32, 64, 128]

scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})

for i in max_iterations:
    
    for class_name in class_names:
        train_target = train[class_name]
        classifier = LogisticRegression(solver='sag', penalty = "l2", max_iter= i)

        cv_score = np.mean(cross_val_score(
            classifier, train_features, train_target, cv=3, scoring='accuracy'))
        scores.append(cv_score)
        print('Accuracy score for class {} is {}'.format(class_name, cv_score))

        classifier.fit(train_features, train_target)
        submission[class_name] = classifier.predict_proba(test_features)[:, 1]

    print('Accuracy score is {} for max iterations {}'.format(np.mean(scores), i))



Accuracy score for class toxic is 0.9599613975908375




Accuracy score for class severe_toxic is 0.9905997942876758




Accuracy score for class obscene is 0.9799148977187824




Accuracy score for class threat is 0.9971862060238555




Accuracy score for class insult is 0.9718933905612624




Accuracy score for class identity_hate is 0.9923921018316145




Accuracy score is 0.9819912980023381 for max iterations 8




Accuracy score for class toxic is 0.9599613986511971




Accuracy score for class severe_toxic is 0.9905935276811727




Accuracy score for class obscene is 0.9798835638615443




Accuracy score for class threat is 0.9971924728659939




Accuracy score for class insult is 0.9718432558241564




Accuracy score for class identity_hate is 0.9924171692001676




Accuracy score is 0.9819865980081884 for max iterations 16
Accuracy score for class toxic is 0.9599551318090588
Accuracy score for class severe_toxic is 0.9905935276811727
Accuracy score for class obscene is 0.9798772970194062
Accuracy score for class threat is 0.9971924728659939
Accuracy score for class insult is 0.9718369889820182
Accuracy score for class identity_hate is 0.9924234360423059
Accuracy score is 0.9819843350276787 for max iterations 32
Accuracy score for class toxic is 0.9599551318090588
Accuracy score for class severe_toxic is 0.9905935276811727
Accuracy score for class obscene is 0.9798772970194062
Accuracy score for class threat is 0.9971924728659939
Accuracy score for class insult is 0.9718307222576977
Accuracy score for class identity_hate is 0.9924234360423059
Accuracy score is 0.9819829424239105 for max iterations 64
Accuracy score for class toxic is 0.9599551318090588
Accuracy score for class severe_toxic is 0.9905935276811727
Accuracy score for class obscene is 

# Final Model

In [39]:
# Calculating Scores

training_scores = []
scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(solver='sag', penalty = "l2", max_iter = 8)

    cv_score = np.mean(cross_val_score(
        classifier, train_features, train_target, cv=3, scoring='accuracy'))
    scores.append(cv_score)
    classifier.fit(train_features, train_target)
    tr_score = np.mean(classifier.score(train_features, train_target))
    
    print('Training Accuracy score for class {} is {}'.format(class_name, cv_score))
    print('Validation Accuracy score for class {} is {}'.format(class_name, tr_score))
    scores.append(cv_score)
    training_scores.append(tr_score)
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('\nTraining Accuracy score is {}'.format(np.mean(scores)))    
print('Validation Accuracy score is {}'.format(np.mean(scores)))



Training Accuracy score for class toxic is 0.9599363317539149
Validation Accuracy score for class toxic is 0.9688539897600441




Training Accuracy score for class severe_toxic is 0.9905496601396583
Validation Accuracy score for class severe_toxic is 0.9919220911067801




Training Accuracy score for class obscene is 0.9799023640345058
Validation Accuracy score for class obscene is 0.9841888563711452




Training Accuracy score for class threat is 0.9971987395903144
Validation Accuracy score for class threat is 0.9974932788539271




Training Accuracy score for class insult is 0.9718181885734213
Validation Accuracy score for class insult is 0.9773643080509616




Training Accuracy score for class identity_hate is 0.9923921017137968
Validation Accuracy score for class identity_hate is 0.9934637246116149

Training Accuracy score is 0.9819662309676018
Validation Accuracy score is 0.9819662309676018


# Testing on Kaggle

In [29]:
submission.to_csv('submission.csv', index=False)