In [44]:
import pandas as pd
import numpy as np
import gc
import re
from tqdm import tqdm
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_union
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc, accuracy_score

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
lemmatizer = WordNetLemmatizer()

%matplotlib inline


In [2]:
dftrain = pd.read_csv('datasources/jigsaw1/train.csv')
dftest = pd.read_csv('datasources/jigsaw1/test.csv')

In [3]:
dftrain.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [4]:
dftest.columns

Index(['id', 'comment_text'], dtype='object')

In [5]:
dftrain.shape, dftest.shape

((159571, 8), (153164, 2))

In [6]:
def clean_comments(df):
    comments = []
    for cmt in tqdm(df['comment_text']):
        #suppression des espace et caractères non-alphabetic characters
        comment_text = re.sub("[^a-zA-Z]"," ", cmt)
        comment_text = re.sub(' +', ' ', comment_text.strip())

        comments.append(comment_text)

    return(comments)



In [41]:
sample1 = dftrain.sample(1)
print(sample1['comment_text'])

152567    "\n\nThanks for the heads up, it looks like th...
Name: comment_text, dtype: object


In [42]:
sample_clean1 = clean_comments(sample1)

100%|██████████| 1/1 [00:00<00:00, 1792.44it/s]


In [43]:
sample_clean1

['Thanks for the heads up it looks like the mass of the usual suspects have rushed in to create a faux consensus without regard to the facts The talking point of He is disagreeing with the author was just too sweet for them to resist using it People are gullible Try not to get too upset over it though']

In [10]:
#clean comments for both train and test set
train_comments = clean_comments(df_train)
test_comments = clean_comments(df_test)

100%|██████████| 159571/159571 [00:07<00:00, 21913.29it/s]
100%|██████████| 153164/153164 [00:06<00:00, 23347.16it/s]


In [11]:

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    stop_words= 'english',
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 4),
    max_features=40000)

char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 4),
    max_features=40000)
#TfidfVectorizer(max_features=40000, lowercase=True, analyzer='char', stop_words= 'english',ngram_range=(3,6),dtype=np.float32)



In [12]:
vectorizer = make_union(word_vectorizer, char_vectorizer, n_jobs=3)

In [13]:
train_comments = dftrain['comment_text']
test_comments = dftest['comment_text']

vectorizer.fit(train_comments)

train_features = vectorizer.transform(train_comments)
test_features = vectorizer.transform(test_comments)

In [14]:
scores = []
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
submission = pd.DataFrame.from_dict({'id': dftest['id']})

for class_name in class_names:
    train_target = dftrain[class_name]
    classifier = LogisticRegression(C=1, solver='sag')

    cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=5, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

submission.to_csv('submission1.csv', index=False)

CV score for class toxic is 0.9796866512709563
CV score for class severe_toxic is 0.9887753512109676
CV score for class obscene is 0.9910432329765682
CV score for class threat is 0.990147725384898
CV score for class insult is 0.9834316469666746
CV score for class identity_hate is 0.9837513141375329
Total CV score is 0.9861393203245995


In [18]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [17]:
from sklearn.model_selection import train_test_split

scores = []
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
submission = pd.DataFrame.from_dict({'id': dftest['id']})

for class_name in class_names:
    train_target = dftrain[class_name]
    X_train, X_val, y_train, y_val = train_test_split(train_features, train_target, train_size = 0.75)
    classifier = LogisticRegression(C=1, solver='sag')
    cv_score = np.mean(cross_val_score(classifier, X_train, y_train, cv=5, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
    
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

submission.to_csv('submission2.csv', index=False)



CV score for class toxic is 0.97845483273562




CV score for class severe_toxic is 0.9888439507999822




CV score for class obscene is 0.9903317304017488




CV score for class threat is 0.9895636818644343




CV score for class insult is 0.9821356827110733




CV score for class identity_hate is 0.982305211687143
Total CV score is 0.9852725150333335


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

scores = []
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
submission = pd.DataFrame.from_dict({'id': dftest['id']})

for class_name in class_names:
    train_target = dftrain[class_name]
    X_train, X_val, y_train, y_val = train_test_split(train_features, train_target, train_size = 0.75)
    classifier = CalibratedClassifierCV(base_estimator=LinearSVC(penalty='l2', dual=False), cv=5)
    cv_score = np.mean(cross_val_score(classifier, X_train, y_train, cv=5, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
    submission[class_name] = classifier.predict(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

submission.to_csv('submission3.csv', index=False)



CV score for class toxic is 0.9742409788206803


IndexError: too many indices for array