In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import re 
import scipy
from scipy import sparse
from IPython.display import display
from pprint import pprint
from matplotlib import pyplot as plt 
import time
import scipy.optimize as optimize
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Ridge
import zipfile
import string
import nltk
import string
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
# Build the English stop-word set once. This has to run while `stopwords`
# still refers to the NLTK corpus: the `stopwords(input_text, stop_words)`
# helper defined further down in this notebook rebinds the same name.
_english_stop_word_list = stopwords.words("english")
stop_words = set(_english_stop_word_list)
# Single WordNet lemmatizer instance shared by the cleaning cells.
lemmatizer = WordNetLemmatizer()

Data collection

In [None]:
# Zipped training CSV of the toxic-comment classification challenge.
train_csv_zip_path = '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'

# Unpack every member of the archive into the current working directory;
# the context manager closes the archive afterwards.
with zipfile.ZipFile(train_csv_zip_path, mode='r') as archive:
    archive.extractall(path='./')


In [None]:
# Training CSV, expected in the current working directory.
train_csv_path = './train.csv'

# Files that live in the severity-rating input directory.
_severity_rating_dir = '../input/jigsaw-toxic-severity-rating'
sample_sub_path = f'{_severity_rating_dir}/sample_submission.csv'
comments_to_score_path = f'{_severity_rating_dir}/comments_to_score.csv'
val_path = f'{_severity_rating_dir}/validation_data.csv'

In [None]:
# Load the training comments and the comments to be scored. The path
# constants defined in the previous cell are used instead of repeating the
# string literals, so each location is configured in exactly one place.
df_train = pd.read_csv(train_csv_path)
df_sub = pd.read_csv(comments_to_score_path)

In [None]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

Data preprocessing

The cleaning steps below follow this guide: https://medium.com/analytics-vidhya/text-cleaning-in-natural-language-processing-nlp-bea2c27035a6

Data preprocessing must include the following steps:

Remove HTML tags and non-ASCII characters

Convert Text to Lowercase

Remove punctuation

Remove Stop words

Tokenization

Stemming vs Lemmatization


In [None]:
def clean_text(text):
    """Normalise a raw comment into lowercase, space-separated alphabetic words.

    Steps, in order: replace HTML-like tags with a space, delete ASCII
    punctuation (so "don't" becomes "dont"), replace every remaining
    non-letter character (digits, newlines, other symbols) with a space,
    lowercase, and collapse runs of whitespace.

    Parameters
    ----------
    text : str
        Raw comment text. Any non-string value (for example ``None`` or a
        float ``NaN`` standing in for a missing comment) is treated as an
        empty comment.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing alphabetic remains.
    """
    # re.sub / str.translate raise TypeError on non-string input, so missing
    # values are mapped to an empty comment instead of aborting the whole apply.
    if not isinstance(text, str):
        return ''
    # Replace HTML-like tags with a space so neighbouring words stay separated.
    text = re.sub('<.*?>', ' ', text)
    # Delete punctuation outright (no space is inserted in its place).
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Keep letters only: digits, newlines and all other characters become
    # spaces. Newlines are covered here, so no separate newline pass is needed.
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Lowercase, then collapse whitespace runs and trim both ends.
    return ' '.join(text.lower().split())

def stopwords(input_text, stop_words):
    """Remove stop words from a text.

    NOTE: this function shares its name with the ``stopwords`` corpus imported
    from ``nltk.corpus`` at the top of the notebook, so defining it rebinds
    that name; the ``stop_words`` set has to be built before this cell runs.

    Parameters
    ----------
    input_text : str
        Text to filter; it is split into tokens with ``word_tokenize``.
    stop_words : container of str
        Tokens to drop (membership is tested with ``in``).

    Returns
    -------
    str
        The remaining tokens, in their original order, joined by single spaces.
    """
    # Single pass over the tokens (the original filtered the token list twice
    # and discarded the first result).
    kept_tokens = [token for token in word_tokenize(input_text) if token not in stop_words]
    return ' '.join(kept_tokens)



In [None]:
# Words that are dropped from the cleaned text in addition to the stop words.
unrelevant_words = ['wiki','wikipedia','page']

# Clean steps 1, 2 and 3: normalise the text, then drop the irrelevant words.
# clean_text() returns a string, so it is split into words before filtering;
# iterating the string directly yields single characters, which never match
# an entry of `unrelevant_words`.
df_train['comment_text'] = df_train['comment_text'].apply(
    lambda x: ' '.join(w for w in clean_text(x).split() if w not in unrelevant_words)
)

# Clean step 4: remove stop words (the helper already returns a joined string).
df_train['comment_text'] = df_train['comment_text'].apply(lambda x: stopwords(x, stop_words))

# Clean step 5: lemmatize word by word. lemmatizer.lemmatize() takes a single
# word; handing it the whole comment treats the comment as one token.
df_train['comment_text'] = df_train['comment_text'].apply(
    lambda x: ' '.join(lemmatizer.lemmatize(w) for w in x.split())
)

df_train.head()

reference:

https://medium.com/analytics-vidhya/text-cleaning-in-natural-language-processing-nlp-bea2c27035a6

In [None]:
# Build a score that measures how toxic a comment is: each label column is
# scaled by a hand-picked weight and the weighted labels are averaged per row.
# NOTE(review): the label columns are overwritten in place, so running this
# cell a second time applies the weights again.
random_score = {
    'obscene': 0.20,
    'toxic': 0.40,
    'threat': 0.6,
    'insult': 0.65,
    'severe_toxic': 0.9,
    'identity_hate': 0.9,
}

for category, weight in random_score.items():
    df_train[category] = df_train[category] * weight

# Row-wise mean over the columns from 'toxic' through 'identity_hate'
# (label-based column slice, both ends included).
weighted_labels = df_train.loc[:, 'toxic':'identity_hate']
df_train['score'] = weighted_labels.mean(axis=1)
df_train['y'] = df_train['score']

# New frame: every comment with a positive score plus an equally sized
# random sample (fixed seed) of zero-score comments.
has_positive_score = df_train['y'] > 0
min_len = has_positive_score.sum()
df_non_tox = df_train[df_train['y'] == 0].sample(n=min_len, random_state=201)
df_train_new = pd.concat([df_train[has_positive_score], df_non_tox])
df_train_new.head()


In [None]:
# Count rows with a non-zero score and derive the number of remaining rows.
nonzero_score_mask = df_train['score'] != 0
n_samples_toxic = len(df_train[nonzero_score_mask])
n_samples_normal = len(df_train) - n_samples_toxic

# Keep the first n_samples_toxic // 5 zero-score rows (as they appear in the
# frame) and drop all later zero-score rows.
n_neutral_kept = n_samples_toxic // 5
zero_score_index = df_train[df_train['score'] == 0].index
idx_to_drop = zero_score_index[n_neutral_kept:]
df_train = df_train.drop(idx_to_drop)

print(f'Reduced number of neutral text samples from {n_samples_normal} to {n_samples_toxic//5}.')
print(f'Total number of training samples: {len(df_train)}')

In [None]:
# Distinct score values, sorted ascending, in a one-column frame.
unique_scores = pd.unique(df_train['score'].values)
df_tragets = (
    pd.DataFrame(unique_scores, columns=['target_value'])
    .sort_values(by='target_value', ascending=True)
    .reset_index(drop=True)
)

# Bin width: the 0.2 quantile of the distinct score values.
THRESHOLD = df_tragets['target_value'].quantile(q=0.2)


def _score_to_sentiment(x):
    """Return level 1-4 for the first cut-off THRESHOLD * level that x is below, else 5."""
    for level in (1, 2, 3, 4):
        if x < THRESHOLD * level:
            return level
    return 5


df_train['sentiment'] = df_train['score'].map(_score_to_sentiment)

# Keep only the text and its level, and renumber the rows from 0.
df_train = df_train[['comment_text','sentiment']].reset_index(drop=True)
df_train

In [None]:
# Fit a word-level TF-IDF vectorizer (English stop words removed) on the
# training comments and convert the resulting matrix to a dense array.
tf_idf_vect = TfidfVectorizer(stop_words='english', analyzer='word')
X = (
    tf_idf_vect
    .fit_transform(df_train['comment_text'])
    .toarray()
)
X

In [None]:
# Load the comments to be scored and clean them like the training comments.
df_test = pd.read_csv(comments_to_score_path)

# Clean steps 1, 2 and 3: normalise the text, then drop the irrelevant words.
# clean_text() returns a string, so it is split into words before filtering;
# iterating the string directly yields single characters, which never match
# an entry of `unrelevant_words`.
df_test['text'] = df_test['text'].apply(
    lambda x: ' '.join(w for w in clean_text(x).split() if w not in unrelevant_words)
)

# Clean step 4: remove stop words (the helper already returns a joined string).
df_test['text'] = df_test['text'].apply(lambda x: stopwords(x, stop_words))

# Clean step 5: lemmatize word by word. lemmatizer.lemmatize() takes a single
# word; handing it the whole comment treats the comment as one token.
df_test['text'] = df_test['text'].apply(
    lambda x: ' '.join(lemmatizer.lemmatize(w) for w in x.split())
)

df_test.head(3)

In [None]:
# Vectorise the test comments with the vectorizer that was already fitted on
# the training comments. Fitting a fresh TfidfVectorizer on the test set
# learns a different vocabulary, so the columns of Y would not line up with
# the columns of X and a model trained on X could not score Y.
Y = tf_idf_vect.transform(df_test['text']).toarray()
Y

In [None]:
# Training targets as a plain list, in row order. Series.tolist() does not
# rely on the index labels being 0..n-1 (the label lookup
# df_train['sentiment'][i] in a range loop does) and avoids a Python-level
# lookup per row.
score = df_train['sentiment'].tolist()

In [None]:
# Manual grid search over XGBoost hyper-parameters, scored by 5-fold CV MAE.
from numpy import arange
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Base parameters; the three tuned entries are overwritten for each grid point.
params = {
    # Parameters that we are going to tune.
    'max_depth': 6,
    'min_child_weight': 1,
    'eta': .3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    #'objective':'reg:linear',
}

# Every (max_depth, min_child_weight, eta) combination to evaluate.
# arange() with a float step yields values such as 0.30000000000000004, so
# each eta is rounded to one decimal and converted to a plain Python float.
gridsearch_params = [
    (max_depth, min_child_weight, round(float(eta), 1))
    for max_depth in range(9, 12)
    for min_child_weight in range(5, 8)
    for eta in arange(0.1, 1, 0.1)
]

dtrain = xgb.DMatrix(X, label=score)

# Best result seen so far.
min_mae = float("Inf")
best_params = None

for max_depth, min_child_weight, eta in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}, eta={}".format(
        max_depth, min_child_weight, eta))

    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    params['eta'] = eta

    # Run CV. num_boost_round is not passed, so xgb.cv uses its default.
    cv_results = xgb.cv(
        params,
        dtrain,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=5,
    )

    # Update best MAE
    test_mae = cv_results['test-mae-mean']
    mean_mae = test_mae.min()
    # argmin() is a 0-based row position; the row at position k holds the
    # result after k + 1 boosting rounds, hence the + 1.
    boost_rounds = test_mae.argmin() + 1
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))

    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth, min_child_weight, eta)
        print("Best params: {}, {}, {}, MAE: {}".format(best_params[0], best_params[1],best_params[2], min_mae))