In [1]:
#geral
import pandas as pd
import numpy as np

#sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

#visual
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('data/all_data.csv')

In [3]:
df.dropna(subset=['comment_text', 'toxicity'], inplace=True)
df.dropna(axis='columns', inplace=True)
print(df.shape)

(1999515, 21)


In [4]:
# remove duplicados
df.drop_duplicates(subset=['comment_text',], keep='first', inplace=True)
df.shape # 1999515 - 1971915 = 27600 duplicados

(1971915, 21)

In [5]:
# remove os caracteres \xad que aparecem em alguns spams
df.comment_text = df.comment_text.replace('\xad', '', regex=True) 

In [6]:
# expand contractions
import contractions
df.comment_text = df.comment_text.apply(contractions.fix)

In [7]:
df['label'] = np.where(df['toxicity'] >= 0.5, 1, 0)

In [8]:
from datasets import Dataset, DatasetDict
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split


X = df.comment_text.to_numpy().reshape(-1,1)
y = np.where(df[['toxicity']] >= 0.5, 1, 0).reshape(-1,1)

under_sampler = RandomUnderSampler(random_state=0)
X, y = under_sampler.fit_resample(X, y)

raw_datasets = Dataset.from_dict({
    'comment_text': X.ravel(),
    'label': y,
}).train_test_split(train_size=0.9, test_size=0.1)

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'label'],
        num_rows: 283813
    })
    test: Dataset({
        features: ['comment_text', 'label'],
        num_rows: 31535
    })
})

In [25]:
# 1-grams
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline

nltk.download('stopwords')

tknzr = TweetTokenizer(preserve_case=False, reduce_len=False)
vectorizer = TfidfVectorizer(ngram_range=(1,1), tokenizer=tknzr.tokenize, strip_accents='unicode', lowercase=True, stop_words=stopwords.words('english'))

pipeline = Pipeline([
    ('vectorizer', vectorizer),
])
features = pipeline.fit_transform(raw_datasets['train']['comment_text'], raw_datasets['train']['label'])
feature_names = pipeline['vectorizer'].get_feature_names_out()


[nltk_data] Downloading package stopwords to /home/rafael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
feature_names[1:10]

array(['\x13', '!', '"', '#', '###ing', '###konki', '###off', '##ed',
       '##hole'], dtype=object)

In [27]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(features, raw_datasets['train']['label'])

MultinomialNB()

In [53]:
from datasets import load_metric
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

preds = clf.predict(pipeline.transform(raw_datasets["test"]["comment_text"]))
print(preds.shape)

metric = load_metric("glue", "mrpc")
metric.compute(predictions=preds, references=raw_datasets["test"]["label"])

(31535,)


{'accuracy': 0.7905501823370857, 'f1': 0.802782837180138}

In [54]:
confusion_matrix(raw_datasets["test"]["label"], preds)

array([[11487,  4200],
       [ 2405, 13443]])

In [15]:
from pprint import pp

pp(classification_report(raw_datasets["test"]["label"], preds, output_dict=True))

{'0': {'precision': 0.8268787791534696,
       'recall': 0.7322623828647925,
       'f1-score': 0.776699685587748,
       'support': 15687},
 '1': {'precision': 0.7619452474069036,
       'recall': 0.8482458354366481,
       'f1-score': 0.802782837180138,
       'support': 15848},
 'accuracy': 0.7905501823370857,
 'macro avg': {'precision': 0.7944120132801866,
               'recall': 0.7902541091507203,
               'f1-score': 0.789741261383943,
               'support': 31535},
 'weighted avg': {'precision': 0.7942462562069158,
                  'recall': 0.7905501823370857,
                  'f1-score': 0.7898078443458325,
                  'support': 31535}}


In [16]:
# Ngrams 1-4

vectorizer = TfidfVectorizer(ngram_range=(1,4), tokenizer=tknzr.tokenize, strip_accents='unicode', lowercase=True, stop_words=stopwords.words('english'))

pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('clf', MultinomialNB())
])

pipeline.fit(raw_datasets['train']['comment_text'], raw_datasets['train']['label'])


preds = pipeline.predict(raw_datasets["test"]["comment_text"])

metric = load_metric("glue", "mrpc")
pp(metric.compute(predictions=preds, references=raw_datasets["test"]["label"]))
pp(confusion_matrix(raw_datasets["test"]["label"], preds))
pp(classification_report(raw_datasets["test"]["label"], preds, output_dict=True))




{'accuracy': 0.7995877596321548, 'f1': 0.808310585380649}
array([[11890,  3797],
       [ 2523, 13325]])
{'0': {'precision': 0.8249496981891348,
       'recall': 0.7579524446994327,
       'f1-score': 0.7900332225913621,
       'support': 15687},
 '1': {'precision': 0.77823852353697,
       'recall': 0.8408001009591115,
       'f1-score': 0.808310585380649,
       'support': 15848},
 'accuracy': 0.7995877596321548,
 'macro avg': {'precision': 0.8015941108630524,
               'recall': 0.7993762728292721,
               'f1-score': 0.7991719039860056,
               'support': 31535},
 'weighted avg': {'precision': 0.8014748703506218,
                  'recall': 0.7995877596321548,
                  'f1-score': 0.7992185609609395,
                  'support': 31535}}


In [17]:
# Removing non alpha characters
from sklearn.preprocessing import FunctionTransformer


def remove_non_alpha(vec):
    for x in vec:
        x = x.split()
        yield ' '.join([s for s in x if s.isalpha()])

pipeline = Pipeline([
    ('remove_non_alpha', FunctionTransformer(remove_non_alpha)),
    ('vectorizer', vectorizer),
    ('clf', MultinomialNB())
])

pipeline.fit(raw_datasets['train']['comment_text'], raw_datasets['train']['label'])


preds = pipeline.predict(raw_datasets["test"]["comment_text"])

metric = load_metric("glue", "mrpc")
pp(metric.compute(predictions=preds, references=raw_datasets["test"]["label"]))
pp(confusion_matrix(raw_datasets["test"]["label"], preds))
pp(classification_report(raw_datasets["test"]["label"], preds, output_dict=True))

{'accuracy': 0.7581734580624703, 'f1': 0.7590673575129534}
array([[11896,  3791],
       [ 3835, 12013]])
{'0': {'precision': 0.7562138452736634,
       'recall': 0.7583349270096258,
       'f1-score': 0.757272900884843,
       'support': 15687},
 '1': {'precision': 0.7601240192356365,
       'recall': 0.7580136294800606,
       'f1-score': 0.7590673575129534,
       'support': 15848},
 'accuracy': 0.7581734580624703,
 'macro avg': {'precision': 0.75816893225465,
               'recall': 0.7581742782448432,
               'f1-score': 0.7581701291988983,
               'support': 31535},
 'weighted avg': {'precision': 0.7581789138308016,
                  'recall': 0.7581734580624703,
                  'f1-score': 0.7581747099427879,
                  'support': 31535}}


In [18]:
# Feature Selection
from sklearn.feature_selection import SelectKBest


pipeline = Pipeline([
    ('remove_non_alpha', FunctionTransformer(remove_non_alpha)),
    ('vectorizer', vectorizer),
    ('kbest', SelectKBest(k=2500)),
    ('clf', MultinomialNB())
])

pipeline.fit(raw_datasets['train']['comment_text'], raw_datasets['train']['label'])


preds = pipeline.predict(raw_datasets["test"]["comment_text"])

metric = load_metric("glue", "mrpc")
pp(metric.compute(predictions=preds, references=raw_datasets["test"]["label"]))
pp(confusion_matrix(raw_datasets["test"]["label"], preds))
pp(classification_report(raw_datasets["test"]["label"], preds, output_dict=True))

{'accuracy': 0.7591564927857936, 'f1': 0.7483849594169291}
array([[12645,  3042],
       [ 4553, 11295]])
{'0': {'precision': 0.7352599139434818,
       'recall': 0.8060814687320711,
       'f1-score': 0.7690436369165273,
       'support': 15687},
 '1': {'precision': 0.7878217200251099,
       'recall': 0.7127082281675922,
       'f1-score': 0.7483849594169291,
       'support': 15848},
 'accuracy': 0.7591564927857936,
 'macro avg': {'precision': 0.7615408169842959,
               'recall': 0.7593948484498316,
               'f1-score': 0.7587142981667282,
               'support': 31535},
 'weighted avg': {'precision': 0.7616749925159138,
                  'recall': 0.7591564927857936,
                  'f1-score': 0.7586615623640101,
                  'support': 31535}}


In [19]:
# DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

vectorizer = TfidfVectorizer(ngram_range=(1,1), tokenizer=tknzr.tokenize, strip_accents='unicode', lowercase=True, stop_words=stopwords.words('english'))


pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('clf', DecisionTreeClassifier())
])

pipeline.fit(raw_datasets['train']['comment_text'], raw_datasets['train']['label'])


preds = pipeline.predict(raw_datasets["test"]["comment_text"])

metric = load_metric("glue", "mrpc")
pp(metric.compute(predictions=preds, references=raw_datasets["test"]["label"]))
pp(confusion_matrix(raw_datasets["test"]["label"], preds))
pp(classification_report(raw_datasets["test"]["label"], preds, output_dict=True))



{'accuracy': 0.8101157444109719, 'f1': 0.8109848484848485}
array([[12701,  2986],
       [ 3002, 12846]])
{'0': {'precision': 0.8088263389161306,
       'recall': 0.8096513036272073,
       'f1-score': 0.8092386110226185,
       'support': 15687},
 '1': {'precision': 0.8113946437594745,
       'recall': 0.810575466935891,
       'f1-score': 0.8109848484848485,
       'support': 15848},
 'accuracy': 0.8101157444109719,
 'macro avg': {'precision': 0.8101104913378026,
               'recall': 0.8101133852815492,
               'f1-score': 0.8101117297537335,
               'support': 31535},
 'weighted avg': {'precision': 0.8101170474988899,
                  'recall': 0.8101157444109719,
                  'f1-score': 0.8101161874076326,
                  'support': 31535}}


In [55]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('clf', LogisticRegression(max_iter=1000))
])

pipeline.fit(raw_datasets['train']['comment_text'], raw_datasets['train']['label'])


preds = pipeline.predict(raw_datasets["test"]["comment_text"])

metric = load_metric("glue", "mrpc")
pp(metric.compute(predictions=preds, references=raw_datasets["test"]["label"]))
pp(confusion_matrix(raw_datasets["test"]["label"], preds))
pp(classification_report(raw_datasets["test"]["label"], preds, output_dict=True))



{'accuracy': 0.8680196606944665, 'f1': 0.8642530984996738}
array([[14124,  1563],
       [ 2599, 13249]])
{'0': {'precision': 0.8445853016803205,
       'recall': 0.9003633581946835,
       'f1-score': 0.8715828448009874,
       'support': 15687},
 '1': {'precision': 0.894477450715636,
       'recall': 0.8360045431600202,
       'f1-score': 0.8642530984996738,
       'support': 15848},
 'accuracy': 0.8680196606944665,
 'macro avg': {'precision': 0.8695313761979783,
               'recall': 0.8681839506773519,
               'f1-score': 0.8679179716503306,
               'support': 31535},
 'weighted avg': {'precision': 0.8696587368447943,
                  'recall': 0.8680196606944665,
                  'f1-score': 0.8678992608662096,
                  'support': 31535}}


In [56]:
# LogisticRegression com Feature Selection
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('kbest', SelectKBest(k=1000)),
    ('clf', LogisticRegression(max_iter=1000))
])

pipeline.fit(raw_datasets['train']['comment_text'], raw_datasets['train']['label'])


preds = pipeline.predict(raw_datasets["test"]["comment_text"])

metric = load_metric("glue", "mrpc")
pp(metric.compute(predictions=preds, references=raw_datasets["test"]["label"]))
pp(confusion_matrix(raw_datasets["test"]["label"], preds))
pp(classification_report(raw_datasets["test"]["label"], preds, output_dict=True))



{'accuracy': 0.8561281116220073, 'f1': 0.8501700736435389}
array([[14126,  1561],
       [ 2976, 12872]])
{'0': {'precision': 0.8259852648813004,
       'recall': 0.9004908522980812,
       'f1-score': 0.861630424837598,
       'support': 15687},
 '1': {'precision': 0.8918450772535162,
       'recall': 0.812216052498738,
       'f1-score': 0.8501700736435389,
       'support': 15848},
 'accuracy': 0.8561281116220073,
 'macro avg': {'precision': 0.8589151710674083,
               'recall': 0.8563534523984095,
               'f1-score': 0.8559002492405685,
               'support': 31535},
 'weighted avg': {'precision': 0.8590832926750178,
                  'recall': 0.8561281116220073,
                  'f1-score': 0.8558709941820265,
                  'support': 31535}}
