In [15]:
#geral
import pandas as pd
import numpy as np

#sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

#visual
import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pp

In [2]:
df = pd.read_csv('data/all_data.csv')

In [3]:
df.dropna(subset=['comment_text', 'toxicity'], inplace=True)
df.dropna(axis='columns', inplace=True)
print(df.shape)

(1999515, 21)


In [4]:
# remove duplicados
df.drop_duplicates(subset=['comment_text',], keep='first', inplace=True)
df.shape # 1999515 - 1971915 = 27600 duplicados

(1971915, 21)

In [5]:
# remove os caracteres \xad que aparecem em alguns spams
df.comment_text = df.comment_text.replace('\xad', '', regex=True) 

In [6]:
# expand contractions
import contractions
df.comment_text = df.comment_text.apply(contractions.fix)

In [7]:
df['label'] = np.where(df['toxicity'] >= 0.5, 1, 0)

In [8]:
from datasets import Dataset, DatasetDict
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split


X = df.comment_text.to_numpy().reshape(-1,1)
y = np.where(df[['toxicity']] >= 0.5, 1, 0).reshape(-1,1)

under_sampler = RandomUnderSampler(random_state=0)
X, y = under_sampler.fit_resample(X, y)

raw_datasets = Dataset.from_dict({
    'comment_text': X.ravel(),
    'label': y,
}).train_test_split(train_size=0.9, test_size=0.1)

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'label'],
        num_rows: 283813
    })
    test: Dataset({
        features: ['comment_text', 'label'],
        num_rows: 31535
    })
})

In [10]:
# 1-grams
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline

nltk.download('stopwords')

tknzr = TweetTokenizer(preserve_case=False, reduce_len=False)
vectorizer = TfidfVectorizer(ngram_range=(1,1), tokenizer=tknzr.tokenize, strip_accents='unicode', lowercase=True, stop_words=stopwords.words('english'))

pipeline = Pipeline([
    ('vectorizer', vectorizer),
])
features = pipeline.fit_transform(raw_datasets['train']['comment_text'], raw_datasets['train']['label'])
feature_names = pipeline['vectorizer'].get_feature_names_out()


[nltk_data] Downloading package stopwords to /home/rafael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
feature_names[1:10]

array(['\x13', '!', '"', '#', '###ing', '###konki', '###off', '##ed',
       '##hole'], dtype=object)

In [12]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(features, raw_datasets['train']['label'])

MultinomialNB()

In [13]:
from datasets import load_metric
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

preds = clf.predict(pipeline.transform(raw_datasets["test"]["comment_text"]))
print(preds.shape)

metric = load_metric("glue", "mrpc")
metric.compute(predictions=preds, references=raw_datasets["test"]["label"])



(31535,)


{'accuracy': 0.7905501823370857, 'f1': 0.802782837180138}

In [13]:
confusion_matrix(raw_datasets["test"]["label"], preds)

array([[11487,  4200],
       [ 2405, 13443]])

In [14]:
from pprint import pp

pp(classification_report(raw_datasets["test"]["label"], preds, output_dict=True))

{'0': {'precision': 0.8268787791534696,
       'recall': 0.7322623828647925,
       'f1-score': 0.776699685587748,
       'support': 15687},
 '1': {'precision': 0.7619452474069036,
       'recall': 0.8482458354366481,
       'f1-score': 0.802782837180138,
       'support': 15848},
 'accuracy': 0.7905501823370857,
 'macro avg': {'precision': 0.7944120132801866,
               'recall': 0.7902541091507203,
               'f1-score': 0.789741261383943,
               'support': 31535},
 'weighted avg': {'precision': 0.7942462562069158,
                  'recall': 0.7905501823370857,
                  'f1-score': 0.7898078443458325,
                  'support': 31535}}


In [25]:
# Ngrams 1-4 - kbest
from sklearn.feature_selection import SelectKBest


vectorizer = TfidfVectorizer(ngram_range=(1,4), tokenizer=tknzr.tokenize, strip_accents='unicode', lowercase=True, stop_words=stopwords.words('english'))

pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('kbest', SelectKBest(k=1000)),
    ('clf', MultinomialNB())
])

pipeline.fit(raw_datasets['train']['comment_text'], raw_datasets['train']['label'])


preds = pipeline.predict(raw_datasets["test"]["comment_text"])

metric = load_metric("glue", "mrpc")
pp(metric.compute(predictions=preds, references=raw_datasets["test"]["label"]))
pp(confusion_matrix(raw_datasets["test"]["label"], preds))
pp(classification_report(raw_datasets["test"]["label"], preds, output_dict=True))




{'accuracy': 0.8103060091961313, 'f1': 0.8052607591640081}
array([[13185,  2502],
       [ 3480, 12368]])
{'0': {'precision': 0.7911791179117912,
       'recall': 0.840504876649455,
       'f1-score': 0.8150964391691394,
       'support': 15687},
 '1': {'precision': 0.8317417619367855,
       'recall': 0.7804139323573952,
       'f1-score': 0.8052607591640081,
       'support': 15848},
 'accuracy': 0.8103060091961313,
 'macro avg': {'precision': 0.8114604399242884,
               'recall': 0.810459404503425,
               'f1-score': 0.8101785991665738,
               'support': 31535},
 'weighted avg': {'precision': 0.8115639849645297,
                  'recall': 0.8103060091961313,
                  'f1-score': 0.8101534914373708,
                  'support': 31535}}


In [21]:
# função que remove caracteres não alfabéticos
from sklearn.preprocessing import FunctionTransformer

def remove_non_alpha(vec):
    for x in vec:
        x = x.split()
        yield ' '.join([s for s in x if s.isalpha()])

In [13]:
# Naive Bayes - 1-grams - Removing non alpha characters
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import FunctionTransformer
from datasets import load_metric
from sklearn.metrics import confusion_matrix, classification_report
from pprint import pp

        
vectorizer = TfidfVectorizer(ngram_range=(1,1), tokenizer=tknzr.tokenize, strip_accents='unicode', lowercase=True, stop_words=stopwords.words('english'))
        
        
pipeline = Pipeline([
    ('remove_non_alpha', FunctionTransformer(remove_non_alpha)),
    ('vectorizer', vectorizer),
    ('clf', MultinomialNB())
])

pipeline.fit(raw_datasets['train']['comment_text'], raw_datasets['train']['label'])


preds = pipeline.predict(raw_datasets["test"]["comment_text"])

metric = load_metric("glue", "mrpc")
pp(metric.compute(predictions=preds, references=raw_datasets["test"]["label"]))
pp(confusion_matrix(raw_datasets["test"]["label"], preds))
pp(classification_report(raw_datasets["test"]["label"], preds, output_dict=True))

{'accuracy': 0.7546218487394958, 'f1': 0.7620102109860367}
array([[11409,  4278],
       [ 3460, 12388]])
{'0': {'precision': 0.7673010962405004,
       'recall': 0.7272901128322815,
       'f1-score': 0.7467600471265872,
       'support': 15687},
 '1': {'precision': 0.7433097323892955,
       'recall': 0.781675921251893,
       'f1-score': 0.7620102109860367,
       'support': 15848},
 'accuracy': 0.7546218487394958,
 'macro avg': {'precision': 0.755305414314898,
               'recall': 0.7544830170420873,
               'f1-score': 0.7543851290563119,
               'support': 31535},
 'weighted avg': {'precision': 0.7552441710997395,
                  'recall': 0.7546218487394958,
                  'f1-score': 0.7544240584424127,
                  'support': 31535}}


In [19]:
# DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

vectorizer = TfidfVectorizer(ngram_range=(1,1), tokenizer=tknzr.tokenize, strip_accents='unicode', lowercase=True, stop_words=stopwords.words('english'))


pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('clf', DecisionTreeClassifier())
])

pipeline.fit(raw_datasets['train']['comment_text'], raw_datasets['train']['label'])


preds = pipeline.predict(raw_datasets["test"]["comment_text"])

metric = load_metric("glue", "mrpc")
pp(metric.compute(predictions=preds, references=raw_datasets["test"]["label"]))
pp(confusion_matrix(raw_datasets["test"]["label"], preds))
pp(classification_report(raw_datasets["test"]["label"], preds, output_dict=True))



{'accuracy': 0.8101157444109719, 'f1': 0.8109848484848485}
array([[12701,  2986],
       [ 3002, 12846]])
{'0': {'precision': 0.8088263389161306,
       'recall': 0.8096513036272073,
       'f1-score': 0.8092386110226185,
       'support': 15687},
 '1': {'precision': 0.8113946437594745,
       'recall': 0.810575466935891,
       'f1-score': 0.8109848484848485,
       'support': 15848},
 'accuracy': 0.8101157444109719,
 'macro avg': {'precision': 0.8101104913378026,
               'recall': 0.8101133852815492,
               'f1-score': 0.8101117297537335,
               'support': 31535},
 'weighted avg': {'precision': 0.8101170474988899,
                  'recall': 0.8101157444109719,
                  'f1-score': 0.8101161874076326,
                  'support': 31535}}


In [22]:
# DecisionTreeClassifier - remove_non_alpha
from sklearn.tree import DecisionTreeClassifier

vectorizer = TfidfVectorizer(ngram_range=(1,1), tokenizer=tknzr.tokenize, strip_accents='unicode', lowercase=True, stop_words=stopwords.words('english'))


pipeline = Pipeline([
    ('remove_non_alpha', FunctionTransformer(remove_non_alpha)),
    ('vectorizer', vectorizer),
    ('clf', DecisionTreeClassifier())
])

pipeline.fit(raw_datasets['train']['comment_text'], raw_datasets['train']['label'])


preds = pipeline.predict(raw_datasets["test"]["comment_text"])

metric = load_metric("glue", "mrpc")
pp(metric.compute(predictions=preds, references=raw_datasets["test"]["label"]))
pp(confusion_matrix(raw_datasets["test"]["label"], preds))
pp(classification_report(raw_datasets["test"]["label"], preds, output_dict=True))

{'accuracy': 0.7054701125733313, 'f1': 0.7016957862281603}
array([[11323,  4364],
       [ 4924, 10924]])
{'0': {'precision': 0.6969286637533083,
       'recall': 0.7218078663861797,
       'f1-score': 0.7091501221268868,
       'support': 15687},
 '1': {'precision': 0.7145473574045003,
       'recall': 0.6892983341746592,
       'f1-score': 0.7016957862281603,
       'support': 15848},
 'accuracy': 0.7054701125733313,
 'macro avg': {'precision': 0.7057380105789043,
               'recall': 0.7055531002804194,
               'f1-score': 0.7054229541775235,
               'support': 31535},
 'weighted avg': {'precision': 0.7057829861564822,
                  'recall': 0.7054701125733313,
                  'f1-score': 0.7054039253511449,
                  'support': 31535}}


In [17]:
# DecisionTreeClassifier 1-4-grams - kbest
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest


vectorizer = TfidfVectorizer(ngram_range=(1,4), tokenizer=tknzr.tokenize, strip_accents='unicode', lowercase=True, stop_words=stopwords.words('english'))


pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('kbest', SelectKBest(k=1000)),
    ('clf', DecisionTreeClassifier())
])

pipeline.fit(raw_datasets['train']['comment_text'], raw_datasets['train']['label'])


preds = pipeline.predict(raw_datasets["test"]["comment_text"])

metric = load_metric("glue", "mrpc")
pp(metric.compute(predictions=preds, references=raw_datasets["test"]["label"]))
pp(confusion_matrix(raw_datasets["test"]["label"], preds))
pp(classification_report(raw_datasets["test"]["label"], preds, output_dict=True))



{'accuracy': 0.7419375297288727, 'f1': 0.7501381639545593}
array([[11181,  4506],
       [ 3632, 12216]])
{'0': {'precision': 0.7548099642206171,
       'recall': 0.7127557850449416,
       'f1-score': 0.7331803278688526,
       'support': 15687},
 '1': {'precision': 0.7305346250448511,
       'recall': 0.7708228167592125,
       'f1-score': 0.7501381639545593,
       'support': 15848},
 'accuracy': 0.7419375297288727,
 'macro avg': {'precision': 0.7426722946327341,
               'recall': 0.7417893009020771,
               'f1-score': 0.741659245911706,
               'support': 31535},
 'weighted avg': {'precision': 0.742610326508312,
                  'recall': 0.7419375297288727,
                  'f1-score': 0.7417025345054875,
                  'support': 31535}}


In [16]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression


vectorizer = TfidfVectorizer(ngram_range=(1,1), tokenizer=tknzr.tokenize, strip_accents='unicode', lowercase=True, stop_words=stopwords.words('english'))

pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('clf', LogisticRegression(max_iter=1000))
])

pipeline.fit(raw_datasets['train']['comment_text'], raw_datasets['train']['label'])


preds = pipeline.predict(raw_datasets["test"]["comment_text"])

metric = load_metric("glue", "mrpc")
pp(metric.compute(predictions=preds, references=raw_datasets["test"]["label"]))
pp(confusion_matrix(raw_datasets["test"]["label"], preds))
pp(classification_report(raw_datasets["test"]["label"], preds, output_dict=True))



{'accuracy': 0.8680196606944665, 'f1': 0.8642530984996738}
array([[14124,  1563],
       [ 2599, 13249]])
{'0': {'precision': 0.8445853016803205,
       'recall': 0.9003633581946835,
       'f1-score': 0.8715828448009874,
       'support': 15687},
 '1': {'precision': 0.894477450715636,
       'recall': 0.8360045431600202,
       'f1-score': 0.8642530984996738,
       'support': 15848},
 'accuracy': 0.8680196606944665,
 'macro avg': {'precision': 0.8695313761979783,
               'recall': 0.8681839506773519,
               'f1-score': 0.8679179716503306,
               'support': 31535},
 'weighted avg': {'precision': 0.8696587368447943,
                  'recall': 0.8680196606944665,
                  'f1-score': 0.8678992608662096,
                  'support': 31535}}


In [19]:
# Dataset Desbalanceado - Logistic

test_df = df[df['split'] == 'test']

preds = pipeline.predict(test_df['comment_text'])

pp(confusion_matrix(test_df["label"], preds))
pp(classification_report(test_df["label"], preds, output_dict=True))



array([[159348,  17196],
       [  2163,  13080]])
{'0': {'precision': 0.9866077233129632,
       'recall': 0.9025965198477434,
       'f1-score': 0.9427341704752186,
       'support': 176544},
 '1': {'precision': 0.43202536662703134,
       'recall': 0.8580987994489274,
       'f1-score': 0.5747050682132736,
       'support': 15243},
 'accuracy': 0.8990598945705391,
 'macro avg': {'precision': 0.7093165449699973,
               'recall': 0.8803476596483354,
               'f1-score': 0.758719619344246,
               'support': 191787},
 'weighted avg': {'precision': 0.9425301848824978,
                  'recall': 0.8990598945705391,
                  'f1-score': 0.9134836602436657,
                  'support': 191787}}


In [23]:
df = pd.read_csv('data/all_data.csv')

In [40]:
# Análise de Viés

df.dropna(subset=['comment_text', 'toxicity'], inplace=True)
# df.dropna(axis='columns', inplace=True)
# remove duplicados
df.drop_duplicates(subset=['comment_text',], keep='first', inplace=True)
df.comment_text = df.comment_text.replace('\xad', '', regex=True)  # remove os caracteres \xad que aparecem em alguns spams
df.isnull().sum()
bias_df = df.dropna(subset=['male',])
bias_df['label'] = np.where(bias_df['toxicity'] >= 0.5, 1, 0)
bias_df
# preds = pipeline.predict(test_df['comment_text'])

# pp(confusion_matrix(test_df["label"], preds))
# pp(classification_report(test_df["label"], preds, output_dict=True))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bias_df['label'] = np.where(bias_df['toxicity'] >= 0.5, 1, 0)


Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,...,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count,label
7678,627762,OH yes - Were those evil Christian Missionarie...,train,2016-11-26 15:56:03.862109+00,13,627198.0,152737,approved,0,0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,4,10,1
7679,5892815,Why is this black racist crap still on the G&M...,train,2017-09-03 23:20:08.226613+00,54,,373428,rejected,0,0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,4,70,1
7680,416437,even up here.......BLACKS!,train,2016-08-04 16:48:07.175252+00,21,,143025,rejected,0,0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,4,61,1
7681,5137126,Blame men. There's always an excuse to blame ...,train,2017-04-15 19:00:45.032674+00,54,5136907.0,327125,rejected,0,0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,4,11,1
7682,855753,And the woman exposing herself saying grab thi...,train,2017-01-18 01:50:57.478867+00,13,849081.0,162008,rejected,0,0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,4,70,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999511,1018736,Another man shamming article. If white men did...,train,2017-02-20 07:20:49.964620+00,54,,169202,approved,0,0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,10,10,0
1999512,340016,"""no matter what is put in front of you regardi...",train,2016-06-06 06:43:04.780968+00,21,339965.0,137961,approved,0,0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,10,10,0
1999513,919629,The Democrat party aided and abetted by it's M...,train,2017-01-30 02:44:29.168863+00,54,,164845,rejected,0,1,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,11,10,0
1999514,5165492,I just don't find her a very good representati...,train,2017-04-22 18:42:02.442987+00,54,,328877,approved,1,0,...,0.0,0.0,0.0,0.003717,0.0,0.0,0.00000,269,10,0


In [41]:
# Análise Bias - Male

tbias_df = bias_df[bias_df['split'] == 'test']
tbias_df = tbias_df[tbias_df['male'] >= 0.5]

preds = pipeline.predict(tbias_df['comment_text'])

pp(confusion_matrix(tbias_df["label"], preds))
pp(classification_report(tbias_df["label"], preds, output_dict=True))



array([[2617, 1080],
       [  61,  605]])
{'0': {'precision': 0.9772218073188947,
       'recall': 0.7078712469569921,
       'f1-score': 0.8210196078431372,
       'support': 3697},
 '1': {'precision': 0.3590504451038576,
       'recall': 0.9084084084084084,
       'f1-score': 0.5146746065504042,
       'support': 666},
 'accuracy': 0.7384826953930782,
 'macro avg': {'precision': 0.6681361262113762,
               'recall': 0.8081398276827003,
               'f1-score': 0.6678471071967707,
               'support': 4363},
 'weighted avg': {'precision': 0.8828596420117175,
                  'recall': 0.7384826953930782,
                  'f1-score': 0.774256882456715,
                  'support': 4363}}


In [42]:
# Análise Bias - Female

tbias_df = bias_df[bias_df['split'] == 'test']
tbias_df = tbias_df[tbias_df['female'] >= 0.5]

preds = pipeline.predict(tbias_df['comment_text'])

pp(confusion_matrix(tbias_df["label"], preds))
pp(classification_report(tbias_df["label"], preds, output_dict=True))



array([[3340, 1109],
       [  83,  606]])
{'0': {'precision': 0.9757522640958224,
       'recall': 0.7507305012362329,
       'f1-score': 0.8485772357723577,
       'support': 4449},
 '1': {'precision': 0.3533527696793003,
       'recall': 0.8795355587808418,
       'f1-score': 0.5041597337770382,
       'support': 689},
 'accuracy': 0.7680031140521604,
 'macro avg': {'precision': 0.6645525168875613,
               'recall': 0.8151330300085373,
               'f1-score': 0.6763684847746979,
               'support': 5138},
 'weighted avg': {'precision': 0.8922891944864444,
                  'recall': 0.7680031140521604,
                  'f1-score': 0.8023912375483844,
                  'support': 5138}}


In [44]:
# Análise Bias - homosexual gay or lesbian

tbias_df = bias_df[bias_df['split'] == 'test']
tbias_df = tbias_df[tbias_df['homosexual_gay_or_lesbian'] >= 0.5]

preds = pipeline.predict(tbias_df['comment_text'])

pp(confusion_matrix(tbias_df["label"], preds))
pp(classification_report(tbias_df["label"], preds, output_dict=True))



array([[361, 414],
       [ 31, 258]])
{'0': {'precision': 0.9209183673469388,
       'recall': 0.46580645161290324,
       'f1-score': 0.6186803770351328,
       'support': 775},
 '1': {'precision': 0.38392857142857145,
       'recall': 0.8927335640138409,
       'f1-score': 0.5369406867845994,
       'support': 289},
 'accuracy': 0.581766917293233,
 'macro avg': {'precision': 0.6524234693877551,
               'recall': 0.679270007813372,
               'f1-score': 0.5778105319098661,
               'support': 1064},
 'weighted avg': {'precision': 0.7750630562375326,
                  'recall': 0.581766917293233,
                  'f1-score': 0.5964785250779859,
                  'support': 1064}}


In [46]:
# Análise Bias - christian

tbias_df = bias_df[bias_df['split'] == 'test']
tbias_df = tbias_df[tbias_df['christian'] >= 0.5]

preds = pipeline.predict(tbias_df['comment_text'])

pp(confusion_matrix(tbias_df["label"], preds))
pp(classification_report(tbias_df["label"], preds, output_dict=True))



array([[3168,  616],
       [  67,  347]])
{'0': {'precision': 0.9792890262751159,
       'recall': 0.8372093023255814,
       'f1-score': 0.9026926912665622,
       'support': 3784},
 '1': {'precision': 0.3603322949117342,
       'recall': 0.8381642512077294,
       'f1-score': 0.5039941902687001,
       'support': 414},
 'accuracy': 0.8373034778465936,
 'macro avg': {'precision': 0.6698106605934251,
               'recall': 0.8376867767666554,
               'f1-score': 0.7033434407676311,
               'support': 4198},
 'weighted avg': {'precision': 0.9182485101282747,
                  'recall': 0.8373034778465936,
                  'f1-score': 0.8633736871186073,
                  'support': 4198}}


In [47]:
# Análise Bias - jewish

tbias_df = bias_df[bias_df['split'] == 'test']
tbias_df = tbias_df[tbias_df['jewish'] >= 0.5]

preds = pipeline.predict(tbias_df['comment_text'])

pp(confusion_matrix(tbias_df["label"], preds))
pp(classification_report(tbias_df["label"], preds, output_dict=True))



array([[454, 240],
       [ 15, 122]])
{'0': {'precision': 0.9680170575692963,
       'recall': 0.654178674351585,
       'f1-score': 0.7807394668959589,
       'support': 694},
 '1': {'precision': 0.3370165745856354,
       'recall': 0.8905109489051095,
       'f1-score': 0.4889779559118237,
       'support': 137},
 'accuracy': 0.6931407942238267,
 'macro avg': {'precision': 0.6525168160774659,
               'recall': 0.7723448116283473,
               'f1-score': 0.6348587114038913,
               'support': 831},
 'weighted avg': {'precision': 0.8639893004468397,
                  'recall': 0.6931407942238267,
                  'f1-score': 0.7326391937252893,
                  'support': 831}}


In [48]:
# Análise Bias - muslim

tbias_df = bias_df[bias_df['split'] == 'test']
tbias_df = tbias_df[tbias_df['muslim'] >= 0.5]

preds = pipeline.predict(tbias_df['comment_text'])

pp(confusion_matrix(tbias_df["label"], preds))
pp(classification_report(tbias_df["label"], preds, output_dict=True))



array([[770, 764],
       [ 38, 441]])
{'0': {'precision': 0.9529702970297029,
       'recall': 0.5019556714471969,
       'f1-score': 0.6575576430401366,
       'support': 1534},
 '1': {'precision': 0.3659751037344398,
       'recall': 0.9206680584551148,
       'f1-score': 0.5237529691211401,
       'support': 479},
 'accuracy': 0.6015896671634376,
 'macro avg': {'precision': 0.6594727003820714,
               'recall': 0.7113118649511558,
               'f1-score': 0.5906553060806383,
               'support': 2013},
 'weighted avg': {'precision': 0.8132928516305817,
                  'recall': 0.6015896671634376,
                  'f1-score': 0.6257183788537485,
                  'support': 2013}}


In [49]:
# Análise Bias - black

tbias_df = bias_df[bias_df['split'] == 'test']
tbias_df = tbias_df[tbias_df['black'] >= 0.5]

preds = pipeline.predict(tbias_df['comment_text'])

pp(confusion_matrix(tbias_df["label"], preds))
pp(classification_report(tbias_df["label"], preds, output_dict=True))



array([[375, 636],
       [ 24, 472]])
{'0': {'precision': 0.9398496240601504,
       'recall': 0.37091988130563797,
       'f1-score': 0.5319148936170213,
       'support': 1011},
 '1': {'precision': 0.4259927797833935,
       'recall': 0.9516129032258065,
       'f1-score': 0.5885286783042394,
       'support': 496},
 'accuracy': 0.5620437956204379,
 'macro avg': {'precision': 0.682921201921772,
               'recall': 0.6612663922657223,
               'f1-score': 0.5602217859606303,
               'support': 1507},
 'weighted avg': {'precision': 0.7707235492351527,
                  'recall': 0.5620437956204379,
                  'f1-score': 0.5505482295193836,
                  'support': 1507}}


In [50]:
# Análise Bias - white

tbias_df = bias_df[bias_df['split'] == 'test']
tbias_df = tbias_df[tbias_df['white'] >= 0.5]

preds = pipeline.predict(tbias_df['comment_text'])

pp(confusion_matrix(tbias_df["label"], preds))
pp(classification_report(tbias_df["label"], preds, output_dict=True))



array([[ 685, 1010],
       [  28,  705]])
{'0': {'precision': 0.9607293127629734,
       'recall': 0.40412979351032446,
       'f1-score': 0.5689368770764119,
       'support': 1695},
 '1': {'precision': 0.4110787172011662,
       'recall': 0.9618008185538881,
       'f1-score': 0.5759803921568628,
       'support': 733},
 'accuracy': 0.5724876441515651,
 'macro avg': {'precision': 0.6859040149820698,
               'recall': 0.6829653060321063,
               'f1-score': 0.5724586346166374,
               'support': 2428},
 'weighted avg': {'precision': 0.7947927861786223,
                  'recall': 0.5724876441515651,
                  'f1-score': 0.571063275986614,
                  'support': 2428}}


In [51]:
# Análise Bias - psychiatric_or_mental_illness

tbias_df = bias_df[bias_df['split'] == 'test']
tbias_df = tbias_df[tbias_df['psychiatric_or_mental_illness'] >= 0.5]

preds = pipeline.predict(tbias_df['comment_text'])

pp(confusion_matrix(tbias_df["label"], preds))
pp(classification_report(tbias_df["label"], preds, output_dict=True))

array([[244, 160],
       [  5,  99]])
{'0': {'precision': 0.9799196787148594,
       'recall': 0.6039603960396039,
       'f1-score': 0.7473200612557428,
       'support': 404},
 '1': {'precision': 0.38223938223938225,
       'recall': 0.9519230769230769,
       'f1-score': 0.5454545454545454,
       'support': 104},
 'accuracy': 0.6751968503937008,
 'macro avg': {'precision': 0.6810795304771209,
               'recall': 0.7779417364813404,
               'f1-score': 0.6463873033551442,
               'support': 508},
 'weighted avg': {'precision': 0.8575599329797224,
                  'recall': 0.6751968503937008,
                  'f1-score': 0.7059932627452614,
                  'support': 508}}




In [23]:
# LogisticRegression - remove_non_alpha
from sklearn.linear_model import LogisticRegression


vectorizer = TfidfVectorizer(ngram_range=(1,1), tokenizer=tknzr.tokenize, strip_accents='unicode', lowercase=True, stop_words=stopwords.words('english'))

pipeline = Pipeline([
    ('remove_non_alpha', FunctionTransformer(remove_non_alpha)),
    ('vectorizer', vectorizer),
    ('clf', LogisticRegression(max_iter=1000))
])

pipeline.fit(raw_datasets['train']['comment_text'], raw_datasets['train']['label'])


preds = pipeline.predict(raw_datasets["test"]["comment_text"])

metric = load_metric("glue", "mrpc")
pp(metric.compute(predictions=preds, references=raw_datasets["test"]["label"]))
pp(confusion_matrix(raw_datasets["test"]["label"], preds))
pp(classification_report(raw_datasets["test"]["label"], preds, output_dict=True))

{'accuracy': 0.7907404471222451, 'f1': 0.7781475878298872}
array([[13363,  2324],
       [ 4275, 11573]])
{'0': {'precision': 0.7576255811316476,
       'recall': 0.8518518518518519,
       'f1-score': 0.801980495123781,
       'support': 15687},
 '1': {'precision': 0.8327696625170901,
       'recall': 0.7302498738011105,
       'f1-score': 0.7781475878298872,
       'support': 15848},
 'accuracy': 0.7907404471222451,
 'macro avg': {'precision': 0.7951976218243688,
               'recall': 0.7910508628264812,
               'f1-score': 0.7900640414768341,
               'support': 31535},
 'weighted avg': {'precision': 0.7953894435637545,
                  'recall': 0.7907404471222451,
                  'f1-score': 0.7900032027567719,
                  'support': 31535}}


In [24]:
# LogisticRegression 1-4-gram com Feature Selection
from sklearn.linear_model import LogisticRegression

vectorizer = TfidfVectorizer(ngram_range=(1,4), tokenizer=tknzr.tokenize, strip_accents='unicode', lowercase=True, stop_words=stopwords.words('english'))


pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('kbest', SelectKBest(k=1000)),
    ('clf', LogisticRegression(max_iter=1000))
])

pipeline.fit(raw_datasets['train']['comment_text'], raw_datasets['train']['label'])


preds = pipeline.predict(raw_datasets["test"]["comment_text"])

metric = load_metric("glue", "mrpc")
pp(metric.compute(predictions=preds, references=raw_datasets["test"]["label"]))
pp(confusion_matrix(raw_datasets["test"]["label"], preds))
pp(classification_report(raw_datasets["test"]["label"], preds, output_dict=True))



{'accuracy': 0.8329792294276201, 'f1': 0.8358729861955065}
array([[12856,  2831],
       [ 2436, 13412]])
{'0': {'precision': 0.8407010201412504,
       'recall': 0.8195320966405304,
       'f1-score': 0.8299816004390071,
       'support': 15687},
 '1': {'precision': 0.8257095364156868,
       'recall': 0.8462897526501767,
       'f1-score': 0.8358729861955065,
       'support': 15848},
 'accuracy': 0.8329792294276201,
 'macro avg': {'precision': 0.8332052782784686,
               'recall': 0.8329109246453535,
               'f1-score': 0.8329272933172569,
               'support': 31535},
 'weighted avg': {'precision': 0.8331670092301126,
                  'recall': 0.8329792294276201,
                  'f1-score': 0.8329423323707974,
                  'support': 31535}}
