In [5]:
import csv
import argparse
import pandas as pd
import numpy as np
import re
import string
import os
import statistics
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import GaussianNB
    
TRAIN = '../data/train/training_data.csv'
train_data = pd.read_csv(TRAIN, index_col=1)
dev_data = pd.read_csv('../data/dev/development_data.csv', index_col=1)

tweets = train_data[['tweet']]

In [44]:
bad_words_set = set(open("bad-words.txt").read().split())
words_set = set(open("allwords.txt").read().split())
words_set = set(item.lower() for item in words_set)

## Features

In [11]:
# loading the emoji dataset
def load_unicode_mapping():
    emoji_dict = {}
    with open("emoji_image_to_whatIs.txt", 'r') as f:
        lines = f.readlines()
        for line in lines:
            tokens = line.strip().split('\t')
            emoji_dict[tokens[0]] = tokens[1]
    return emoji_dict
emoji_dict = load_unicode_mapping()
emoji_dict

{'©': 'copyright_sign',
 '®': 'registered_sign',
 '‼': 'double_exclamation_mark',
 '⁉': 'exclamation_question_mark',
 '™': 'trade_mark_sign',
 'ℹ': 'information_source',
 '↔': 'left_right_arrow',
 '↕': 'up_down_arrow',
 '↖': 'north_west_arrow',
 '↗': 'north_east_arrow',
 '↘': 'south_east_arrow',
 '↙': 'south_west_arrow',
 '↩': 'leftwards_arrow_with_hook',
 '↪': 'rightwards_arrow_with_hook',
 '⌚': 'watch',
 '⌛': 'hourglass',
 '⌨': 'keyboard',
 '⏏': 'eject_symbol',
 '⏩': 'black_right_pointing_double_triangle',
 '⏪': 'black_left_pointing_double_triangle',
 '⏫': 'black_up_pointing_double_triangle',
 '⏬': 'black_down_pointing_double_triangle',
 '⏭': 'black_right_pointing_double_triangle_with_vertical_bar',
 '⏮': 'black_left_pointing_double_triangle_with_vertical_bar',
 '⏯': 'black_right_pointing_triangle_with_double_vertical_bar',
 '⏰': 'alarm_clock',
 '⏱': 'stopwatch',
 '⏲': 'timer_clock',
 '⏳': 'hourglass_with_flowing_sand',
 '⏸': 'double_vertical_bar',
 '⏹': 'black_square_for_stop',
 '⏺'

In [14]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
tweetCounts = []
# get unigram counts for data
def emoji_counts():
    uni = Counter()
    tweets = train_data[['tweet']]
    for row_index, row in tweets.iterrows():
        s = tknzr.tokenize(row['tweet'])
        for word in s:
            count = sum(map(lambda word : 1 if word in emoji_dict else 0, s))
            tweetCounts.append(count)
    return find_quartile_values(tweetCounts)


Unigrams for the data splitting on spaces

In [45]:
def get_unigrams():
    unigrams = Counter()
    for row_index, row in tweets.iterrows():
        s = row['tweet'].split()
        for word in s:
            unigrams[word] += 1
    return unigrams

Make count features binary by finding median values over entire dataset

In [46]:
#split into quartiles?
def find_quartile_values(counts):
    counts.sort()
    first = np.quantile(counts, .25)
    second = np.quantile(counts, .5)
    third = np.quantile(counts, .75)
    return [first, second, third] 

In [47]:
def make_at_bins():
    at_counts = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = sum(map(lambda word : 1 if '@' in word else 0, s))
        at_counts.append(count)
    return find_quartile_values(at_counts)

In [53]:
def make_num_token_bins():
    lens = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = len(s)
        lens.append(count)
    
    return find_quartile_values(lens)

In [49]:
def make_swear_bins():
    bad_words_counts = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        tot_bad = 0
        for word in s:
            word = word.replace(".","").replace(",","").replace("!","").replace("?","").replace(";","")
            if word.lower() in bad_words_set:
                tot_bad+=1
        bad_words_counts.append(tot_bad)
    
    return find_quartile_values(bad_words_counts)

In [50]:
def make_mention_bins():
    mentions = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = sum(map(lambda word : 1 if '@' in word else 0, s))
        lens.append(count)
    
    return find_quartile_values(mentions)

In [51]:
def make_hashtag_bins():
    hashtag_counts = []
    at_sum = 0
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = sum(map(lambda word : 1 if '#' in word else 0, s))
        hashtag_counts.append(count)
    
    return find_quartile_values(hashtag_counts)

In [52]:
def make_misspelling_bins():
    misspell_counts = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        tot_misspelled = 0
        for word in s:
            word = word.replace(".","").replace(",","").replace("!","").replace("?","").replace(";","")
            if word.lower() not in words_set:
                tot_misspelled+=1
        misspell_counts.append(tot_misspelled)
    
    return find_quartile_values(misspell_counts)

[1.0, 2.0, 4.0]

In [12]:
def make_hatebase_bins():
    hatabase_words_set = set(open("hatebase_terms.txt").read().split())
    hatebase_words_counts = []
    
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        count = sum(map(lambda word : 1 if word in hatabase_words_set else 0, s))
        hatebase_words_counts.append(count)
    
    hatebase_words_counts.sort()
    median = statistics.median(hatebase_words_counts) #Get Median for binning
    
    return [0, median]


In [4]:
#Get All Eng Vocabulary
import json 
import requests
import pandas as pd
from hatebase import HatebaseAPI
key = 'rTUCCDVYCcsEGmVKzQJjKwDFQsNcvUNa'
hatebase = HatebaseAPI({"key": key})
filters = {"language": "eng"}
format = "json"
# initialize list for all vocabulary entry dictionaries
eng_vocab = []
response = hatebase.getVocabulary(filters=filters, format=format)
pages = response["number_of_pages"]
# fill the vocabulary list with all entries of all pages
# this might take some time...
for page in range(1, pages+1):
    filters["page"] = str(page) 
    response = hatebase.getVocabulary(filters=filters, format=format)
    eng_vocab.append(response["result"])

ModuleNotFoundError: No module named 'hatebase'

In [54]:
df_eng_vocab = pd.DataFrame()
# fill df
for elem in eng_vocab:
    df_eng_vocab = df_eng_vocab.append(elem)
# reset the df index
df_eng_vocab.reset_index(drop=True, inplace=True)

In [56]:
#df_eng_vocab

In [75]:
hatebase_words_set = set(open("hatebase_terms.txt").read().split())

In [3]:
#Get lists of all words from hatebase pertaining to a certain category"
hb_religion = df_eng_vocab.loc[df_eng_vocab['is_about_religion']][['term']].values
hb_sexual_orientation = df_eng_vocab.loc[df_eng_vocab['is_about_sexual_orientation']][['term']].values
hb_ethnicity = df_eng_vocab.loc[df_eng_vocab['is_about_ethnicity']][['term']].values
hb_disability = df_eng_vocab.loc[df_eng_vocab['is_about_disability']][['term']].values
hb_social_class = df_eng_vocab.loc[df_eng_vocab['is_about_class']][['term']].values
hb_nationality = df_eng_vocab.loc[df_eng_vocab['is_about_nationality']][['term']].values
hb_gender = df_eng_vocab.loc[df_eng_vocab['is_about_gender']][['term']].values

  

NameError: name 'df_eng_vocab' is not defined

In [None]:
import numpy as np
def is_about_nationality():
    is_about_nationality_counts = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        totalIsAboutNationality = 0
        #check if it's a hatebase word
        for word in s:
            if word in hatabase_words_set:
                #select_indices = list(np.where(df_eng_vocab['is_about_nationality'] == True)[0])

                for index, hb_row in df_eng_vocab.iterrows():
                    if word == hb_row['term'] and hb_row['is_about_nationality'] == True:
                        totalIsAboutNationality+=1
        is_about_nationality_counts.append(totalIsAboutNationality)
        
    return find_quartile_values(is_about_nationality_counts)

hits = is_about_nationality()
len(hits)

In [None]:
import numpy as np
def is_about_class():
    is_about_class_counts = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        total = 0
        #check if it's a hatebase word
        for word in s:
            if word in hatabase_words_set:
                #select_indices = list(np.where(df_eng_vocab['is_about_nationality'] == True)[0])
                for index, hb_row in df_eng_vocab.iterrows():
                    if word == hb_row['term'] and hb_row['is_about_class'] == True:
                        total+=1
        is_about_class_counts.append(total)
        
    return find_quartile_values(is_about_class_counts)

hits = is_about_class()


In [None]:
import numpy as np
def is_about_disability():
    is_about_disability_counts = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        total = 0
        #check if it's a hatebase word
        for word in s:
            if word in hatabase_words_set:
                #select_indices = list(np.where(df_eng_vocab['is_about_nationality'] == True)[0])
                for index, hb_row in df_eng_vocab.iterrows():
                    if word == hb_row['term'] and hb_row['is_about_disability'] == True:
                        total+=1
        is_about_disability_counts.append(total)
        
    return find_quartile_values(is_about_disability_counts)

hits = is_about_disability()


In [None]:

import numpy as np
def is_about_ethnicity():
    is_about_ethnicity_counts = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        total = 0
        #check if it's a hatebase word
        for word in s:
            if word in hatabase_words_set:
                #select_indices = list(np.where(df_eng_vocab['is_about_nationality'] == True)[0])
                for index, hb_row in df_eng_vocab.iterrows():
                    if word == hb_row['term'] and hb_row['is_about_ethnicity'] == True:
                        total+=1
        is_about_ethnicity_counts.append(total)
        
    return find_quartile_values(is_about_ethnicity_counts)

hits = is_about_ethnicity()


In [None]:

import numpy as np
def is_about_gender():
    is_about_gender_counts = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        total = 0
        #check if it's a hatebase word
        for word in s:
            if word in hatabase_words_set:
                #select_indices = list(np.where(df_eng_vocab['is_about_nationality'] == True)[0])
                for index, hb_row in df_eng_vocab.iterrows():
                    if word == hb_row['term'] and hb_row['is_about_gender'] == True:
                        total+=1
        is_about_gender_counts.append(total)
        
    return find_quartile_values(is_about_gender_counts)

hits = is_about_gender()


In [None]:

import numpy as np
def is_about_religion():
    is_about_religion_counts = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        total = 0
        #check if it's a hatebase word
        for word in s:
            if word in hatabase_words_set:
                #select_indices = list(np.where(df_eng_vocab['is_about_nationality'] == True)[0])
                for index, hb_row in df_eng_vocab.iterrows():
                    if word == hb_row['term'] and hb_row['is_about_religion'] == True:
                        total+=1
        is_about_religion_counts.append(total)
        
    return find_quartile_values(is_about_religion_counts)

hits = is_about_religion()


In [None]:
import numpy as np
def is_about_sexual_orientation():
    is_about_sexual_orientation_counts = []
    for row_index,row in tweets.iterrows():
        s = row['tweet'].split()
        total = 0
        #check if it's a hatebase word
        for word in s:
            if word in hatabase_words_set:
                #select_indices = list(np.where(df_eng_vocab['is_about_nationality'] == True)[0])
                for index, hb_row in df_eng_vocab.iterrows():
                    if word == hb_row['term'] and hb_row['is_about_sexual_orientation'] == True:
                        total+=1
        is_about_sexual_orientation_counts.append(total)
        
    return find_quartile_values(is_about_sexual_orientation_counts)

hits = is_about_sexual_orientation()


Get lists of all words from hatebase pertaining to a certain category

In [58]:
hb_religion = df_eng_vocab.loc[df_eng_vocab['is_about_religion']][['term']].values
hb_sexual_orientation = df_eng_vocab.loc[df_eng_vocab['is_about_sexual_orientation']][['term']].values
hb_ethnicity = df_eng_vocab.loc[df_eng_vocab['is_about_ethnicity']][['term']].values
hb_disability = df_eng_vocab.loc[df_eng_vocab['is_about_disability']][['term']].values
hb_social_class = df_eng_vocab.loc[df_eng_vocab['is_about_class']][['term']].values
hb_nationality = df_eng_vocab.loc[df_eng_vocab['is_about_nationality']][['term']].values
hb_gender = df_eng_vocab.loc[df_eng_vocab['is_about_gender']][['term']].values

Bins contain the splits for which bin a tweet's feature counts will land in

In [60]:
num_token_bins = make_num_token_bins()
swear_bins = make_swear_bins()
at_bins = make_at_bins()
hashtag_bins = make_hashtag_bins()
#hatebase_words_bins = hatebase_words_bins()
misspell_bins =  make_misspelling_bins()

More binary features

In [61]:
def has_more_upper(tweet):
    total_caps = sum(map(lambda ch : 1 if ch.isupper() else 0, tweet))
    if total_caps > len(tweet) // 2:
        return 1
    return 0

In [62]:
def has_consecutive_punc(tweet):
    for word in tweet.split():
        if 'http://' in word: continue
        for i in range(len(word)-1):
            if word[i] in string.punctuation and word[i+1] in string.punctuation:
                return 1
    return 0

In [63]:
third_person = ['he', 'she', 'they', 'him', 'her', 'them', 'his', 'hers', 'their', 'theirs', 'themselves', 'himself', 'herself']
second_person = ['you', 'your', 'yours']
first_person =['i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours']

In [64]:
# Find if first word in tweet is a third person pronoun. Also check if number of third person pronouns is greater
# than first person pronouns
def get_pronouns(tweet):
    first_pronoun = 0
    third_and_second_greater_than_first = 0
    words = tweet.split()
    if words[0] in third_person or words[0] in second_person: 
        first_pronoun = 1
    
    first_person_count = 0
    third_second_person_count = 0
    for word in words:
        if word in third_person or word in second_person:
            third_second_person_count += 1
        elif word in first_person:
            first_person_count += 1
            
    if first_person_count < third_second_person_count:
        third_and_second_greater_than_first = 1
       
    return first_pronoun, third_and_second_greater_than_first

Cite lexicon:
    Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews." 
        Proceedings of the ACM SIGKDD International Conference on Knowledge 
        Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, 
        Washington, USA,

In [65]:
pos_words = set(open("opinion-lexicon-English/positive-words.txt").read().split())
neg_words = set(open("opinion-lexicon-English/negative-words.txt").read().split())

In [66]:
#returns 1, 1 if negative words outnumber positive words and there are no positive words
def get_sentiment(tweet):
    pos_count = 0
    neg_count = 0
    for word in tweet.split():
        if word in pos_words:
            pos_count += 1
        if word in neg_words:
            neg_count += 1
    if neg_count > pos_count:
        if pos_count > 0:
            return 1, 0
        else:
            return 1, 1
    else:
        if pos_count > 0:
            return 0, 0
        else:
            return 0, 1

In [67]:
#returns one if a word in the tweet has non alphanumeric characters (not including punctuation at the end of a word)
def contains_non_alphanum(tweet):
    words = tweet.split()
    for word in words:
        if not word.isalnum():
            if word[-1] not in string.punctuation:
                return 1
    return 0

In [68]:
#Find most common unigrams
unigram_counts = get_unigrams()
top_unigrams = unigram_counts.most_common(100)

In [69]:
#each count feature has four bins: 1 for < 25th percentile, 2 for < 50th percentile, 
#3 for < 75th percentile, and 4 for < 100th percentile
def find_bin(count, bin_name):
    for i in range(len(bin_name)):
        if count < bin_name[i]:
            return i + 1
    return len(bin_name)

Process the tweets by getting their feature representations

In [77]:
def process_tweets(tweets, judgements):
#     for word in [u[0] for u in top_unigrams]:
#         tweets[word] = tweets['tweet'].str.contains(word).astype(int)
    word_counts = []
    swear_counts = []
    at_counts = []
    contains_at = []
    hashtag_counts = []
    contains_hashtag = []
    consecutive_punc = []
    more_upper = []
    first_pronoun = []
    fewer_first_person = []
    more_negative = []
    no_positive = []
    contains_url = []
    not_alphanum = []
    misspellings = []
    disagreements = []
    in_hatebase = []
    about_gender = []
    about_religion = []
    about_sexual_orientation = []
    about_ethnicity = []
    about_disability = []
    about_social_class = []
    about_nationality = []

    for tweet in tweets['tweet']:
        #count tokens
        tweet_words = tweet.split()
        num_token_bin = find_bin(len(tweet_words), num_token_bins)
        word_counts.append(num_token_bin)
        
        #count swear words, misspellings, and if a word is in hatebase
        misspell_count = 0
        tot_bad = 0
        hatebase = 0
        hatebase_words = []
        for word in tweet_words:                #Use regexs? 
            word = word.replace(".","").replace(",","").replace("!","").replace("?","")
            if word.lower() in bad_words_set:
                tot_bad+=1
            if word.lower() not in words_set:
                misspell_count+=1
            if word.lower() in hatebase_words_set:
                hatebase = 1
                hatebase_words.append(word)
        swear_bin = find_bin(tot_bad, swear_bins)
        swear_counts.append(swear_bin)
        misspell_bin = find_bin(misspell_count, misspell_bins)
        misspellings.append(misspell_bin)
        in_hatebase.append(hatebase)
    
        gender = religion = sexual_orientation = ethnicity = disability = social_class = nationality = 0
        if hatebase == 1:
            for word in hatebase_words:
                if word in hb_gender: gender = 1
                if word in hb_religion: religion = 1
                if word in hb_sexual_orientation: sexual_orientation = 1
                if word in hb_ethnicity: ethnicity = 1
                if word in hb_disability: disability = 1
                if word in hb_social_class: social_class = 1
                if word in hb_nationality: nationality = 1
        about_gender.append(gender)
        about_religion.append(religion)
        about_sexual_orientation.append(sexual_orientation)
        about_ethnicity.append(ethnicity)
        about_disability.append(disability)
        about_social_class.append(social_class)
        about_nationality.append(nationality)
        
        #count mentions
        at_count = tweet.count('@')
        if at_count > 0:
            contains_at.append(1)
        else:
            contains_at.append(0)
        at_bin = find_bin(at_count, at_bins)
        at_counts.append(at_bin)
        
        #count hashtags
        hash_count = tweet.count('#')
        if hash_count > 0:
            contains_hashtag.append(1)
        else:
            contains_hashtag.append(0)
        hash_bin = find_bin(hash_count, hashtag_bins)
        hashtag_counts.append(hash_bin)

        more_upper.append(has_more_upper(tweet))
        consecutive_punc.append(has_consecutive_punc(tweet))
        first, more = get_pronouns(tweet)
        first_pronoun.append(first)
        fewer_first_person.append(more)
        
        more_neg, pos = get_sentiment(tweet)
        more_negative.append(more_neg)
        no_positive.append(pos)
        
        if 'http://' in tweet:
            contains_url.append(1)
        else:
            contains_url.append(0)
            
        not_alphanum.append(contains_non_alphanum(tweet))
        
        #See if there were disagreements about classification
        i = tweets.loc[tweets['tweet']==tweet].index[0]
        total_votes = judgements.at[i, 'count']
        if (judgements.at[i, 'hate_speech'] == total_votes) or (judgements.at[i, 'offensive_language'] == total_votes)\
            or (judgements.at[i, 'neither'] == total_votes):
            disagreements.append(0)
        else:
            disagreements.append(1)
        

    tweets['Word Counts'] = word_counts
    tweets['Swear Counts'] = swear_counts
    tweets['@ Counts'] = at_counts
    tweets['Mention'] = contains_at
    tweets['Hashtag Counts'] = hashtag_counts
    tweets['Contains Hashtag'] = contains_hashtag
    tweets['Consecutive Punctuation'] = consecutive_punc
    tweets['Majority Uppercase Letters'] = more_upper
    tweets['First Word Second or Third Person Pronoun'] = first_pronoun
    tweets['More Second or Third Person Pronouns than First'] = fewer_first_person
    tweets['Majority Negative Words'] = more_negative
    tweets['No Positive Words'] = no_positive
    tweets['Contains URL'] = contains_url
    tweets['Contains Non Alphanumeric Word'] = not_alphanum
    tweets['Misspelling Count'] = misspellings
    tweets['Judgement Disagreements'] = disagreements
    tweets['About Gender (Hatebase)'] = about_gender
    tweets['About Religion (Hatebase)'] = about_religion
    tweets['About Ethnicity (Hatebase)'] = about_ethnicity
    tweets['About Sexual Orientation (Hatebase)'] = about_sexual_orientation
    tweets['About Disability (Hatebase)'] = about_disability
    tweets['About Class (Hatebase)'] = about_social_class
    tweets['About Nationality (Hatebase)'] = about_nationality
    X = tweets[[col for col in tweets.columns if col!="tweet"]].values
    return X

Combine train and dev sets for k-fold cross validation

In [71]:
data = pd.concat([train_data, dev_data], sort=False)
tweets = data[['tweet']]
judgements = data[['count', 'hate_speech', 'offensive_language', 'neither']]

In [78]:
X = process_tweets(tweets, judgements)
y = data['class'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [79]:
X.shape

(22214, 23)

## Baseline Models

In [80]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
cv = KFold(n_splits=10, random_state=42, shuffle=False)

Run 10-fold cross validation on combined training and dev sets on LR, SVM, and NB models

In [81]:
LR = LogisticRegression(fit_intercept=True, max_iter=1000, solver='lbfgs', multi_class='ovr')
lr_y_pred = cross_val_predict(LR, X, y, cv=cv)

In [82]:
print("LR:", classification_report(y, lr_y_pred))

LR:               precision    recall  f1-score   support

           0       0.48      0.06      0.10      1282
           1       0.87      0.92      0.89     17186
           2       0.62      0.64      0.63      3746

    accuracy                           0.82     22214
   macro avg       0.66      0.54      0.54     22214
weighted avg       0.80      0.82      0.80     22214



In [83]:
svm = SVC(gamma='auto') 
svm_y_pred = cross_val_predict(svm, X, y, cv=cv)
print(classification_report(y, svm_y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1282
           1       0.87      0.92      0.89     17186
           2       0.63      0.65      0.64      3746

    accuracy                           0.82     22214
   macro avg       0.50      0.53      0.51     22214
weighted avg       0.78      0.82      0.80     22214



In [84]:
gnb = GaussianNB()
nb_y_pred = cross_val_predict(gnb, X, y, cv=cv)
print(classification_report(y, nb_y_pred))

              precision    recall  f1-score   support

           0       0.20      0.59      0.29      1282
           1       0.95      0.75      0.84     17186
           2       0.55      0.69      0.61      3746

    accuracy                           0.73     22214
   macro avg       0.56      0.68      0.58     22214
weighted avg       0.84      0.73      0.77     22214

