## Exploratory Data Analysis

In [68]:
# import libraries and data
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string
import plotly.express as px
from collections import Counter, OrderedDict
import itertools
import pickle
import numpy as np

# Import necessary files
train_file = 'hateful_memes/train.jsonl'
test_file = 'hateful_memes/test_seen.jsonl'

# These files can be created using the 
# Preprocess_Meme_Image_and_Text_Data.ipynb notebook
# or downloaded here: https://drive.google.com/drive/folders/1gUuicGA6Gnzh1hYdAsE9powrQYuCTvyp?usp=sharing
train_combo = 'hateful_memes/pickles/training_combo.p'
train_combo_labels = 'hateful_memes/pickles/training_labels_combo.p'

train_data = pd.read_json(train_file, lines = True)
test_data = pd.read_json(test_file, lines = True)
test_data.head()

Unnamed: 0,id,img,label,text
0,16395,img/16395.png,1,handjobs sold seperately
1,37405,img/37405.png,1,introducing fidget spinner for women
2,94180,img/94180.png,1,happy pride month let's go beat up lesbians
3,54321,img/54321.png,1,laughs in [majority of u.s crime rate]
4,97015,img/97015.png,1,finds out those 72 virgins.. are goats


#### Look at the class balance of the training data

In [21]:
balance = pd.DataFrame(train_data.label.value_counts())
balance.rename(columns = {'label': 'Count'}, inplace=True)
balance['Classification'] = ['not hate speech','hate speech']
balance

Unnamed: 0,Count,Classification
0,5481,not hate speech
1,3019,hate speech


In [23]:
bal_bar = px.bar(balance,
                 x='Classification',
                 y='Count',
                title = 'Training Class balance')
bal_bar.update_layout(
    {
       'plot_bgcolor': 'rgba(0,0,0,0)',
        'paper_bgcolor':'rgba(0,0,0,0)' 
    }
)
bal_bar.show()

In [24]:
print(f'{round(len(train_data[train_data.label == 1])/len(train_data), 2) * 100}% of training data is hateful')
print(f'{round(len(train_data[train_data.label == 0])/len(train_data), 2) * 100}% of training data is nonhateful')

36.0% of training data is hateful
64.0% of training data is nonhateful


#### Preprocess the text

In [25]:
# preprocess the text
train_data['text_preprocess'] = train_data.text.apply(preprocess_string)
train_data.head()

Unnamed: 0,id,img,label,text,text_preprocess
0,42953,img/42953.png,0,its their character not their color that matters,"[charact, color, matter]"
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...,"[afraid, love, like]"
2,13894,img/13894.png,0,putting bows on your pet,"[put, bow, pet]"
3,37408,img/37408.png,0,i love everything and everybody! except for sq...,"[love, everybodi, squirrel, hate, squirrel]"
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h...","[everybodi, love, chocol, chip, cooki, hitler]"


#### Look at the top words in all the training data

In [30]:
# Get the counts of how many times each word shows up in the data
all_text = list(itertools.chain(*list(train_data.text_preprocess)))
all_text_counts = Counter(all_text)

# Put the dictionary in descending order
all_text_counts_desc = dict(OrderedDict(sorted(all_text_counts.items(), key=lambda x: x[1], reverse=True)))

# Create a dataframe
all_text_counts_df = pd.DataFrame.from_dict(all_text_counts_desc, orient='index')
all_text_counts_df = all_text_counts_df.reset_index()
all_text_counts_df.rename(columns={0:'Counts', 'index': 'Word'}, inplace=True)

all_text_counts_df.head()

Unnamed: 0,Word,Counts
0,like,607
1,peopl,570
2,muslim,435
3,black,409
4,fuck,379


In [35]:
# Create a bar chart with the top 20 words
all_bar = px.bar(all_text_counts_df[:20],
                x='Word',
                y='Counts',
                title='Top 20 words in the test memes: All data')
all_bar.update_layout(
    {
        'plot_bgcolor': 'rgba(0,0,0,0)',
        'paper_bgcolor':'rgba(0,0,0,0)' 
    }
)
all_bar.show()

#### Look at the top words in the hateful and non-hateful memes separately

In [33]:
# Get the counts of how many times each word shows up in the data
hateful = list(itertools.chain(*list(train_data.loc[train_data.label == 1, 'text_preprocess'])))
hateful_counts = Counter(hateful)
hateful_counts_desc = dict(OrderedDict(sorted(hateful_counts.items(), key=lambda x: x[1], 
                                  reverse = True)))


nonhateful = list(itertools.chain(*list(train_data.loc[train_data.label == 0, 'text_preprocess'])))
nonhateful_counts = Counter(nonhateful)
nonhateful_counts_desc = dict(OrderedDict(sorted(nonhateful_counts.items(), key=lambda x: x[1], 
                                  reverse = True)))

# Create the dataframes
hateful_df = pd.DataFrame.from_dict(hateful_counts_desc, orient='index')
hateful_df = hateful_df.reset_index()
hateful_df.rename(columns={0:'Counts', 'index': 'Word'}, inplace=True)

nonhateful_df = pd.DataFrame.from_dict(nonhateful_counts_desc, orient='index')
nonhateful_df = nonhateful_df.reset_index()
nonhateful_df.rename(columns={0:'Counts', 'index': 'Word'}, inplace=True)


hateful_df.head()

Unnamed: 0,Word,Counts
0,muslim,322
1,black,292
2,peopl,286
3,white,247
4,like,230


In [36]:
# Create the hateful bar chart
hateful_bar = px.bar(hateful_df[:20],
                    x='Word',
                    y='Counts',
                    title='Top 20 words in the test memes: Hateful classication')
hateful_bar.update_layout(
    {
        'plot_bgcolor': 'rgba(0,0,0,0)',
        'paper_bgcolor':'rgba(0,0,0,0)' 
    }
)

hateful_bar.show()

In [38]:
# Create the nonhateful bar
nonhateful_bar = px.bar(nonhateful_df[:20],
                       x='Word',
                       y='Counts',
                       title='Top 20 words in the test memes: Non-hateful classication')
nonhateful_bar.update_layout(
    {
        'plot_bgcolor': 'rgba(0,0,0,0)',
        'paper_bgcolor':'rgba(0,0,0,0)' 
    }
)
nonhateful_bar.show()

##### Race, gender, and religion seems to be more prevalent in the hateful memes vs the nonhateful memes

## Look at class separation with text and image representation

In [39]:
# Import TSNE
from sklearn.manifold import TSNE

In [69]:
# Get the data
def get_pickle(picklepath):
    with open(picklepath, 'rb') as filehandler:
        data = pickle.load(filehandler)
    return data

X = np.array(get_pickle(train_combo))
y = np.array(get_pickle(train_combo_labels))

X.shape

(120, 1768)

In [70]:
# Set up the TSNE
n_components = 2
tsne = TSNE(n_components)
tsne_result = tsne.fit_transform(X)
tsne_result.shape

(120, 2)

In [71]:
# Set up the dataframe
tsne_Df = pd.DataFrame({'tnse_1': tsne_result[:,0], 
                        'tsne_2': tsne_result[:,1],
                       'label': y})
tsne_Df['label'] = tsne_Df['label'].replace({0:'Not hate speech', 
                                            1: 'Hate speech'})

In [72]:
# Create the plot
tsne_fig = px.scatter(tsne_Df,
                     x='tnse_1',
                     y='tsne_2',
                     color='label',
                     color_discrete_sequence=['orange', 'blue'],
                     title='Multimodal TSNE Plot')

tsne_fig.update_layout(
    {
        'plot_bgcolor': 'rgba(0,0,0,0)',
        'paper_bgcolor':'rgba(0,0,0,0)' 
    }
)
tsne_fig.show()