
# Race classification 

Sarah Santiago and Carlos Ortiz initially wrote this notebook. Jae Yeon Kim reviewed the notebook, edited the markdown, and reproduced, commented on, and made minor changes to the code.

Racial demographic dialect predictions were made by the model developed by [Blodgett, S. L., Green, L., & O'Connor, B. (2016)](https://arxiv.org/pdf/1608.08868.pdf). We modified their predict function in [the public Git repository](https://github.com/slanglab/twitteraae) to work in the notebook environment. 

The code was ported from its initial Python 2 version to Python 3 by Aarjav Kothari, Dipti Sahu and Harsh Chobisa.

In [1]:

# Install uninstalled libs 
import sys

# Import libraries
#!pip3 install seaborn

import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt

## Language-demography model 

import predict


## Custom functions
from clean_text import clean_tweet

### Import tweets

In [2]:

# Read the tab-separated tweet file; it has no header row
raw_tweets_path = './hatespeech_text_label_vote_RESTRICTED_100K.csv'
tweets = pd.read_csv(raw_tweets_path, sep='\t', header=None)

# Label the three columns and name the row index
tweets.columns = ['Tweet', 'label', 'votes']
tweets.index.name = 'ID'

# Preview the first five rows
tweets.head()

Unnamed: 0_level_0,Tweet,label,votes
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Beats by Dr. Dre urBeats Wired In-Ear Headphon...,spam,4
1,RT @Papapishu: Man it would fucking rule if we...,abusive,4
2,It is time to draw close to Him &#128591;&#127...,normal,4
3,if you notice me start to act different or dis...,normal,5
4,"Forget unfollowers, I believe in growing. 7 ne...",normal,3


### Clean tweets

In [3]:

# Clean the text on a copy so the raw `tweets` frame keeps the original wording
tweets_clean = tweets.copy()

tweets_clean['Tweet'] = clean_tweet(tweets_clean['Tweet'])

# No explicit unicode conversion is needed here: the Python 2 version converted
# the text to `unicode` for the model, but Python 3 `str` is already unicode,
# so the former self-assignment of the column was a no-op and has been removed.

# First five rows
tweets_clean.head()

  document = document.str.replace(url_re, '') # Remove Links/URL
  document = document.str.replace(at_re, '') # Remove @
  document = document.str.replace(rt_re, '') # Remove rt
  document = document.str.replace(punct_re, '') # Remove Punctation


Unnamed: 0_level_0,Tweet,label,votes
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,beats by dr dre urbeats wired inear headphones...,spam,4
1,man it would fucking rule if we had a party ...,abusive,4
2,it is time to draw close to him 128591127995 f...,normal,4
3,if you notice me start to act different or dis...,normal,5
4,forget unfollowers i believe in growing 7 new ...,normal,3


### Apply predictions

In [4]:
# Load the language-demography model
predict.load_model()

def prediction(string):
    """Split `string` on whitespace and return `predict.predict` of the tokens."""
    tokens = string.split()
    return predict.predict(tokens)


In [5]:
# Run the model on every cleaned tweet (one `prediction` call per row)

cleaned_text = tweets_clean['Tweet']
predictions = cleaned_text.apply(prediction)

In [6]:
# Attach the model output as a new 'pred' column, then replace missing values
# with the string placeholder "NA".
# NOTE(review): fillna("NA") is applied to the whole frame, so missing values
# in any other column are replaced as well — confirm that this is intended.
tweets_clean = tweets_clean.assign(pred=predictions).fillna("NA")

# Preview the first five rows
tweets_clean.head()

Unnamed: 0_level_0,Tweet,label,votes,pred
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,beats by dr dre urbeats wired inear headphones...,spam,4,"[0.3780470050811405, 0.24955366776468596, 0.15..."
1,man it would fucking rule if we had a party ...,abusive,4,"[0.1937550310081168, 0.1927830199683899, 0.045..."
2,it is time to draw close to him 128591127995 f...,normal,4,"[0.16264463645151736, 0.39557843780118646, 0.0..."
3,if you notice me start to act different or dis...,normal,5,"[0.48375532062075605, 0.32020392760056654, 0.0..."
4,forget unfollowers i believe in growing 7 new ...,normal,3,"[0.11106622108756084, 0.17376976110216905, 0.0..."


- AAE: [African-American English](https://en.wikipedia.org/wiki/African-American_English) 
- WAE: White-aligned English 

In [7]:
def first_last(item):
    """Keep only the first and fourth entries of a prediction vector.

    Parameters
    ----------
    item : indexable with at least four entries, or the string 'NA'
        A value from the 'pred' column; 'NA' marks a tweet without a
        prediction.

    Returns
    -------
    numpy.ndarray or str
        ``np.array([item[0], item[3]])`` (AAE and WAE scores), or 'NA' when
        the input is the 'NA' placeholder.
    """
    # Compare by value, not identity: `is 'NA'` only works when the interpreter
    # happens to intern both strings (and raises a SyntaxWarning). The
    # isinstance guard keeps numpy arrays away from `==`, which would compare
    # elementwise.
    if isinstance(item, str) and item == 'NA':
        return 'NA'

    return np.array([item[0], item[3]])  # item[0] = AAE, item[3] = WAE

# Reduce every prediction vector to its [AAE, WAE] pair
pred_column = tweets_clean['pred']
tweets_clean['pred_aae_wae'] = pred_column.apply(first_last)

tweets_clean.head()

  if item is 'NA':


Unnamed: 0_level_0,Tweet,label,votes,pred,pred_aae_wae
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,beats by dr dre urbeats wired inear headphones...,spam,4,"[0.3780470050811405, 0.24955366776468596, 0.15...","[0.3780470050811405, 0.21387724477304104]"
1,man it would fucking rule if we had a party ...,abusive,4,"[0.1937550310081168, 0.1927830199683899, 0.045...","[0.1937550310081168, 0.567846760366156]"
2,it is time to draw close to him 128591127995 f...,normal,4,"[0.16264463645151736, 0.39557843780118646, 0.0...","[0.16264463645151736, 0.4400849627220842]"
3,if you notice me start to act different or dis...,normal,5,"[0.48375532062075605, 0.32020392760056654, 0.0...","[0.48375532062075605, 0.19144012269954383]"
4,forget unfollowers i believe in growing 7 new ...,normal,3,"[0.11106622108756084, 0.17376976110216905, 0.0...","[0.11106622108756084, 0.6374406026707196]"


In [8]:
# Predicting racial demographic information using a binary category (whites or blacks)
def detect_two(item):
    """Turn a two-element score pair into a binary label.

    Parameters
    ----------
    item : indexable with two scores, or the string 'NA'
        The notebook passes 'pred_aae_wae' values, i.e. [AAE, WAE].

    Returns
    -------
    int or None
        0 when ``item[0] > item[1]``, 1 otherwise (ties included), and None
        for the 'NA' placeholder.
    """
    # Compare by value, not identity: `is 'NA'` only works when the interpreter
    # happens to intern both strings (and raises a SyntaxWarning). The
    # isinstance guard keeps numpy arrays away from `==`, which would compare
    # elementwise.
    if isinstance(item, str) and item == 'NA':
        return None

    return 0 if item[0] > item[1] else 1

# Predicting racial demographic information using a multiclass category (whites, blacks, and others; Others indicate Asian Americans and Latinx)
def detect_all(item):
    """Turn a four-element prediction vector into a three-way label.

    Parameters
    ----------
    item : indexable with four scores, or the string "NA"
        ``item[0]`` is the AAE score and ``item[3]`` the WAE score.

    Returns
    -------
    int or None
        0 when ``item[0]`` is strictly greater than every other entry,
        1 when ``item[3]`` is strictly greater than every other entry,
        2 in all remaining cases (including ties), and None for the "NA"
        placeholder.
    """
    # Compare by value, not identity: `is "NA"` only works when the interpreter
    # happens to intern both strings (and raises a SyntaxWarning). The
    # isinstance guard keeps numpy arrays away from `==`, which would compare
    # elementwise.
    if isinstance(item, str) and item == "NA":
        return None

    aae, wae = item[0], item[3]

    if aae > item[1] and aae > item[2] and aae > wae:
        return 0

    if wae > aae and wae > item[1] and wae > item[2]:
        return 1

    return 2
    
# Same rule as the multiclass model (detect_all), but keeps only the AAE score as a continuous variable
def detect_aae_cont(item):
    """Return the AAE score when it is the strictly largest of the four scores.

    Parameters
    ----------
    item : indexable with four scores, or the string "NA"
        ``item[0]`` is the AAE score.

    Returns
    -------
    float or None
        ``item[0]`` when it is strictly greater than ``item[1]``, ``item[2]``
        and ``item[3]``; None otherwise, and None for the "NA" placeholder.
    """
    # Compare by value, not identity: `is "NA"` only works when the interpreter
    # happens to intern both strings (and raises a SyntaxWarning). The
    # isinstance guard keeps numpy arrays away from `==`, which would compare
    # elementwise.
    if isinstance(item, str) and item == "NA":
        return None

    aae = item[0]
    if aae > item[1] and aae > item[2] and aae > item[3]:
        return aae

    return None
    
# Same rule as the multiclass model (detect_all), but keeps only the WAE score as a continuous variable
def detect_wae_cont(item):
    """Return the WAE score when it is the strictly largest of the four scores.

    Parameters
    ----------
    item : indexable with four scores, or the string "NA"
        ``item[3]`` is the WAE score.

    Returns
    -------
    float or None
        ``item[3]`` when it is strictly greater than ``item[0]``, ``item[1]``
        and ``item[2]``; None otherwise, and None for the "NA" placeholder.
    """
    # Compare by value, not identity: `is "NA"` only works when the interpreter
    # happens to intern both strings (and raises a SyntaxWarning). The
    # isinstance guard keeps numpy arrays away from `==`, which would compare
    # elementwise.
    if isinstance(item, str) and item == "NA":
        return None

    wae = item[3]
    if wae > item[0] and wae > item[1] and wae > item[2]:
        return wae

    return None

  if item is 'NA':
  if item is "NA":
  if item is "NA":
  if item is "NA":


In [9]:

# Derive each label/score column by applying its detector to a source column.
# Entries are (new column, source column, detector), processed in order.

derived_columns = [
    ('race_bi', 'pred_aae_wae', detect_two),
    ('race_all', 'pred', detect_all),
    ('aae_cont', 'pred', detect_aae_cont),
    ('wae_cont', 'pred', detect_wae_cont),
]

for new_col, source_col, detector in derived_columns:
    tweets_clean[new_col] = tweets_clean[source_col].apply(detector)

In [10]:

# Spot-check the continuous AAE score column (first five rows)

aae_scores = tweets_clean['aae_cont']
aae_scores.head()

ID
0    0.378047
1         NaN
2         NaN
3    0.483755
4         NaN
Name: aae_cont, dtype: float64

In [11]:

# Drop the intermediate prediction columns and put the original (uncleaned)
# tweet text back; `assign` aligns the raw text on the shared 'ID' index.

intermediate_columns = ["pred", "pred_aae_wae"]
final_tweets = (
    tweets_clean
    .drop(columns=intermediate_columns)
    .assign(Tweet=tweets['Tweet'])
)

final_tweets.head()

Unnamed: 0_level_0,Tweet,label,votes,race_bi,race_all,aae_cont,wae_cont
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Beats by Dr. Dre urBeats Wired In-Ear Headphon...,spam,4,0.0,0.0,0.378047,
1,RT @Papapishu: Man it would fucking rule if we...,abusive,4,1.0,1.0,,0.567847
2,It is time to draw close to Him &#128591;&#127...,normal,4,1.0,1.0,,0.440085
3,if you notice me start to act different or dis...,normal,5,0.0,0.0,0.483755,
4,"Forget unfollowers, I believe in growing. 7 ne...",normal,3,1.0,1.0,,0.637441


### Export tweets to CSV

In [12]:
# Show the column labels of the frame that is exported in the next cell
final_tweets.columns

Index(['Tweet', 'label', 'votes', 'race_bi', 'race_all', 'aae_cont',
       'wae_cont'],
      dtype='object')

In [13]:
# Header labels written to the CSV, one per column of `final_tweets`, in order.
# NOTE(review): the 'aae_cont' / 'wae_cont' columns are exported as
# "aae_count" / "wae_count" — this looks like a typo for "cont" (continuous);
# confirm with downstream readers of the file before renaming.
export_header = ["text", "label", "votes", "race_bi", "race_all", "aae_count", "wae_count"]

final_tweets.to_csv(
    './race_predictions.csv',
    sep=',',
    encoding='utf-8',
    header=export_header,
    index=True,
)