# <center> Hate Speech Preprocessing pipeline</center>

## Imports

In [1]:
import os
import shutil
import time
import csv
import numpy as np
import pandas as pd
import sklearn
import torch
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import string
import re


In [2]:
#nltk.download('stopwords')

## Functions

In [3]:
def clean_text(text):
    # Remove punctuation and lower text
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    
    # Remove numeric numbers
    text = re.sub('[0-9]+', '', text)
    
    # Tokenization
    tokens = re.split('\W+', text)
    
    # Remove stopwords
    stopword = nltk.corpus.stopwords.words('english')
    stopword.extend(['user'])
    
    # Stemming (ex treated -> treat)
    ps = nltk.PorterStemmer()
    text = [ps.stem(word) for word in tokens if word not in stopword]
    
    return text

## Data Loading

In [4]:
df = pd.read_csv('../data/olid/olid-training-v1.0.tsv', sep="\t")
df = df.rename(columns={'tweet': 'text', 'subtask_a': 'labels'})
df = df[['text', 'labels']]
df

Unnamed: 0,text,labels
0,@USER She should ask a few native Americans wh...,OFF
1,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF
2,Amazon is investigating Chinese employees who ...,NOT
3,"@USER Someone should'veTaken"" this piece of sh...",OFF
4,@USER @USER Obama wanted liberals &amp; illega...,NOT
...,...,...
13235,@USER Sometimes I get strong vibes from people...,OFF
13236,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,NOT
13237,@USER And why report this garbage. We don't g...,OFF
13238,@USER Pussy,OFF


## Data Cleaning and Tokenization

In [5]:
# Remove punctuation and numbers
df['text'] = df['text'].apply(lambda x: clean_text(x))
df

Unnamed: 0,text,labels
0,"[ask, nativ, american, take]",OFF
1,"[go, home, drunk, maga, trump, url]",OFF
2,"[amazon, investig, chines, employe, sell, inte...",NOT
3,"[someon, shouldvetaken, piec, shit, volcano, ]",OFF
4,"[obama, want, liber, amp, illeg, move, red, st...",NOT
...,...,...
13235,"[sometim, get, strong, vibe, peopl, man, vibe,...",OFF
13236,"[benidorm, creamfield, maga, shabbi, summer]",NOT
13237,"[report, garbag, dont, give, crap]",OFF
13238,[pussi],OFF


## Vectorization

In [6]:
countVectorizer = CountVectorizer(analyzer=clean_text) 
countVector = countVectorizer.fit_transform(df['text'])
print('{} Number of tweets has {} words'.format(countVector.shape[0], countVector.shape[1]))

13240 Number of tweets has 12868 words


In [7]:
countVector.shape

(13240, 12868)