<a href="https://colab.research.google.com/github/povembu/NLP-project-D590/blob/main/NLP_Project_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Final Project Part 2 Short Version
# Text Classification of Online Hate Speech

## By Binh Bui, Pooja Rajan, and Alexander Watkins


## Dataset Prep

### Dataset Loading

In [1]:
!pip install datasets #install datasets library for HuggingFace

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.5-py3-none-any.whl (7.8 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.5


In [2]:
from datasets import load_dataset
import numpy as np
import pandas as pd
import spacy
import unicodedata
import re
from nltk.corpus import wordnet
from nltk.tokenize.toktok import ToktokTokenizer
import nltk
# Fetch the NLTK stop-word corpus; the English list from it is read later to
# build `stop_words`.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Identifier of the dataset to fetch through the Hugging Face `datasets` loader.
DATASET_NAME = 'hate_speech18'
dataset = load_dataset(DATASET_NAME)

Downloading builder script:   0%|          | 0.00/3.41k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.61k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.19M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10944 [00:00<?, ? examples/s]

### Dataset Reformatting

In [4]:
# Ask the dataset to return pandas objects restricted to these columns, then
# slice the whole 'train' split out as a single DataFrame.
selected_columns = ['text', 'user_id', 'subforum_id', 'num_contexts', 'label']
dataset.set_format(type='pandas', columns=selected_columns)
hate_df = dataset['train'][:]

### Dataset Filtering

In [5]:
# Discard rows labelled 2 or 3 and keep only the columns the model needs
# (the classification report further down is computed over labels 0 and 1).
excluded_labels = [2, 3]
keep_rows = ~hate_df['label'].isin(excluded_labels)
hate_df = hate_df.loc[keep_rows, ['text', 'label']]

## Train/test split

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label column and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    hate_df['text'],
    hate_df['label'],
    test_size=0.2,
    random_state=42,
    stratify=hate_df['label'],
)

In [7]:
# Convert each piece of the split into a NumPy array, in one pass.
train_X, test_X, train_y, test_y = (
    np.array(split_part) for split_part in (X_train, X_test, y_train, y_test)
)

## Text cleaning and normalization

In [8]:
# Start from NLTK's English stop-word list, then take 'no', 'but' and 'not'
# out of it so those words survive stop-word removal.
stop_words = nltk.corpus.stopwords.words('english')
for kept_word in ('no', 'but', 'not'):
    stop_words.remove(kept_word)

In [9]:
# spaCy's small English pipeline, used only by lemmatize_text, which reads
# nothing but `lemma_`. The dependency parser and named-entity recogniser are
# therefore disabled so each nlp(...) call does less work per document.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Tokenizer used by remove_stopwords to split text into tokens.
tokenizer = ToktokTokenizer()

In [10]:
def simple_porter_stemming(text):
    """Stem every whitespace-separated word of ``text`` with NLTK's Porter stemmer.

    Words are obtained with ``str.split()`` and the stems are re-joined with
    single spaces, so any run of whitespace in the input becomes one space.

    Args:
        text: The string to stem.

    Returns:
        The stemmed words joined by single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

def lemmatize_text(text):
    """Return ``text`` with each spaCy token replaced by its lemma.

    The text is parsed with the module-level ``nlp`` pipeline. A token whose
    lemma is the placeholder ``'-PRON-'`` keeps its original surface form.
    The resulting pieces are joined with single spaces.

    Args:
        text: The string to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    pieces = []
    for token in nlp(text):
        lemma = token.lemma_
        pieces.append(token.text if lemma == '-PRON-' else lemma)
    return ' '.join(pieces)

def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, whitespace or digit.

    Args:
        text: The string to clean.
        remove_digits: When true, ASCII digits are deleted as well.

    Returns:
        ``text`` with the unwanted characters removed. Nothing is inserted in
        their place, so characters on either side of a removed one become
        adjacent.
    """
    if remove_digits:
        pattern = r'[^a-zA-Z\s]|\[|\]'
    else:
        pattern = r'[^a-zA-Z0-9\s]|\[|\]'
    return re.sub(pattern, '', text)

def remove_stopwords(text, is_lower_case=False, stopwords=stop_words):
    """Drop stop words from ``text``.

    The text is split with the module-level ``tokenizer`` and each token is
    stripped of surrounding whitespace. A token is discarded when it appears
    in ``stopwords``; unless ``is_lower_case`` is true, the comparison uses the
    token's lowercase form while the kept token retains its original casing.

    Args:
        text: The string to filter.
        is_lower_case: Set to true when ``text`` is already lowercase, which
            skips the per-token lowercasing before the lookup.
        stopwords: Collection of words to remove.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        lookup_key = token if is_lower_case else token.lower()
        if lookup_key not in stopwords:
            kept.append(token)
    return ' '.join(kept)

def normalize_text(corpus, text_lower_case=True,
                     text_stemming=False, text_lemmatization=True,
                     special_char_removal=True, remove_digits=True,
                     stopword_removal=True,stopwords=stop_words):
    """Clean and normalise every document in ``corpus``.

    Each document goes through these steps in order; the optional ones are
    controlled by the corresponding flag:

    1. Newlines, tabs and carriage returns are replaced by spaces.
    2. Lemmatization via ``lemmatize_text`` (``text_lemmatization``).
    3. Porter stemming via ``simple_porter_stemming`` (``text_stemming``);
       applied only when lemmatization is switched off.
    4. Selected punctuation is padded with spaces and then every special
       character (and digit, if ``remove_digits``) is deleted
       (``special_char_removal``).
    5. Runs of spaces are collapsed.
    6. Lowercasing (``text_lower_case``).
    7. Stop-word removal via ``remove_stopwords`` (``stopword_removal``).
    8. Runs of spaces are collapsed again and the ends are stripped.

    Args:
        corpus: Iterable of document strings.
        text_lower_case: Lowercase each document.
        text_stemming: Stem each document (ignored while lemmatization is on).
        text_lemmatization: Lemmatize each document.
        special_char_removal: Remove special characters.
        remove_digits: Also remove digits during special-character removal.
        stopword_removal: Remove stop words.
        stopwords: Collection of stop words handed to ``remove_stopwords``.

    Returns:
        A list with one normalised string per input document, in input order.
    """
    # Punctuation that is padded with spaces before special characters are
    # deleted, so the words on either side stay separate tokens. The hyphen is
    # escaped on purpose: written as "(-)" inside a character class it is read
    # as the range "(" to ")", which leaves "-" unpadded and lets "a-b"
    # collapse into "ab" once the hyphen is deleted.
    special_char_pattern = re.compile(r'([{.()\-!}])')
    # Translation table mapping newline, tab and carriage return to a space.
    whitespace_table = str.maketrans("\n\t\r", "   ")

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:

        # replace newlines, tabs and carriage returns with spaces
        doc = doc.translate(whitespace_table)

        # lemmatize, or stem when lemmatization is disabled
        if text_lemmatization:
            doc = lemmatize_text(doc)
        elif text_stemming:
            doc = simple_porter_stemming(doc)

        # remove special characters and/or digits
        if special_char_removal:
            # isolate the punctuation first so deleting it cannot merge words
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)

        # collapse runs of spaces left behind by the removals
        doc = re.sub(' +', ' ', doc)

        # lowercase the text
        if text_lower_case:
            doc = doc.lower()

        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case, stopwords=stopwords)

        # collapse spaces again and trim the ends
        doc = re.sub(' +', ' ', doc)
        doc = doc.strip()

        normalized_corpus.append(doc)

    return normalized_corpus

In [11]:
# Run the training and test texts through the same cleaning pipeline.
norm_train_X, norm_test_X = (
    normalize_text(split_texts, stopwords=stop_words)
    for split_texts in (train_X, test_X)
)

## Vectorization

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over unigrams and bigrams. The vocabulary is learned from
# the normalised training texts only.
cv = CountVectorizer(ngram_range=(1, 2), stop_words=stop_words)
cv.fit(norm_train_X)

# Project both splits onto the vocabulary learned from the training texts.
cv_train_features = cv.transform(norm_train_X)
cv_test_features = cv.transform(norm_test_X)

In [13]:
# Turn the count matrices into dense DataFrames whose columns are named after
# the vectorizer's vocabulary terms.
feature_names = cv.get_feature_names_out()
cv_train_X = pd.DataFrame(cv_train_features.toarray(), columns=feature_names)
cv_test_X = pd.DataFrame(cv_test_features.toarray(), columns=feature_names)

## Model Creation
- X:
  * CountVectorizer: cv_train_X, cv_test_X
- y: train_y, test_y

In [14]:
from sklearn.linear_model import LogisticRegression

from sklearn import metrics
from sklearn.model_selection import cross_val_score

#### Logistic Regression using CountVectorizer

In [15]:
# Logistic regression with balanced class weights and a fixed seed, fitted on
# the training count features (fit returns the estimator itself).
lr_cv = LogisticRegression(class_weight='balanced', random_state=42).fit(cv_train_X, train_y)

# Predicted labels for the held-out test features.
y_pred_cv_lr = lr_cv.predict(cv_test_X)

In [16]:
# Per-class precision, recall and F1 on the held-out split, restricted to
# labels 0 and 1.
lr_cv_report = metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred_cv_lr,
    labels=[0, 1],
)
print(lr_cv_report)

              precision    recall  f1-score   support

           0       0.93      0.96      0.94      1902
           1       0.57      0.40      0.47       239

    accuracy                           0.90      2141
   macro avg       0.75      0.68      0.71      2141
weighted avg       0.89      0.90      0.89      2141



## Classification test using the best model (Logistic Regression on CountVectorizer features)

In [17]:
# Score one hand-picked sentence with the fitted pipeline: clean it with
# normalize_text, vectorize it with the CountVectorizer fitted on the training
# texts, then ask the logistic regression model for class probabilities.
# normalize_text iterates over its argument, so the sentence is wrapped in a list.
text_sample=["new foreigners , who reckon they can live amongest superiour , pagan blood"]

norm_text = normalize_text(text_sample)

cv_text = cv.transform(norm_text)
# `scores` is indexed as scores[:, 1] by the next cell (column 1 is compared to 0.5).
scores = lr_cv.predict_proba(cv_text)



In [18]:
print(text_sample)
# Judge each row of `scores` separately. Using the whole column in a single
# `if` only works when there is exactly one sample; with more than one, the
# truth value of the comparison array is ambiguous and NumPy raises ValueError.
# Column 1 is treated as the probability of the hate class (label 1).
for hate_probability in scores[:, 1]:
    if hate_probability > 0.5:
        print("This comment contains hate")
    else:
        print("Not a hate comment")

['new foreigners , who reckon they can live amongest superiour , pagan blood']
This comment contains hate


In [19]:
# Score a second hand-picked sentence the same way: clean it with
# normalize_text, vectorize it with the CountVectorizer fitted on the training
# texts, then ask the logistic regression model for class probabilities.
# normalize_text iterates over its argument, so the sentence is wrapped in a list.
text_sample=["This whole situation starts to become really bizarre ."]

norm_text = normalize_text(text_sample)

cv_text = cv.transform(norm_text)
# `scores` is indexed as scores[:, 1] by the next cell (column 1 is compared to 0.5).
scores = lr_cv.predict_proba(cv_text)

In [20]:
print(text_sample)
# Judge each row of `scores` separately. Using the whole column in a single
# `if` only works when there is exactly one sample; with more than one, the
# truth value of the comparison array is ambiguous and NumPy raises ValueError.
# Column 1 is treated as the probability of the hate class (label 1).
for hate_probability in scores[:, 1]:
    if hate_probability > 0.5:
        print("This comment contains hate")
    else:
        print("Not a hate comment")

['This whole situation starts to become really bizarre .']
Not a hate comment
