<a href="https://colab.research.google.com/github/povembu/NLP-project-D590/blob/main/nlp_project_model_predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hate Speech Text Classification

## Load Packages

In [1]:
import numpy as np
import pandas as pd
import spacy
import unicodedata
import re
from nltk.corpus import wordnet
from nltk.tokenize.toktok import ToktokTokenizer
import nltk
import pickle
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# Fetch the NLTK stop-word corpus that the stop_words list is built from;
# NLTK reports the package as up-to-date when it is already present.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Prepare NLP Functions

In [2]:
# Start from NLTK's English stop-word list, then take 'no', 'but' and 'not'
# out of it so those words survive stop-word filtering (presumably because
# negation/contrast words carry signal for this classifier).
stop_words = nltk.corpus.stopwords.words('english')
for kept_word in ('no', 'but', 'not'):
    stop_words.remove(kept_word)

In [3]:
# Tokenizer used by remove_stopwords to split text into tokens.
tokenizer = ToktokTokenizer()
# spaCy pipeline that lemmatize_text runs to obtain per-token lemmas.
nlp = spacy.load('en_core_web_sm')

In [4]:
def simple_porter_stemming(text):
    """Stem each whitespace-separated word of ``text`` with NLTK's Porter stemmer.

    Args:
        text: Input string; it is split on whitespace.

    Returns:
        The stemmed words joined back together with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in text.split()]
    return ' '.join(stemmed_words)

def lemmatize_text(text):
    """Replace every token of ``text`` with its spaCy lemma.

    Args:
        text: Input string; it is parsed with the module-level ``nlp`` pipeline.

    Returns:
        The lemmas joined with single spaces. A token whose lemma is the
        placeholder ``'-PRON-'`` contributes its original text instead.
    """
    lemmas = []
    for token in nlp(text):
        # '-PRON-' is a placeholder rather than a real lemma (older spaCy
        # releases reportedly emit it for pronouns), so keep the surface form.
        if token.lemma_ == '-PRON-':
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)

def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: Input string.
        remove_digits: When true, digits are deleted as well.

    Returns:
        ``text`` with the unwanted characters removed (nothing is inserted in
        their place, so surrounding whitespace is left as it was).
    """
    if remove_digits:
        pattern = r'[^a-zA-Z\s]|\[|\]'
    else:
        pattern = r'[^a-zA-Z0-9\s]|\[|\]'
    return re.sub(pattern, '', text)

def remove_stopwords(text, is_lower_case=False, stopwords=stop_words):
    """Drop stop words from ``text``.

    Args:
        text: Input string; it is split with the module-level ``tokenizer``.
        is_lower_case: Set to true when ``text`` is already lower-cased, in
            which case tokens are compared as-is. Otherwise each token is
            lower-cased for the comparison only (its original casing is kept
            in the output).
        stopwords: Collection of words to drop; defaults to ``stop_words``.

    Returns:
        The surviving tokens joined with single spaces.
    """
    def is_stopword(token):
        candidate = token if is_lower_case else token.lower()
        return candidate in stopwords

    stripped_tokens = (raw.strip() for raw in tokenizer.tokenize(text))
    return ' '.join(token for token in stripped_tokens if not is_stopword(token))

def normalize_text(corpus, text_lower_case=True,
                     text_stemming=False, text_lemmatization=True,
                     special_char_removal=True, remove_digits=True,
                     stopword_removal=True,stopwords=stop_words):
    """Normalize every document of ``corpus`` and return the results as a list.

    Steps, applied to each document in this order:
      1. newline, tab and carriage-return characters become spaces;
      2. lemmatization, or Porter stemming when lemmatization is off;
      3. selected punctuation is padded with spaces, then special characters
         (and optionally digits) are deleted;
      4. runs of spaces are collapsed;
      5. lower-casing;
      6. stop-word removal;
      7. runs of spaces are collapsed again and the ends are stripped.

    Args:
        corpus: Iterable of strings. A single string is treated as a
            one-document corpus instead of being iterated character by
            character.
        text_lower_case: Lower-case each document.
        text_stemming: Apply ``simple_porter_stemming``; only takes effect
            when ``text_lemmatization`` is false.
        text_lemmatization: Apply ``lemmatize_text``.
        special_char_removal: Apply ``remove_special_characters``.
        remove_digits: Forwarded to ``remove_special_characters``.
        stopword_removal: Apply ``remove_stopwords``.
        stopwords: Stop-word collection forwarded to ``remove_stopwords``.

    Returns:
        A list with one normalized string per input document.
    """
    # Iterating a bare string would normalize it one character at a time.
    if isinstance(corpus, str):
        corpus = [corpus]

    # Loop-invariant helpers, built once instead of once per document.
    whitespace_table = str.maketrans("\n\t\r", "   ")
    # NOTE(review): inside the character class, "(-)" is a range from "(" to
    # ")", so a literal "-" is NOT padded with spaces here and hyphenated words
    # get fused when the hyphen is deleted. The pattern is kept verbatim because
    # the saved training data was presumably normalized with it; confirm before
    # changing.
    special_char_pattern = re.compile(r'([{.(-)!}])')

    normalized_corpus = []
    for doc in corpus:
        # replace newlines, tabs and carriage returns with spaces
        doc = doc.translate(whitespace_table)

        # lemmatization takes precedence over stemming
        if text_lemmatization:
            doc = lemmatize_text(doc)
        elif text_stemming:
            doc = simple_porter_stemming(doc)

        # remove special characters and/or digits
        if special_char_removal:
            # isolate the matched punctuation with spaces before deleting it,
            # so the words on either side of it stay separate
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)

        # collapse runs of spaces
        doc = re.sub(' +', ' ', doc)

        # lowercase the text
        if text_lower_case:
            doc = doc.lower()

        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case, stopwords=stopwords)

        # collapse runs of spaces left behind and trim both ends
        doc = re.sub(' +', ' ', doc).strip()

        normalized_corpus.append(doc)

    return normalized_corpus

## Load Model and Training Data

In [5]:
from google.colab import drive

# Mount Google Drive; the pickled training data and model read by later
# cells live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Load the processed training data.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
TRAIN_DATA_PATH = '/content/drive/My Drive/Colab Notebooks/NLP Class/Final Project/train_data.pkl'
with open(TRAIN_DATA_PATH, 'rb') as train_file:
    norm_train_X = pickle.load(train_file)

In [7]:
# Build bag-of-words features (unigrams and bigrams) from the training text,
# so new text can be transformed with the vocabulary learned here.
bow_vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words=stop_words)
cv = bow_vectorizer.fit(norm_train_X)

In [8]:
# Load the saved logistic regression model.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
LR_MODEL_PATH = '/content/drive/My Drive/Colab Notebooks/NLP Class/Final Project/lr_model.pkl'
with open(LR_MODEL_PATH, 'rb') as model_file:
    lr_cv_load = pickle.load(model_file)

## Classify Text

In [None]:
#grab text sample from user input
#text_sample = user.input()

In [9]:
# Normalize the input text and convert it into feature vectors.
# text_sample is a list because normalize_text iterates over a corpus of
# documents and returns one normalized string per document.
text_sample=["new foreigners , who reckon they can live amongest superiour , pagan blood"]

# Apply the normalization steps defined by normalize_text (default options).
norm_text = normalize_text(text_sample)

# Vectorize with the CountVectorizer fitted on the training text, then ask the
# loaded model for class probabilities (one row per input document).
cv_text = cv.transform(norm_text)
scores = lr_cv_load.predict_proba(cv_text)



In [10]:
# Probability above which a comment is reported as hate speech.
HATE_THRESHOLD = 0.5

print(text_sample)
# scores[:, 1] holds one probability per input comment (column 1 is treated
# as the hate class). Testing the whole array in a single `if` raises
# "truth value of an array ... is ambiguous" as soon as text_sample holds more
# than one comment, so report a verdict per comment, in input order.
for hate_probability in scores[:, 1]:
    if hate_probability > HATE_THRESHOLD:
        print("This comment contains hate")
    else:
        print("Not a hate comment")

['new foreigners , who reckon they can live amongest superiour , pagan blood']
This comment contains hate
