In [None]:
import nltk
import inflect
import contractions
import re 
import string 
import unicodedata
import joblib
import warnings
import pickle

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Silence every warning category for the rest of the kernel session. This is a
# blanket filter: deprecation and data-quality warnings raised by any later
# cell are hidden as well.
warnings.filterwarnings("ignore")

In [None]:
# Fetch the NLTK resources used by the text helpers defined in this notebook:
# 'stopwords' for the stopword list, 'punkt' for word_tokenize, and 'wordnet'
# (presumably together with 'omw-1.4') for WordNetLemmatizer.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' rather than
# 'punkt' when tokenizing — confirm against the installed version.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')

In [None]:
def removeNoise(text):
    """Strip markup, links, mentions and non-letter characters from raw text.

    Steps, in order: lowercase the input, extract the visible text with
    BeautifulSoup's "html.parser", pass it through ``contractions.fix``, then
    delete every ``http...`` token, every ``@...`` token, and finally every
    character that is not an ASCII letter or whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercase string.
    """
    lowered = text.lower()
    visibleText = BeautifulSoup(lowered, "html.parser").get_text()
    cleaned = contractions.fix(visibleText)
    # Order matters: links and mentions must be removed before the final
    # pattern strips the ':' '/' and '@' characters that identify them.
    for pattern in (r"http\S+", r"@[^\s]+", r"[^a-zA-Z\s]"):
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned

def tokenize(text):
    """Split a string into a list of tokens using ``nltk.word_tokenize``."""
    tokens = nltk.word_tokenize(text)
    return tokens

def removeNonaASCII(words):
    """Return the tokens with every non-ASCII character dropped.

    Each token is NFKD-normalised first, so an accented letter is split into
    its base letter plus combining marks; encoding to ASCII with
    ``errors='ignore'`` then keeps the base letter and discards the rest.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with one (possibly empty) ASCII-only string per input token.
    """
    return [
        unicodedata.normalize('NFKD', token)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        for token in words
    ]

def toLowercase(words):
    """Return a new list with every token converted to lowercase.

    Args:
        words: Iterable of string tokens.

    Returns:
        A list of the lowercased tokens, in the original order.
    """
    return [token.lower() for token in words]

def removePunctuation(words):
    """Strip punctuation from each token and drop tokens that become empty.

    Every character that is neither a word character (``\\w``, which includes
    the underscore) nor whitespace is removed.

    Args:
        words: Iterable of string tokens.

    Returns:
        A list of the non-empty stripped tokens, in the original order.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def replaceNumbers(words):
    """Replace all-digit tokens with the value of ``inflect``'s ``number_to_words``.

    Tokens for which ``str.isdigit()`` is false are passed through unchanged.

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list with the same length and order as the input.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def removeStopwords(words):
    """Drop English stopwords from a list of tokens.

    Args:
        words: Iterable of string tokens. The comparison is case-sensitive, so
            tokens should already be lowercased.

    Returns:
        A new list containing only the tokens that are not in NLTK's English
        stopword list, in the original order.
    """
    # Load the stopword list once per call instead of once per token, and keep
    # it in a set so each membership test is O(1) rather than a list scan.
    englishStopwords = set(stopwords.words('english'))
    return [word for word in words if word not in englishStopwords]

def stemWords(words):
    """Return the Lancaster stem of every token.

    Args:
        words: Iterable of string tokens.

    Returns:
        A list of stems, one per input token, in the original order.
    """
    stemmer = LancasterStemmer()
    return list(map(stemmer.stem, words))

def lemmatizeVerbs(words):
    """Lemmatize every token with WordNet, treating each one as a verb.

    Args:
        words: Iterable of string tokens.

    Returns:
        A list of lemmas, one per input token, in the original order.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos='v') for token in words]

def normalizeText(words):
    """Run a token list through the normalisation stages, in a fixed order.

    The stages are: ASCII folding, lowercasing, punctuation removal, stopword
    removal, Lancaster stemming, then verb lemmatization. ``replaceNumbers``
    is not part of this pipeline.

    Args:
        words: List of string tokens.

    Returns:
        The list of normalised tokens.
    """
    stages = (
        removeNonaASCII,
        toLowercase,
        removePunctuation,
        removeStopwords,
        stemWords,
        lemmatizeVerbs,
    )
    for stage in stages:
        words = stage(words)
    return words

def processText(text):
    """Clean a raw string and return its normalised tokens joined by spaces.

    Applies ``removeNoise``, then ``tokenize``, then ``normalizeText``.

    Args:
        text: The raw string to process.

    Returns:
        A single string of normalised tokens separated by single spaces.
    """
    tokens = tokenize(removeNoise(text))
    return ' '.join(normalizeText(tokens))

In [None]:
# Load the training CSV, keep only the text and target columns, and give them
# the names the rest of the notebook uses ("Message" and "Label").
data = (
    pd.read_csv("./dataset/FYP_train.csv")[["text", "target"]]
    .rename(columns={"text": "Message", "target": "Label"})
)

rowCount, columnCount = data.shape
print("Number of rows in data:", rowCount)
print("Number of columns in data:", columnCount)

data.head()

In [None]:
# Series.apply returns a new Series and leaves the frame untouched, so the
# result must be assigned back; otherwise every later cell keeps working on
# the raw, unprocessed messages.
data["Message"] = data["Message"].apply(processText)
data.head()

In [None]:
# Write the current `data` frame to CSV. The DataFrame index is written as well
# (pandas default), so the file gets an extra unnamed leading column.
data.to_csv("./dataset/FYP_train_processed.csv")

In [None]:
# Fit a TF-IDF vectorizer on the Message column and keep the resulting
# document-term matrix. max_features caps the vocabulary at 500 terms.
tfidfSettings = {
    "encoding": "utf-8",
    "strip_accents": "unicode",
    "stop_words": "english",
    "lowercase": True,
    "max_features": 500,
}
vectorizer = TfidfVectorizer(**tfidfSettings)
tfidf_result = vectorizer.fit_transform(data["Message"])
tfidf_result.shape

In [None]:
# Persist the fitted vectorizer. The context manager guarantees the file is
# flushed and closed even if pickling fails, instead of leaving the handle to
# the garbage collector.
with open('./models/message_vectorizer.pkl', 'wb') as vectorizerFile:
    pickle.dump(vectorizer, vectorizerFile)

In [None]:
# Convert the TF-IDF result to an array via toarray() and take the Label column
# as the target values.
FYP_train_X_TFIDF = tfidf_result.toarray()
FYP_train_Y = data["Label"].values

# Save features and targets as .npy files under ./dataset.
np.save("./dataset/FYP_train_X_TFIDF.npy", FYP_train_X_TFIDF)
np.save("./dataset/FYP_train_Y.npy", FYP_train_Y)

In [None]:
# Build a {word: vector} lookup from the GloVe text file, where each line is a
# word followed by its whitespace-separated float components.
embeddings = {}
with open("./dataset/glove.6B.50d.txt", encoding="utf8") as glove:
    for line in glove:
        values = line.split()
        if not values:
            # Blank line: nothing to parse (values[0] would raise IndexError).
            continue
        word = values[0]
        try:
            vector = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # Malformed row: skip it rather than storing the previous line's
            # vector under this word, which is what a swallowed error followed
            # by an unconditional assignment would do.
            continue
        embeddings[word] = vector

# Persist the lookup; the context manager closes the file deterministically.
with open('./dataset/glove_embeddings.pkl', 'wb') as embeddingsFile:
    pickle.dump(embeddings, embeddingsFile)
print(f'{len(embeddings)} Word vectors')

In [None]:
# Fit a Keras tokenizer on the Message column and turn each message into a
# fixed-length integer sequence (maxlen=500).
tokenizer = Tokenizer(num_words=75000)
tokenizer.fit_on_texts(data["Message"])
sequences = tokenizer.texts_to_sequences(data["Message"])
paddedSequence = pad_sequences(sequences, maxlen=500)
wordIndex = tokenizer.word_index
print(f'{len(wordIndex)} Unique tokens')

# Persist the fitted tokenizer; the context manager guarantees the file is
# flushed and closed instead of leaking the handle.
with open('./models/message_tokenizer.pkl', 'wb') as tokenizerFile:
    pickle.dump(tokenizer, tokenizerFile)

In [None]:
# Expose the padded sequence array under its dataset name and save it as a
# .npy file under ./dataset.
FYP_train_X_SEQ = paddedSequence

np.save("./dataset/FYP_train_X_SEQ.npy", FYP_train_X_SEQ)

In [None]:
# Sanity check: display the shape of the padded sequence array.
FYP_train_X_SEQ.shape

In [None]:
# Sanity check: display the shape of the TF-IDF feature array.
FYP_train_X_TFIDF.shape