In [476]:
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer
import pandas as pd
from nltk.tokenize import word_tokenize
import contractions
from emot.emo_unicode import UNICODE_EMOJI, UNICODE_EMOJI_ALIAS, EMOTICONS_EMO
from flashtext import KeywordProcessor
from collections import Counter
from langdetect import detect
from sklearn.model_selection import train_test_split
#nltk.download("punkt")

In [452]:
def Tokenize(record):
    return word_tokenize(record)

In [453]:
def RemovePuncuation(record):
   return [word for word in record if word.isalpha()] 

In [454]:
def RemoveStopWords(record):
    return [word.lower() for word in record if word.lower() not in stopwords.words('english')]

In [455]:
def RemoveUrls(record, httpUrls, noHttpUrls):
    return [word for word in record if re.search(httpUrls, word) is None and re.search(noHttpUrls, word) is None]

In [456]:
def Lemmatize(record, lemmatizer):
    return map(lemmatizer.lemmatize, record)

In [457]:
def RemoveContractions(record):
    temp = [] 
    for word in record.split():
        temp.append(contractions.fix(word))
    return ' '.join(temp)

In [458]:
def UnwrapSlangExpressions(record, chat_expressions_dict):
    return re.sub(r'\S+', lambda m: chat_expressions_dict.get(m.group().upper(), m.group()) , record)

In [459]:
def ConvertEmojisAndEmoticonsToText(record, all_emoji_and_emoticons):
    return all_emoji_and_emoticons.replace_keywords(record)

In [460]:
def FindRareAndFrequentWords(dataFrame, minThreshhold = 0, maxThreshhold = 1000000):
    counts = Counter()
    dataFrame['Text'].str.lower().str.split().apply(counts.update)

    frequentWords = [word for word in counts if counts[word] >= maxThreshhold]
    rareWords = [word for word in counts if counts[word] <= minThreshhold]
    return rareWords, frequentWords

In [461]:
def RemoveRareAndFrequentWords(record, rareWords, frequentWords):
    return [word for word in record if word not in rareWords and word not in frequentWords]

In [462]:
def RemoveNonEnglishRecord(record):
    if(record):
        return record if detect(record) == 'en' else ''

In [463]:
httpUrls = "^https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)$"
noHttpUrls = "^[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)$"

tokenizer = RegexpTokenizer(r'\w+')

lemmatizer = WordNetLemmatizer()

chat_expressions_url = "https://raw.githubusercontent.com/MFuchs1989/Datasets-and-Miscellaneous/main/datasets/NLP/Text%20Pre-Processing%20VII%20(Special%20Cases)/chat_expressions.csv" 
chat_expressions = pd.read_csv(chat_expressions_url, on_bad_lines='skip')
chat_expressions_dict = dict(zip(chat_expressions.Chat_Words, chat_expressions.Chat_Words_Extended))

all_emoji_emoticons = {**EMOTICONS_EMO,**UNICODE_EMOJI_ALIAS, **UNICODE_EMOJI_ALIAS}
all_emoji_emoticons = {k:v.replace(":","").replace("_"," ").strip() for k,v in all_emoji_emoticons.items()}
kp_all_emoji_emoticons = KeywordProcessor()
for k,v in all_emoji_emoticons.items():
    kp_all_emoji_emoticons.add_keyword(k, v)



In [464]:
def Preprocess(dataFrame):
    for index, row in dataFrame.iterrows():
        dataFrame.at[index, 'Text'] = ConvertEmojisAndEmoticonsToText(dataFrame.at[index, 'Text'], kp_all_emoji_emoticons)
        dataFrame.at[index, 'Text'] = UnwrapSlangExpressions(dataFrame.at[index, 'Text'], chat_expressions_dict)
        dataFrame.at[index, 'Text'] = RemoveContractions(dataFrame.at[index, 'Text'])
        dataFrame.at[index, 'Text'] = Tokenize(dataFrame.at[index, 'Text'])
        dataFrame.at[index, 'Text'] = RemoveUrls(dataFrame.at[index, 'Text'], httpUrls, noHttpUrls)
        dataFrame.at[index, 'Text'] = RemoveStopWords(dataFrame.at[index, 'Text'])
        dataFrame.at[index, 'Text'] = RemovePuncuation(dataFrame.at[index, 'Text'])
        dataFrame.at[index, 'Text'] = Lemmatize(dataFrame.at[index, 'Text'], lemmatizer)
        dataFrame.at[index, 'Text'] = ' '.join(dataFrame.at[index, 'Text'])
        dataFrame.at[index, 'Text'] = RemoveNonEnglishRecord(dataFrame.at[index, 'Text'])

    return dataFrame

In [465]:
def PreprocessPartTwo(dataFrame):
    rareWords, frequentWords = FindRareAndFrequentWords(dataFrame, minThreshhold=1)
    for index, row in dataFrame.iterrows():
        dataFrame.at[index, 'Text'] = Tokenize(dataFrame.at[index, 'Text'])
        dataFrame.at[index, 'Text'] = RemoveRareAndFrequentWords(dataFrame.at[index, 'Text'], rareWords, frequentWords)
        dataFrame.at[index, 'Text'] = ' '.join(dataFrame.at[index, 'Text'])

    return dataFrame

In [466]:
def ClearEmptyValues(dataFrame):
    dataFrame = dataFrame[dataFrame.Text != '']
    dataFrame.dropna(how='any', inplace=True)
    return dataFrame

In [467]:
def WritePreprocessedData(prepprocessedData, fileName):
    prepprocessedData.to_csv('Datasets/ProcessedData/' + fileName + '.csv', columns=['Emotion', 'Text'], index = False)

In [468]:
def ProcessDataSet(dataFrame, fileName):
    dataFrame = (
    dataFrame
    .pipe(Preprocess)
    .pipe(ClearEmptyValues)
    .pipe(PreprocessPartTwo)
    .pipe(ClearEmptyValues))

    WritePreprocessedData(dataFrame, fileName)

In [470]:
binaryDataset = pd.read_csv("Datasets/RawData/binary-sentiment.csv")
linesBinaryDataset = binaryDataset.reset_index()
print(len(linesBinaryDataset))
ProcessDataSet(linesBinaryDataset, 'binaryPreprocessedData')

63900


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataFrame.dropna(how='any', inplace=True)


In [472]:
linesmultiClassDataset = pd.read_csv('Datasets/RawData/merged.csv')
linesmultiClassDataset = linesmultiClassDataset.reset_index()
ProcessDataSet(linesmultiClassDataset, 'multiClassPreprocessedData')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataFrame.dropna(how='any', inplace=True)


In [477]:
def CreateTrainTestSplit(datasetType):
    df = pd.read_csv('Datasets/ProcessedData/' + datasetType + '.csv')
    df = df.reset_index()

    X_train, X_test, y_train, y_test = train_test_split(df.Text, df.Emotion, 
                            random_state=22,
                            test_size=0.3, shuffle=True)

    train_tuples = list(zip(y_train,X_train))
    train_df = pd.DataFrame(train_tuples, columns=["Emotion", "Text"])

    test_tuples = list(zip(y_test,X_test))
    test_df = pd.DataFrame(test_tuples, columns=["Emotion", "Text"])

    WritePreprocessedData(train_df, "Train/" + datasetType)
    WritePreprocessedData(test_df, "Test/" + datasetType)


In [478]:
CreateTrainTestSplit("binaryPreprocessedData")
CreateTrainTestSplit("multiClassPreprocessedData")