In [10]:
import pandas as pd
import nltk
import spacy
import re
import contractions

## Import Data:

In [2]:
filepath = "../raw_data/dreaddit-train.csv"
test_filepath = "../raw_data/dreaddit-test.csv"

df = pd.read_csv(filepath)
df_test = pd.read_csv(test_filepath)

In [3]:
columns = ['text','label']

df = df[columns]

X_train = df[['text']]
y_train = df[['label']]

df_test = df_test[columns]
X_test = df_test[['text']]
y_test = df_test[['label']]

## Preprocessing:

In [4]:
nlp = spacy.load('en_core_web_sm')

def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

X_train['lem_text'] = X_train['text'].apply(lambda x: lemmatize_text(x))
X_test['lem_text'] = X_test['text'].apply(lambda x: lemmatize_text(x))

In [5]:
X_train.head()

Unnamed: 0,text,lem_text
0,"He said he had not felt that way before, sugge...","he say he have not feel that way before , sugg..."
1,"Hey there r/assistance, Not sure if this is th...","hey there r / assistance , not sure if this be..."
2,My mom then hit me with the newspaper and it s...,my mom then hit I with the newspaper and it sh...
3,"until i met my new boyfriend, he is amazing, h...","until I meet my new boyfriend , he be amazing ..."
4,October is Domestic Violence Awareness Month a...,October be Domestic Violence Awareness Month a...


In [6]:
texts = list(X_train.columns)
texts

['text', 'lem_text']

In [7]:
from nltk.tokenize.toktok import ToktokTokenizer

tokenizer = ToktokTokenizer()

stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')
stopword_list.remove('nor')

def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]|\[|\]' if not remove_digits else r'[^a-zA-Z\s]|\[|\]'
    text = re.sub(pattern, '', text)
    return text

def remove_stopwords(text, is_lower_case=False, stopwords=stopword_list):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    text = ' '.join(filtered_tokens)
    return text


def normalize_text(text, 
#                    html_stripping=True, 
#                    contraction_expansion_2=True,
#                    accented_char_removal=True, 
#                    text_lower_case=True,
#                    text_stemming=False,
                   text_lemmatization=True,
#                    special_char_removal=True,
#                    remove_digits=True,
                   stopword_removal=True,
                   stopwords=stopword_list):    # lemmatize text
    
    if text_lemmatization:
        text = lemmatize_text(text)

    # remove special characters and\or digits
#     if special_char_removal:
#         # insert spaces between special characters to isolate them
#         special_char_pattern = re.compile(r'([{.(-)!}])')
#         text = special_char_pattern.sub(" \\1 ", text)
#         text = remove_special_characters(text, remove_digits=remove_digits)

    # remove stopwords
    if stopword_removal:
        text = remove_stopwords(text, 
                                #is_lower_case=text_lower_case,
                                stopwords=stopwords)
    
    return text

In [11]:
import nltk
import spacy
import unicodedata
from contractions import contractions_dict
import re
from nltk.corpus import wordnet
import collections
#from textblob import Word
from nltk.tokenize.toktok import ToktokTokenizer

tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
nlp = spacy.load("en_core_web_sm")
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')
stopword_list.remove('nor')



#def correct_spellings_textblob(tokens):
#	return [Word(token).correct() for token in tokens]

def simple_porter_stemming(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

def remove_repeated_characters(text):
    repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
    match_substitution = r'\1\2\3'
    def replace(old_word):
        if wordnet.synsets(old_word):
            return old_word
        new_word = repeat_pattern.sub(match_substitution, old_word)
        return replace(new_word) if new_word != old_word else new_word

    tokens = nltk.word_tokenize(text)
    correct_tokens = [replace(word) for word in tokens]
    text = ' '.join(correct_tokens)
    return text

def expand_contractions_2(text,contractions_dict=contractions_dict):
    for word in text.split():
        if word.lower() in contractions_dict:
            text = text.replace(word, contractions_dict[word.lower()])
    return text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]|\[|\]' if not remove_digits else r'[^a-zA-Z\s]|\[|\]'
    text = re.sub(pattern, '', text)
    return text

def remove_stopwords(text, is_lower_case=False, stopwords=stopword_list):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    text = ' '.join(filtered_tokens)
    return text

def normalize_text(text, contraction_expansion_2=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_stemming=False, text_lemmatization=True,
                     special_char_removal=True, remove_digits=True,
                     stopword_removal=True, stopwords=stopword_list):



    # remove extra newlines
    text = text.translate(text.maketrans("\n\t\r", "   "))

    # remove accented characters
    if accented_char_removal:
        text = remove_accented_chars(text)

    # expand contractions
    if contraction_expansion_2:
        text = expand_contractions_2(text)

    # lemmatize text
    if text_lemmatization:
        text = lemmatize_text(text)

    # stem text
    if text_stemming and not text_lemmatization:
        text = simple_porter_stemming(text)

    # remove special characters and\or digits
    if special_char_removal:
        # insert spaces between special characters to isolate them
        special_char_pattern = re.compile(r'([{.(-)!}])')
        text = special_char_pattern.sub(" \\1 ", text)
        text = remove_special_characters(text, remove_digits=remove_digits)

    # remove extra whitespace
    text = re.sub(' +', ' ', text)

    # lowercase the text
    if text_lower_case:
        text = text.lower()

    # remove stopwords
    if stopword_removal:
        text = remove_stopwords(text, is_lower_case=text_lower_case, stopwords=stopwords)

    # remove extra whitespace
    text = re.sub(' +', ' ', text)
    text = text.strip()

    return text

In [12]:
X_train['lem_text'] = X_train['text'].apply(lambda x: lemmatize_text(x))
X_test['lem_text'] = X_test['text'].apply(lambda x: lemmatize_text(x))

X_train['norm_text'] = X_train['text'].apply(lambda x: normalize_text(x))
X_test['norm_text'] = X_test['text'].apply(lambda x: normalize_text(x))

## Models:

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
    
vectorizer = TfidfVectorizer()

X_train_vector = vectorizer.fit_transform(X_train.text)
X_test_vector = vectorizer.transform(X_test.text)

X_train_vector_lemm = vectorizer.fit_transform(X_train.lem_text)
X_test_vector_lemm = vectorizer.transform(X_test.lem_text)

X_train_vector_norm = vectorizer.fit_transform(X_train.norm_text)
X_test_vector_norm = vectorizer.transform(X_test.norm_text)

data_sets = [(X_train_vector, X_test_vector), 
        (X_train_vector_lemm, X_test_vector_lemm),
        (X_train_vector_norm, X_test_vector_norm)]
names = ['basic', 'lemma', 'norm']

In [14]:
print(X_train_vector.shape, y_train.shape)

(2838, 11516) (2838, 1)


In [15]:
# Try to fit and evaluate a Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB

model_NB = MultinomialNB()

for i, data in enumerate(data_sets):
    model_NB.fit(data[0], y_train.label)
    score = model_NB.score(data[1],y_test.label)
    print(names[i],score)

basic 0.6461538461538462
lemma 0.6643356643356644
norm 0.6559440559440559


In [16]:
from sklearn.svm import SVC

kernels = ['rbf', 'poly']
for i, data in enumerate(data_sets):
    for kernel in kernels:
        model_SVC = SVC(kernel= kernel)
        model_SVC.fit(data[0], y_train.label)
        score = model_SVC.score(data[1],y_test.label)
        print(names[i], kernel, round(score, 3), sep=', ')

basic, rbf, 0.722
basic, poly, 0.692
lemma, rbf, 0.733
lemma, poly, 0.705
norm, rbf, 0.733
norm, poly, 0.627


In [17]:
from sklearn.ensemble import GradientBoostingClassifier

model_gb = GradientBoostingClassifier()
#     random_state=4, subsample=0.8, max_features="auto", warm_start=True)

for i, data in enumerate(data_sets):
        model_gb.fit(data[0], y_train.label)
        score = model_gb.score(data[1],y_test.label)
        print(names[i], round(score, 3), sep=', ')

basic, 0.708
lemma, 0.697
norm, 0.701


In [18]:
X_train.head()

Unnamed: 0,text,lem_text,norm_text
0,"He said he had not felt that way before, sugge...","he say he have not feel that way before , sugg...",say not feel way suggete go rest trigger ahead...
1,"Hey there r/assistance, Not sure if this is th...","hey there r / assistance , not sure if this be...",hey r assistance not sure right place post go ...
2,My mom then hit me with the newspaper and it s...,my mom then hit I with the newspaper and it sh...,mom hit newspaper shock would know not like pl...
3,"until i met my new boyfriend, he is amazing, h...","until I meet my new boyfriend , he be amazing ...",meet new boyfriend amazing kind sweet good stu...
4,October is Domestic Violence Awareness Month a...,October be Domestic Violence Awareness Month a...,october domestic violence awareness month dome...
