#### Natural Language Processing (NLP) Pre-processing/cleaning function

In [2]:
def keep_words_only(text):
    """Lowercase ``text`` and keep only its word tokens.

    Every maximal run of word characters (letters, digits, underscore) is
    kept; everything else (punctuation, symbols, whitespace) acts as a
    separator and is dropped.

    Args:
        text: String to clean.

    Returns:
        The lower-cased word tokens joined by single spaces; an empty string
        when ``text`` contains no word characters.
    """
    import re

    # Raw string so that ``\w`` reaches the regex engine untouched instead of
    # being treated as an (invalid) string escape sequence.
    words = re.findall(r"\w+", text.lower())

    return ' '.join(words)

In [4]:
def clean_text_stem(text):
    """Tokenize ``text``, drop English stop words and Porter-stem the rest.

    The text is lower-cased and split into runs of word characters, tokens
    found in NLTK's English stop-word list are removed, and each remaining
    token is stemmed with ``PorterStemmer``.

    Args:
        text: String to clean.

    Returns:
        The stemmed, non-stop-word tokens joined by single spaces.
    """
    # Raw string so that ``\w`` is passed to the regex engine as written.
    re_tokenizer = RegexpTokenizer(r"\w+")
    p_stemmer = PorterStemmer()

    # Tokenize the text; tokens are already lower-case at this point.
    words = re_tokenizer.tokenize(text.lower())

    # A set makes the per-token stop-word lookup O(1) instead of a list scan.
    stop_words = set(stopwords.words('english'))

    # To also drop domain-specific names, add them to the set, e.g.:
    #     stop_words.update(['harry', 'potter', 'hp', 'lotr', 'tolkien'])

    no_stops_stemmed = [p_stemmer.stem(word) for word in words if word not in stop_words]

    return ' '.join(no_stops_stemmed)

In [5]:
def clean_text_lem(text):
    """Tokenize ``text``, drop English stop words and lemmatize the rest.

    The text is lower-cased and split into runs of word characters, tokens
    found in NLTK's English stop-word list are removed, and each remaining
    token is lemmatized with ``WordNetLemmatizer``.

    Args:
        text: String to clean.

    Returns:
        The lemmatized, non-stop-word tokens joined by single spaces.
    """
    # Raw string so that ``\w`` is passed to the regex engine as written.
    re_tokenizer = RegexpTokenizer(r"\w+")
    lemmatizer = WordNetLemmatizer()

    # Tokenize the text; tokens are already lower-case at this point.
    words = re_tokenizer.tokenize(text.lower())

    # A set makes the per-token stop-word lookup O(1) instead of a list scan.
    stop_words = set(stopwords.words('english'))

    # To also drop domain-specific names, add them to the set, e.g.:
    #     stop_words.update(['harry', 'potter', 'hp', 'lotr', 'tolkien'])

    no_stops_lemmatized = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return ' '.join(no_stops_lemmatized)

In [None]:
# Remove special characters from the titles: every character that is neither
# a word character nor a space is deleted, then the column is cast to str.
# The pattern is a raw string so that ``\w`` is not read as a string escape.
df['title'] = df['title'].replace(r'[^\w ]', '', regex=True).astype(str)
# Regex shortcut thanks to https://stackoverflow.com/questions/1219915/regex-to-remove-apostrophe

In [3]:
def preprocess(df, text_col, min_length=10):
    """Clean the text column of ``df`` and add stemmed/lemmatized versions.

    Steps, in order:
      1. Treat the placeholder values ``'[removed]'`` and ``'[deleted]'`` as
         missing and drop rows whose ``text_col`` is missing.
      2. Reduce ``text_col`` to lower-cased words via ``keep_words_only`` and
         store its character length in ``'text_length'``.
      3. Keep only rows whose cleaned text is longer than ``min_length``
         characters and drop rows with duplicate text.
      4. Add ``'clean_text_stem'`` and ``'clean_text_lem'`` columns and keep
         only rows where both are longer than ``min_length`` characters.

    Steps 1-2 modify the passed-in ``df`` in place (as the original code
    did); steps 3-4 are performed on a new frame, which is returned.

    Args:
        df: DataFrame containing ``text_col``.
        text_col: Name of the column holding the raw text.
        min_length: Texts must be strictly longer than this many characters
            to be kept. Defaults to 10.

    Returns:
        The filtered DataFrame with the added ``'text_length'``,
        ``'clean_text_stem'`` and ``'clean_text_lem'`` columns.
    """
    # Placeholder strings carry no text, so treat them as missing values.
    df[text_col] = df[text_col].replace(['[removed]', '[deleted]'], np.nan)

    # Optional step (disabled): combine title and selftext into one column.
#     df.insert(4, 'title_selftext', df['title'] + ' ' + df['selftext'])
#     df['title_selftext'].fillna(df['title'], inplace=True)

    df.dropna(subset=[text_col], inplace=True)
    df[text_col] = df[text_col].map(keep_words_only)
    df['text_length'] = df[text_col].map(len)

    # Take an explicit copy of the filtered rows so the column assignments
    # below write to an independent frame rather than to a slice of the input.
    df = df[df['text_length'] > min_length].copy()
    df.drop_duplicates(subset=[text_col], inplace=True)

    df['clean_text_stem'] = df[text_col].map(clean_text_stem)
    df['clean_text_lem'] = df[text_col].map(clean_text_lem)

    # Stop-word removal can shrink a text a lot; drop rows left too short.
    df = df[df['clean_text_stem'].map(len) > min_length]
    df = df[df['clean_text_lem'].map(len) > min_length]

    # Without this return the filtered frame and the new columns were lost.
    return df