In [1]:
import os
import random
import re
import warnings
from pathlib import Path

import emoji
import numpy as np
import pandas as pd

import nltk
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, TweetTokenizer
from nltk.corpus import wordnet, stopwords

warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", DeprecationWarning)

In [4]:
# Directory holding the dataset CSVs. Set the TWEET_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original local directory is used, so existing behavior is unchanged.
DATA_DIR = Path(os.environ.get(
    "TWEET_DATA_DIR",
    r"C:\Users\samsu\Desktop\Fall20\CS 695-002\Project\Datasets\Final_Data",
))
en_train = pd.read_csv(DATA_DIR / "English.csv")

In [18]:
def preprocess(df):
    """Clean the text in ``df["tweet"]`` and store token lists in ``df["processed"]``.

    Steps, in order:

    1. strip URLs (both ``http://`` and ``https://``) and ``@mentions``;
    2. strip emoji / pictographic characters and digits;
    3. replace punctuation with spaces;
    4. drop English stop words (exact, case-sensitive match; lower-casing is
       deliberately not applied, so capitalised tokens such as "We" or "US"
       are kept);
    5. tokenize with ``TweetTokenizer`` and lemmatize every token with
       ``WordNetLemmatizer`` (called without a part-of-speech argument).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "tweet" column. Missing values are treated as empty
        text instead of raising.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place: the "processed" column
        (added or overwritten) holds one list of lemmatized tokens per row.
    """
    # "s?" makes the trailing "s" optional so plain http:// links are removed
    # too; the optional trailing whitespace avoids leaving a double space.
    url_pattern = r"https?://\S+\s?"
    mention_pattern = r"@\S+"
    # Emoji, pictographs and related symbols (each range listed once).
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"
                               u"\U0001F300-\U0001F5FF"
                               u"\U0001F680-\U0001F6FF"
                               u"\U0001F1E0-\U0001F1FF"
                               u"\U00002500-\U00002BEF"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    number_pattern = r"\d+"
    punctuation_pattern = r"[^\w\s]"

    # NaN tweets would otherwise crash in the stop-word step (float has no .split).
    text = df["tweet"].fillna("").astype(str)
    text = text.str.replace(pat=url_pattern, repl="", regex=True)
    text = text.str.replace(pat=mention_pattern, repl="", regex=True)
    text = text.str.replace(pat=emoji_pattern, repl="", regex=True)
    text = text.str.replace(pat=number_pattern, repl="", regex=True)
    # Punctuation becomes a space so that "a,b" does not collapse into "ab".
    text = text.str.replace(pat=punctuation_pattern, repl=" ", regex=True)

    # A set gives O(1) membership tests. split() without an argument splits on
    # any whitespace run, so stop words next to newlines/tabs are removed as
    # well and the joined result already has single spaces only.
    stop_words = set(stopwords.words("english"))

    def drop_stop_words(row):
        return " ".join(token for token in row.split() if token not in stop_words)

    text = text.apply(drop_stop_words)

    # Extract root words.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    w_tokenizer = TweetTokenizer()

    def lemmatize_text(row):
        return [lemmatizer.lemmatize(token) for token in w_tokenizer.tokenize(row)]

    df["processed"] = text.apply(lemmatize_text)

    return df

In [19]:
%%time
# preprocess() adds the "processed" column to en_train in place and returns the
# same frame, which is displayed as this cell's output.
preprocess(en_train)

Wall time: 7.77 s


Unnamed: 0,tweet,label,processed
0,"As of March 13th , 2014 , the booklet had been...",0,"[As, March, th, booklet, downloaded, time, cou..."
1,In order to help increase the booklets downloa...,0,"[In, order, help, increase, booklet, downloads..."
2,( Simply copy and paste the following text int...,0,"[Simply, copy, paste, following, text, YouTube..."
3,Click below for a FREE download of a colorfull...,1,"[Click, FREE, download, colorfully, illustrate..."
4,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,0,"[Click, DOWNLOAD, MB, green, banner, link]"
...,...,...,...
67326,RT @JakeM_1998: RT BillSpindle: It's all about...,0,"[RT, RT, BillSpindle, It, power, top, fighter,..."
67327,RT @ThinkAgain_DOS: Iraq: #ISIS sets off 21 ca...,0,"[RT, Iraq, ISIS, set, car, bomb, Anbar, usual,..."
67328,RT @ThePatriot143: DEAR STATE DEPARTMENT: WHER...,0,"[RT, DEAR, STATE, DEPARTMENT, WHERE, IS, HILLA..."
67329,"""@panelrific: Let's go 🐧🐧🐧🐧🐧🐧😃""",0,"[Let, go]"


In [20]:
# Spot-check row 199: the raw tweet followed by its processed token list.
for column in ("tweet", "processed"):
    print(en_train[column][199])

We are more concerned about ZOG controlling US and Russia .
['We', 'concerned', 'ZOG', 'controlling', 'US', 'Russia']


In [1]:
# Disabled ad-hoc query: show the rows whose tweet text contains "US".
#en_train[en_train['tweet'].astype(str).str.contains('US')]

In [None]:
# NLTK resources required by the POS-aware lemmatization below.
resources = ["wordnet", "stopwords", "punkt", "averaged_perceptron_tagger", "maxent_treebank_pos_tagger"]

# nltk.data category each resource is stored under. Looking everything up
# under "tokenizers/" only ever finds "punkt", so the other resources would be
# requested again on every run.
resource_categories = {
    "wordnet": "corpora",
    "stopwords": "corpora",
    "punkt": "tokenizers",
    "averaged_perceptron_tagger": "taggers",
    "maxent_treebank_pos_tagger": "taggers",
}

for resource in resources:
    try:
        nltk.data.find(resource_categories[resource] + "/" + resource)
    except LookupError:
        nltk.download(resource)

# create Lemmatizer object
lemma = WordNetLemmatizer()

# First letter of the tag produced by pos_tag -> WordNet part of speech.
_WORDNET_POS = {
    "J": wordnet.ADJ,
    "V": wordnet.VERB,
    "N": wordnet.NOUN,
    "R": wordnet.ADV,
}


def lemmatize_word(tagged_token):
    """Lemmatize ``(word, tag)`` pairs using the part of speech implied by the tag.

    Parameters
    ----------
    tagged_token : iterable of (str, str)
        Pairs as returned by ``pos_tag``.

    Returns
    -------
    list of str
        One entry per input pair. Words whose tag does not start with
        J, V, N or R are returned unchanged.
    """
    root = []
    for word, tag in tagged_token:
        pos = _WORDNET_POS.get(tag[:1])
        root.append(lemma.lemmatize(word, pos) if pos else word)
    return root


def lemmatize_doc(document):
    """Return ``document`` as a space-joined string of POS-aware lemmas.

    Parameters
    ----------
    document : str or iterable of str
        Raw text, or a token list such as the ones ``preprocess`` stores in
        the "processed" column; token lists are joined with spaces first.

    Returns
    -------
    str
        The lemmatized tokens of every sentence, joined by single spaces.
    """
    if not isinstance(document, str):
        document = " ".join(document)

    lemmatized_list = []
    for sentence in sent_tokenize(document):
        # Drop the punctuation that would otherwise become separate tokens.
        no_punctuation = re.sub(r"[`'\",.!?()]", " ", sentence)
        tagged_token = pos_tag(word_tokenize(no_punctuation))
        lemmatized_list.extend(lemmatize_word(tagged_token))
    return " ".join(lemmatized_list)


def lemmatize_processed(df):
    """Replace every entry of ``df["processed"]`` with ``lemmatize_doc(entry)``.

    The frame is modified in place and also returned.
    """
    df["processed"] = df["processed"].apply(lemmatize_doc)
    return df


# apply the functions: the original cell referenced an undefined `df`; call
# lemmatize_processed(<frame>) with the DataFrame that should be transformed.