In [21]:
import numpy as np
import pandas as pd
from pprint import pprint

import gensim
from gensim.utils import simple_preprocess
from gensim import corpora, models

from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to /home/ashwin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
df = pd.read_csv('data/twitter_unique.csv',lineterminator='\n')
df.pop('Unnamed: 0')
df.head()

Unnamed: 0,text,retweet_count
0,Displaced dog jumped into my jeep. Please shar...,9193
1,Water is seeping into the studio from Buffalo ...,16
2,OPEN SHELTER: North Shore 9th Grade Center - ...,36
3,This dog is walking around Sinton TX carrying ...,8626
4,Please remember: #Harvey is still an active st...,673


In [4]:
def process_tweet(tweet):
    st_1 = []
    for w in tweet.split():
        #remove retweet annotation if present:
        if w == 'RT' or w[0] == '@':
            continue
        #remove hashtag symbol
        elif w[0] == '#':
            st_1.append(w[1:])
        #replace link with LINK keyword
        elif w[:4] == 'http':
            st_1.append('link')
        else:
            st_1.append(w)
    
    st_2 =  word_tokenize(' '.join(st_1))
    
    #remove stop words and punctuation, make everything lowercase
    st_3 = [w.lower() for w in st_2 if w.isalpha() and not w in stop_words]
    
    #lemmatization (converts all words to root form for standardization)
    lem = WordNetLemmatizer()
    st_4 = list(map(lambda x: lem.lemmatize(x, pos='v'), st_3))
    return st_4

doc_sample = df['text'][0]
process_tweet(doc_sample)

['displace',
 'dog',
 'jump',
 'jeep',
 'please',
 'share',
 'help',
 'find',
 'owner',
 'harvey',
 'hurricane',
 'displacedpets',
 'link']

In [5]:
#for backwards-compatibility (Python 3.7 has str.isascii)
def is_ascii(s):
    return all(ord(c) < 128 for c in s)

tweets = df['text']
processed_tweets = tweets.map(process_tweet)
processed_tweets.to_csv('data/twitter_text_processed.csv')
processed_tweets.head()

0    [displace, dog, jump, jeep, please, share, hel...
1    [water, seep, studio, buffalo, bayou, about, m...
2    [open, shelter, north, shore, grade, center, h...
3    [this, dog, walk, around, sinton, tx, carry, e...
4    [please, remember, harvey, still, active, stor...
Name: text, dtype: object

In [11]:
proc = pd.DataFrame(data=processed_tweets)
proc.to_csv('data/twitter_processed.csv')