In [23]:
# Created By Rupesh Basnet
# Tries out some existing resources for pre-processing the #WITBragDay tweets, as a framework that
# can be reused for other past events
# coding: utf-8

In [1]:
import csv
import nltk
import pandas as pd

In [2]:
# Load the raw tweet export. Every column is read as text (dtype='unicode'),
# no column is used as the index, and error_bad_lines=False is passed so that
# malformed lines do not abort the load.
data = pd.read_csv(
    "WITBragDay.csv",
    sep=',',
    error_bad_lines=False,
    index_col=False,
    dtype='unicode',
)

In [3]:
# Inspect which columns the raw export contains.
# (A notebook cell only displays its last expression, so this cell shows the
# column index; the first rows are previewed in a separate cell.)
data.columns

Index([u'coordinates', u'created_at', u'hashtags', u'media', u'urls',
       u'favorite_count', u'id', u'in_reply_to_screen_name',
       u'in_reply_to_status_id', u'in_reply_to_user_id', u'lang', u'place',
       u'possibly_sensitive', u'retweet_count', u'reweet_id',
       u'retweet_screen_name', u'source', u'text', u'tweet_url',
       u'user_created_at', u'user_screen_name', u'user_default_profile_image',
       u'user_description', u'user_favourites_count', u'user_followers_count',
       u'user_friends_count', u'user_listed_count', u'user_location',
       u'user_name', u'user_screen_name.1', u'user_statuses_count',
       u'user_time_zone', u'user_urls', u'user_verified'],
      dtype='object')

In [4]:
# Preview the first five rows of the raw data
data.head(5)

Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,...,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name.1,user_statuses_count,user_time_zone,user_urls,user_verified
0,,Thu Aug 17 23:46:00 +0000 2017,WITBragDay,,,0,898330098876661760,,,,...,5277,1006,509,San Francisco,Carina C. Zona,cczona,69411,,http://patreon.com/cczona,False
1,,Sun Aug 13 08:31:30 +0000 2017,,,,0,896650407942008832,,,,...,168,341,2,,lowkey☠️,pahareya,12512,,,False
2,,Thu Aug 17 22:12:33 +0000 2017,,,,0,898306582819160064,,,,...,296,740,7,"London, England",ash,ashiesdollparts,18869,,https://Instagram.com/ashiesdollparts,False
3,,Thu Aug 17 23:23:29 +0000 2017,WITBragDay,,,0,898324433001103361,,,,...,1499,607,83,,Peter Barfuss 𒀱,bofh453,55620,,,False
4,,Thu Aug 17 21:52:20 +0000 2017,,,,0,898301492423081984,,,,...,3331,1729,60,"London, England",Kimberly M,kimpmurrell,7582,,,False


In [5]:
# Keep only the rows whose retweet_screen_name is missing (treated here as
# tweets that are not retweets)
non_retweet = data.loc[data['retweet_screen_name'].isnull()]

In [45]:
# (rows, columns) remaining once the retweets are filtered out
non_retweet.shape

(4409, 34)

In [43]:
data.shape
# It looks like a lot of the tweets were retweets, which would distort the data

(29512, 34)

In [47]:
# Narrow the non-retweet rows down to the fields used in the rest of the analysis.
# Requested names that the frame does not have simply do not appear in the result
# ("geo" is not among the loaded columns, so it is absent from clean_data).
clean_data = non_retweet.filter(
    items=[
        "created_at",
        "hashtags",
        "text",
        "favorite_count",
        "lang",
        "place",
        "geo",
        "user_location",
        "retweet_count",
        "retweet_screen_name",
    ]
)

In [50]:
# Preview the first five rows of the reduced frame
clean_data.head(5)

Unnamed: 0,created_at,hashtags,text,favorite_count,lang,place,user_location,retweet_count,retweet_screen_name
8,Sun Aug 13 08:32:29 +0000 2017,women WITBragDay,3 degrees in building industry and they reject...,9,en,,"Wroclaw, Poland",2,
13,Thu Aug 17 23:34:25 +0000 2017,WITBragDay,@azimman @LaunchDarkly sounds like you should ...,1,en,,/usr/local/sin,0,
16,Thu Aug 17 22:15:48 +0000 2017,WITBragDay,My coding knowledge makes a difference and imp...,6,en,,"Zionsville, IN",2,
44,Thu Aug 17 23:25:01 +0000 2017,WITBragDay,Built a ECS JSON-generating container definiti...,3,en,,"Wellington, New Zealand",2,
46,Thu Aug 17 23:22:06 +0000 2017,WITBragDay,Recruiters should definitely be scrolling thro...,203,en,,/usr/local/sin,76,


In [51]:
# Work on an independent copy: a plain `df = clean_data` would only create a
# second name for the same object, so adding columns to `df` later on would
# silently modify `clean_data` as well.
df = clean_data.copy()

In [52]:
# Columns available in the working frame
df.columns

Index([u'created_at', u'hashtags', u'text', u'favorite_count', u'lang', u'place', u'user_location', u'retweet_count', u'retweet_screen_name'], dtype='object')

In [53]:
# Check the column dtypes (everything was loaded as text, so all are object)
df.dtypes

created_at             object
hashtags               object
text                   object
favorite_count         object
lang                   object
place                  object
user_location          object
retweet_count          object
retweet_screen_name    object
dtype: object

In [54]:
# created_at is stored as text, so it has to be parsed into a real datetime.
# Try the format string on a single literal timestamp first.
test = pd.to_datetime(
    'Thu Aug 17 23:46:00 +0000 2017',
    format='%a %b %d %H:%M:%S +0000 %Y',
)
# The parsed values go into a new "date" column; the original created_at text
# is kept as-is for later modification.
df['date'] = pd.to_datetime(
    df['created_at'],
    format='%a %b %d %H:%M:%S +0000 %Y',
)

In [55]:
# Confirm that the new date column has a datetime dtype
df.dtypes

created_at                     object
hashtags                       object
text                           object
favorite_count                 object
lang                           object
place                          object
user_location                  object
retweet_count                  object
retweet_screen_name            object
date                   datetime64[ns]
dtype: object

In [56]:
# Widen the pandas display limits so larger frames are shown without truncation.
# 'display.height' is intentionally not set: pandas deprecated it in favour of
# 'display.max_rows' and later removed it, after which setting it raises an
# OptionError. 'display.max_rows' below covers the same need.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [57]:
# Preview the frame again, now including the parsed date column
df.head(5)

Unnamed: 0,created_at,hashtags,text,favorite_count,lang,place,user_location,retweet_count,retweet_screen_name,date
8,Sun Aug 13 08:32:29 +0000 2017,women WITBragDay,3 degrees in building industry and they reject...,9,en,,"Wroclaw, Poland",2,,2017-08-13 08:32:29
13,Thu Aug 17 23:34:25 +0000 2017,WITBragDay,@azimman @LaunchDarkly sounds like you should ...,1,en,,/usr/local/sin,0,,2017-08-17 23:34:25
16,Thu Aug 17 22:15:48 +0000 2017,WITBragDay,My coding knowledge makes a difference and imp...,6,en,,"Zionsville, IN",2,,2017-08-17 22:15:48
44,Thu Aug 17 23:25:01 +0000 2017,WITBragDay,Built a ECS JSON-generating container definiti...,3,en,,"Wellington, New Zealand",2,,2017-08-17 23:25:01
46,Thu Aug 17 23:22:06 +0000 2017,WITBragDay,Recruiters should definitely be scrolling thro...,203,en,,/usr/local/sin,76,,2017-08-17 23:22:06


In [58]:
# Changing the settings of pd to view all of each tweet's text.
# Newer pandas releases spell "no limit" as None and reject negative widths,
# while older releases only accept an integer and use -1 for the same purpose,
# so try the modern value first and fall back to the legacy one.
try:
    pd.set_option('display.max_colwidth', None)
except (TypeError, ValueError):
    pd.set_option('display.max_colwidth', -1)

In [59]:
# Show the full, untruncated text of the first five tweets
df.head(5).text

8     3 degrees in building industry and they rejected me because #women. IT industry welcomed me with open arms. Not a single regret #WITBragDay
13    @azimman @LaunchDarkly sounds like you should look at the #WITBragDay tag :P                                                               
16    My coding knowledge makes a difference and improves the lives of my coworkers and our customers. And it's the coolest feeling. #WITBragDay 
44    Built a ECS JSON-generating container definition module entirely in Terraform, because why would you not. #WITBragDay                      
46    Recruiters should definitely be scrolling through #WITBragDay for their lists. Tons of awesome talent in there.                            
Name: text, dtype: object

In [60]:
# Persist the cleaned, retweet-free frame so later steps can start from this file
df.to_csv('WITBrag_Clean_Noretweets.csv')

In [56]:
# Reload the cleaned tweets from disk, replacing the in-memory frame.
# NOTE(review): this cell's execution count (56) is lower than that of the cell
# that writes the file (60) — confirm the file exists when running on a fresh kernel.
# NOTE(review): presumably the date column comes back as plain text because no
# parse_dates argument is given — verify before relying on it.
df = pd.read_csv("WITBrag_Clean_Noretweets.csv")

In [50]:
# Testing nltk's stock tokenizer on one sample tweet
from nltk.tokenize import word_tokenize
test_tweet = 'RT @alicegoldfuss: So happy to hear women getting recruiter pings after #WITBragDay :D'
word_tokenize(test_tweet)
# Note: if punkt is not installed, download it first (for example from the terminal)

['RT',
 '@',
 'alicegoldfuss',
 ':',
 'So',
 'happy',
 'to',
 'hear',
 'women',
 'getting',
 'recruiter',
 'pings',
 'after',
 '#',
 'WITBragDay',
 ':',
 'D']

In [90]:
# word_tokenize splits emoticons, @-mentions and #hashtags into separate pieces,
# so a Twitter-aware regex tokenizer is used instead.
# Taken from a blog https://marcobonzanini.com/2015/03/09/mining-twitter-data-with-python-part-2/
import re

# Western-style emoticons such as :D ;-) =P. Both patterns below are compiled
# with re.VERBOSE, so the whitespace and "#" comments inside this string are
# ignored by the regex engine.
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

# Alternatives are tried from left to right, so the specific patterns have to
# come before the generic catch-alls at the end.
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    # URLs. The character class previously read [$-_@.&amp;+]: an HTML-escaped
    # "&" picked up when the pattern was copied from a web page. The extra
    # characters "a", "m", "p" and ";" are matched by other parts of the pattern
    # anyway, so the set of accepted URLs is unchanged.
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

# A single capture group wraps the whole alternation (all inner groups are
# non-capturing), so findall() returns a flat list of strings.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
# Anchored at both ends: matches only when the entire string is an emoticon.
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)

def remove_emoji(data):
    """Strip emoji and pictographic symbols from a text string.

    Characters in the ranges U+2600-U+27BF, U+1F300-U+1F64F and
    U+1F680-U+1F6FF are deleted; everything else is kept.

    :param data: value to clean, normally a text string
    :return: ``data`` without the matched characters, or ``data`` itself,
        unchanged, when it is falsy (``None``, ``''``) or not a text string
    """
    if not data:
        return data
    try:
        # Python 2: covers both ``str`` and ``unicode``
        text_types = basestring
    except NameError:
        # Python 3 has no ``basestring``; ``str`` is the only text type there
        text_types = str
    if not isinstance(data, text_types):
        return data
    try:
        # UCS-4 (wide) builds: astral code points can be used in ranges directly
        patt = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
    except re.error:
        # UCS-2 (narrow) builds: the same astral ranges spelled as surrogate pairs
        patt = re.compile(u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])')
    return patt.sub('', data)
 
def tokenize(s):
    """Split ``s`` into tokens with the module-level ``tokens_re`` pattern.

    :param s: text to split
    :return: the result of ``tokens_re.findall`` for ``s``
    """
    found = tokens_re.findall(s)
    return found
 
def preprocess(s, lowercase=False):
    """Turn one raw tweet into a list of tokens.

    The text is normalised, emoji are removed with ``remove_emoji`` and the
    remainder is split with ``tokenize``.

    :param s: tweet text
    :param lowercase: when true, lower-case every token except those that
        ``emoticon_re`` recognises as an emoticon (so ``:D`` stays ``:D``)
    :return: list of token strings
    """
    if hasattr(s, 'decode'):
        # Objects with a ``decode`` method (byte strings, and every Python 2
        # string): interpret backslash escapes, go back to raw bytes through
        # latin1, then decode those bytes as UTF-8 text.
        # Python 3 ``str`` has no ``decode`` and is already text, so it is
        # passed through untouched instead of raising AttributeError.
        s = s.decode('unicode-escape').encode('latin1').decode('utf8')
    data = remove_emoji(s)
    tokens = tokenize(data)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

# Run the custom tokenizer on the same sample tweet that was fed to word_tokenize
test_tweet = 'RT @alicegoldfuss: So happy to hear women getting recruiter pings after #WITBragDay :D'
print (preprocess(test_tweet))

[u'RT', u'@alicegoldfuss', u':', u'So', u'happy', u'to', u'hear', u'women', u'getting', u'recruiter', u'pings', u'after', u'#WITBragDay', u':D']


In [91]:
# Take the text of the first 20 tweets to spot-check preprocess() on real data
test = df.head(20).text

In [92]:
# Raw text of one sample tweet (index label 7)
test[7]

'\xf0\x9f\x91\x8d\xf0\x9f\x8f\xbb\nRT @matthewbretten: @ColetteWeston @drjessicabarker Definitely a #WITBragDay'

In [93]:
# Tokens produced by preprocess() for the sample tweet with index label 7
preprocess(test[7])

[u'RT',
 u'@matthewbretten',
 u':',
 u'@ColetteWeston',
 u'@drjessicabarker',
 u'Definitely',
 u'a',
 u'#WITBragDay']

In [52]:
# Removing all stop words
# Build the list of tokens to ignore: English stop words, punctuation marks and
# the Twitter boilerplate terms 'rt' and 'via'.
# NOTE(review): the cells that use this list test membership case-sensitively, so
# capitalised tokens such as 'RT' or 'So' are not filtered out — confirm whether
# that is intended.
from nltk.corpus import stopwords
import string

punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']

In [53]:
import operator 
from collections import Counter

# Tokenize one sample tweet twice: once keeping every token, and once dropping
# the tokens that appear in the stop list
terms_all = list(preprocess('RT @alicegoldfuss: So happy to hear women getting recruiter pings after #WITBragDay :D'))
terms_stop = [
    term
    for term in preprocess('RT @alicegoldfuss: So happy to hear women getting recruiter pings after #WITBragDay :D')
    if term not in stop
]

In [11]:
# All tokens of the sample tweet
terms_all

['RT',
 '@alicegoldfuss',
 ':',
 'So',
 'happy',
 'to',
 'hear',
 'women',
 'getting',
 'recruiter',
 'pings',
 'after',
 '#WITBragDay',
 ':D']

In [12]:
# Tokens of the sample tweet that survive the stop-list filter
terms_stop

['RT',
 '@alicegoldfuss',
 'So',
 'happy',
 'hear',
 'women',
 'getting',
 'recruiter',
 'pings',
 '#WITBragDay',
 ':D']

In [94]:
# Frequency of every token over all tweets (no filtering at all)
count_all = Counter()
for index, row in df.iterrows():
    terms_all = list(preprocess(row['text']))
    count_all.update(terms_all)
print(count_all.most_common(5))

[(u'.', 2977), (u'#WITBragDay', 2530), (u'!', 1537), (u',', 1434), (u'to', 1373)]


In [95]:
# Token frequencies again, this time skipping stop-list entries and any token
# that starts with a literal backslash-x.
# NOTE(review): the stop-list test is case-sensitive, which is why capitalised
# words such as 'I' and 'The' still appear in the result — confirm intent.
count_all = Counter()
for index, row in df.iterrows():
    terms_stop = [
        term
        for term in preprocess(row['text'])
        if not (term in stop or term.startswith(r"\x"))
    ]
    count_all.update(terms_stop)
print(count_all.most_common(5))

[(u'#WITBragDay', 2530), (u'I', 1289), (u'women', 358), (u'tech', 346), (u'amp', 330)]


In [96]:
# Twenty most frequent tokens in the current count_all counter
count_all.most_common(20)

[(u'#WITBragDay', 2530),
 (u'I', 1289),
 (u'women', 358),
 (u'tech', 346),
 (u'amp', 330),
 (u'#WiTBragDay', 235),
 (u"I'm", 219),
 (u'The', 200),
 (u'code', 179),
 (u'amazing', 173),
 (u'This', 170),
 (u'hashtag', 169),
 (u'tweets', 163),
 (u'\ufe0f', 159),
 (u'work', 156),
 (u'one', 151),
 (u'awesome', 147),
 (u'first', 143),
 (u'many', 128),
 (u'Thank', 124)]

In [103]:
# Document frequency: each distinct token is counted at most once per tweet
count_all_terms_only_once = Counter()
for index, row in df.iterrows():
    terms_all = list(preprocess(row['text']))
    # Collapsing to a set removes repeats within the same tweet
    terms_single = set(terms_all)
    count_all_terms_only_once.update(terms_single)
print(count_all_terms_only_once.most_common(5))

[(u'#WITBragDay', 2523), (u'.', 1817), (u'to', 1133), (u'!', 1120), (u'the', 1083)]


In [99]:
# Frequency of hashtags only, i.e. tokens that begin with '#'
count_all_hash = Counter()
for index, row in df.iterrows():
    terms_hash = [
        term
        for term in preprocess(row['text'])
        if term.startswith('#')
    ]
    count_all_hash.update(terms_hash)
print(count_all_hash.most_common(5))

[(u'#WITBragDay', 2530), (u'#WiTBragDay', 235), (u'#witbragday', 119), (u'#womenintech', 104), (u'#WITBragday', 72)]


In [101]:
# Frequency of plain terms: stop-list entries, hashtags and @-mentions are left out
count_all_term_only = Counter()
for index, row in df.iterrows():
    terms_only = [
        term
        for term in preprocess(row['text'])
        if not (term in stop or term.startswith(('#', '@')))
    ]
    count_all_term_only.update(terms_only)
print(count_all_term_only.most_common(5))

[(u'I', 1289), (u'women', 358), (u'tech', 346), (u'amp', 330), (u"I'm", 219)]


In [104]:
# Try out nltk's bigram helper on a token list.
# NOTE(review): terms_stop here is whatever an earlier cell last assigned to it —
# confirm which tokens this is meant to pair up.
from nltk import bigrams 
 
terms_bigram = bigrams(terms_stop)

In [110]:
# Frequency of adjacent token pairs, formed after stop-list entries have been
# removed from each tweet (so paired tokens need not be adjacent in the raw text)
count_all_bigram = Counter()
for index, row in df.iterrows():
    terms_stop = [
        term
        for term in preprocess(row['text'])
        if not term in stop
    ]
    terms_bigram = bigrams(terms_stop)
    count_all_bigram.update(terms_bigram)
print(count_all_bigram.most_common(5))

[((u'women', u'tech'), 82), ((u'#WITBragDay', u'I'), 82), ((u'#WITBragDay', u'tweets'), 77), ((u'I', u'love'), 51), ((u'The', u'latest'), 50)]


In [111]:
# Twenty most frequent token pairs
count_all_bigram.most_common(20)

[((u'women', u'tech'), 82),
 ((u'#WITBragDay', u'I'), 82),
 ((u'#WITBragDay', u'tweets'), 77),
 ((u'I', u'love'), 51),
 ((u'The', u'latest'), 50),
 ((u'So', u'many'), 43),
 ((u'Women', u'Tech'), 38),
 ((u'\u200d', u'\u200d'), 38),
 ((u'1', u'st'), 37),
 ((u'#WITBragDay', u'hashtag'), 37),
 ((u'I', u'work'), 36),
 ((u'I', u'know'), 35),
 ((u'amazing', u'women'), 33),
 ((u'I', u'wrote'), 32),
 ((u'hashtag', u'#WITBragDay'), 31),
 ((u'\ufe0f', u'\ufe0f'), 29),
 ((u'tech', u'#WITBragDay'), 28),
 ((u'Check', u'#WITBragDay'), 27),
 ((u'reading', u'#WITBragDay'), 26),
 ((u'#WITBragDay', u'The'), 25)]