In [1]:
import re

import numpy as np
import pandas as pd

The purpose of this notebook is to take the pruned dataframe that only contains tweets and convert it into 
a more compressed format that only contains unique tweets.

In [4]:
#load the tweet table; records are split strictly on '\n'
df = pd.read_csv('data/twitter_text.csv', lineterminator='\n')
print(df.shape)
#peek at the text of the first row (label 0 of the default index)
df.loc[0, 'text']

(7041866, 3)


'RT @aaronjayjack: Displaced dog jumped into my jeep. Please share to help find owner! #harvey #hurricane #displacedpets https://t.co/0C6Ve9…'

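Our next step is to create a set containing all "seen" tweets so that, as we loop through all the tweets, we only keep rows with tweets that haven't been seen before. The retweet annotation (the leading `RT @user:`) is stripped from each tweet first, so a retweet and the tweet it repeats are treated as the same text. The final size of the data should be much smaller.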

In [26]:
#leading retweet annotation: 'RT', whitespace, '@handle', then either a colon
#(plus any following whitespace), at least one whitespace character, or end of text
RETWEET_PREFIX = re.compile(r'RT\s+@\w+(?::\s*|\s+|$)')

#helper function that gets rid of the retweet annotation if present
def process_tweet(tweet):
    """Return the tweet text with a leading retweet annotation removed.

    Only an annotation of the form 'RT @handle:' (or 'RT @handle ' when the
    colon is missing) at the very start of the text is stripped. Tweets that
    merely begin with the letters 'RT', and colons that appear later in the
    text (for example inside a URL), are left untouched.

    Parameters:
        tweet: the raw tweet text (str).

    Returns:
        The text after the annotation, or the text unchanged when it does not
        start with a retweet annotation.

    Raises:
        TypeError: if tweet is not a string.
    """
    #match() anchors at the start of the string, so only a leading annotation counts
    prefix = RETWEET_PREFIX.match(tweet)
    if prefix is None:
        return tweet

    #drop exactly what was matched, so no real character is lost when the
    #colon is not followed by a space
    return tweet[prefix.end():]

#keeps track of which tweets have been seen
seen_tweets = set()

#data to create new pruned dataframe
tweets = []
retweet_count = []

#number of rows visited so far (stays 0 if the dataframe is empty)
index = 0

#walk the two needed columns in lockstep; iterrows() would build a Series for
#every row, which is far slower over millions of rows
for index, (text, count) in enumerate(zip(df['text'], df['retweet_count']), start=1):
    #a missing text cell is not a string (pandas reads it as NaN); skip that row
    #instead of aborting the whole pass
    if isinstance(text, str):
        tweet = process_tweet(text)

        #NOTE(review): only the first occurrence is kept, so the stored count is
        #the one on that first row -- confirm this is the intended count
        if tweet not in seen_tweets:
            seen_tweets.add(tweet)
            tweets.append(tweet)
            retweet_count.append(count)

    #progress report
    if index % 500000 == 0:
        print(index)

print('number of unique tweets: ' + str(len(seen_tweets)))

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
5400000
5500000
5600000
5700000
5800000
5900000
6000000
6100000
6200000
6300000
6400000
6500000
6600000
6700000
6800000
6900000
7000000


In [30]:
#assemble the deduplicated table: one row per unique tweet, paired with the retweet count recorded for it
df_pruned = pd.DataFrame(dict(text=tweets, retweet_count=retweet_count))
print(df_pruned.shape)
df_pruned.head(5)

(1299428, 2)


Unnamed: 0,text,retweet_count
0,Displaced dog jumped into my jeep. Please shar...,9193
1,Water is seeping into the studio from Buffalo ...,16
2,OPEN SHELTER: North Shore 9th Grade Center - ...,36
3,This dog is walking around Sinton TX carrying ...,8626
4,Please remember: #Harvey is still an active st...,673


In [31]:
#persist the unique tweets for later notebooks
#the row index is written too (the default), so the file has a leading unnamed index column
df_pruned.to_csv('data/twitter_unique.csv', index=True)