# WeRateDogs Data Wrangling

## Gather

In [71]:
import pandas as pd
import numpy as np
import requests
import tweepy
import json
import time
import shutil

# Twitter API credentials are read from environment variables with the IPython
# `%env` magic, so no secret values are hardcoded in the notebook itself.
# These four names are consumed later when the tweepy client is built.
consumer_key = %env TWITTER_CON_KEY
consumer_secret = %env TWITTER_CON_SECRET
access_token = %env TWITTER_ACC_TOKEN
access_secret = %env TWITTER_ACC_SECRET

In [72]:
tw_arch = pd.read_csv('data/twitter-archive-enhanced.csv')

In [73]:
# Flag to stop reloading: set to True to re-download the image predictions
# file; leave False to reuse the copy already saved under data/.
reload = False

tsv_url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
if reload:
    with requests.get(tsv_url, stream=True) as r:
        # Fail loudly on 4xx/5xx instead of saving an error page as the TSV.
        r.raise_for_status()
        with open('data/image_predictions.tsv', 'wb') as f:
            # iter_content() undoes any gzip/deflate transfer encoding;
            # copying r.raw directly would write the still-compressed bytes.
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

In [74]:
tw_arch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [75]:
tw_arch.tweet_id.nunique() == len(tw_arch)

True

In [76]:
# Build an authenticated Twitter API client from the credentials loaded above.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

# Tweet IDs from the archive; these are the statuses queried from the API.
tweet_ids = tw_arch['tweet_id'].tolist()

In [77]:
# Flag to stop reloading: set to True to re-query the Twitter API for every
# archived tweet and rewrite data/tweet-json.txt (one JSON object per line).
reload = False

failed = []   # tweet IDs the API could not return
passed = []   # tweet IDs successfully written to the file
if reload:
    with open('data/tweet-json.txt', mode='w') as f:
        for tid in tweet_ids:
            try:
                data = api.get_status(tid, tweet_mode='extended')
            except tweepy.TweepError as err:
                # Only API-level failures are treated as "tweet unavailable".
                # A bare `except:` would also swallow KeyboardInterrupt and
                # real bugs, and would hide why the lookup failed.
                print("Failure to retrieve tweet with tid: {0} ({1})".format(tid, err))
                failed.append(tid)
                continue
            passed.append(tid)
            print(json.dumps(data._json), file=f)
            time.sleep(0.3)

In [78]:
# Parse the JSON-lines file: each line holds one tweet object.
with open('data/udacity.tweet-json.txt', 'r') as json_file:
    twitter_json = [json.loads(raw_line) for raw_line in json_file]

### Twitter API
* [Endpoint](https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id)
* [Tweet](https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object)
* [Entity](https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/entities-object)
* [User](https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/user-object)

In [79]:
# Top-level tweet keys that are dropped entirely.
skip_tweet = { 'entities', 'extended_entities' }
# Keys of the nested 'user' object that are dropped.
skip_user = { 'id', 'entities', 'profile_background_color', 'profile_background_image_url',
             'profile_background_image_url_https', 'profile_background_tile', 'profile_image_url',
             'profile_image_url_https', 'profile_banner_url', 'profile_link_color',
             'profile_sidebar_border_color', 'profile_sidebar_fill_color', 'profile_text_color',
             'profile_use_background_image', 'has_extended_profile', 'default_profile',
             'default_profile_image' }


def _flatten_tweet(tweet):
    """Return one flat record for a tweet dict.

    Keys in `skip_tweet` are omitted. The nested 'user' dict is expanded into
    'user_<key>' entries, omitting keys in `skip_user`. Every other key is
    copied unchanged, in the original key order.
    """
    flat = {}
    for key, value in tweet.items():
        if key == 'user':
            flat.update({
                'user_' + user_key: user_value
                for user_key, user_value in value.items()
                if user_key not in skip_user
            })
        elif key not in skip_tweet:
            flat[key] = value
    return flat


twitter_records = [_flatten_tweet(tweet) for tweet in twitter_json]

api_df = pd.DataFrame(twitter_records)
api_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 53 columns):
contributors                     0 non-null object
coordinates                      0 non-null object
created_at                       2354 non-null object
display_text_range               2354 non-null object
favorite_count                   2354 non-null int64
favorited                        2354 non-null bool
full_text                        2354 non-null object
geo                              0 non-null object
id                               2354 non-null int64
id_str                           2354 non-null object
in_reply_to_screen_name          78 non-null object
in_reply_to_status_id            78 non-null float64
in_reply_to_status_id_str        78 non-null object
in_reply_to_user_id              78 non-null float64
in_reply_to_user_id_str          78 non-null object
is_quote_status                  2354 non-null bool
lang                             2354 non-null objec


   # Assess

## Assessing Enhanced Twitter Archive

In [80]:
tw_arch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [81]:
tw_arch.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [82]:
tw_arch[ tw_arch.in_reply_to_status_id.notnull() ].head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
30,886267009285017600,8.862664e+17,2281182000.0,2017-07-15 16:51:35 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@NonWhiteHat @MayhewMayhem omg hello tanner yo...,,,,,12,10,,,,,
55,881633300179243008,8.81607e+17,47384430.0,2017-07-02 21:58:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@roushfenway These are good dogs but 17/10 is ...,,,,,17,10,,,,,
64,879674319642796034,8.795538e+17,3105441000.0,2017-06-27 12:14:36 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@RealKentMurphy 14/10 confirmed,,,,,14,10,,,,,
113,870726314365509632,8.707262e+17,16487760.0,2017-06-02 19:38:25 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@ComplicitOwl @ShopWeRateDogs &gt;10/10 is res...,,,,,10,10,,,,,
148,863427515083354112,8.634256e+17,77596200.0,2017-05-13 16:15:35 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@Jack_Septic_Eye I'd need a few more pics to p...,,,,,12,10,,,,,


In [83]:
type(tw_arch.iloc[1].timestamp)

str

In [84]:
tw_arch.source.value_counts()

<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>     2221
<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>                          91
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                       33
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>      11
Name: source, dtype: int64

In [85]:
tw_arch.sample(10).text.values.tolist()

["Meet Miley. She's a Scandinavian Hollabackgirl. Incalculably fluffy, unamused af. 11/10 would squeeze aggressively https://t.co/6r4GFZY5WS",
 "RT @dog_rates: HEY PUP WHAT'S THE PART OF THE HUMAN BODY THAT CONNECTS THE FOOT AND THE LEG? 11/10 so smart https://t.co/XQ1tRUmO3z",
 "This is Moose. He's rather h*ckin dangerous (you can tell by the collar). 11/10 would still attempt to snug https://t.co/lHVHGdDzb3",
 "RT @dog_rates: This is Carl. He's very powerful. 12/10 don't mess with Carl https://t.co/v5m2bIukXc",
 "This is Boots. She doesn't know what to do with treats so she just holds them. Very good girl. 12/10 would give more treats https://t.co/eAA8lratd3",
 "Say hello to Smiley. He's a blind therapy doggo having a h*ckin blast high steppin around in the snow. 14/10 would follow anywhere https://t.co/SHAb1wHjMz",
 "This is Tom. He's a silly dog. Known for his unconventional swing style. One h*ck of a sneaky tongue slip too. 11/10 would push https://t.co/6fSVcn9HAU",
 'Meet Tassy &

In [86]:
type(tw_arch[ tw_arch.retweeted_status_timestamp.notnull() ].iloc[0].retweeted_status_timestamp)

str

In [87]:
tw_arch[ tw_arch.retweeted_status_id.notnull() ].head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
19,888202515573088257,,,2017-07-21 01:02:36 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Canela. She attempted s...,8.87474e+17,4196984000.0,2017-07-19 00:47:34 +0000,https://twitter.com/dog_rates/status/887473957...,13,10,Canela,,,,
32,886054160059072513,,,2017-07-15 02:45:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @Athletics: 12/10 #BATP https://t.co/WxwJmv...,8.860537e+17,19607400.0,2017-07-15 02:44:07 +0000,https://twitter.com/dog_rates/status/886053434...,12,10,,,,,
36,885311592912609280,,,2017-07-13 01:35:06 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Lilly. She just paralle...,8.305833e+17,4196984000.0,2017-02-12 01:04:29 +0000,https://twitter.com/dog_rates/status/830583320...,13,10,Lilly,,,,
68,879130579576475649,,,2017-06-26 00:13:58 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Emmy. She was adopted t...,8.780576e+17,4196984000.0,2017-06-23 01:10:23 +0000,https://twitter.com/dog_rates/status/878057613...,14,10,Emmy,,,,
73,878404777348136964,,,2017-06-24 00:09:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: Meet Shadow. In an attempt to r...,8.782815e+17,4196984000.0,2017-06-23 16:00:04 +0000,"https://www.gofundme.com/3yd6y1c,https://twitt...",13,10,Shadow,,,,


In [88]:
tw_arch.sample(10).expanded_urls.values.tolist()

['https://twitter.com/dog_rates/status/843235543001513987/photo/1,https://twitter.com/dog_rates/status/843235543001513987/photo/1,https://twitter.com/dog_rates/status/843235543001513987/photo/1',
 'https://twitter.com/dog_rates/status/667495797102141441/photo/1',
 'https://vine.co/v/inVtemLt9tE',
 'https://twitter.com/dog_rates/status/890006608113172480/photo/1,https://twitter.com/dog_rates/status/890006608113172480/photo/1',
 'https://twitter.com/dog_rates/status/734912297295085568/photo/1',
 'https://twitter.com/dog_rates/status/709207347839836162/photo/1',
 'https://twitter.com/dog_rates/status/693486665285931008/video/1',
 'https://twitter.com/dog_rates/status/678767140346941444/photo/1',
 'https://twitter.com/dog_rates/status/668154635664932864/photo/1',
 'https://twitter.com/dog_rates/status/684097758874210310/photo/1']

In [89]:
len(tw_arch[ tw_arch.expanded_urls.str.contains(',', na=False) ])

639

In [90]:
tw_arch.rating_numerator.value_counts()

12      558
11      464
10      461
13      351
9       158
8       102
7        55
14       54
5        37
6        32
3        19
4        17
1         9
2         9
420       2
0         2
15        2
75        2
80        1
20        1
24        1
26        1
44        1
50        1
60        1
165       1
84        1
88        1
144       1
182       1
143       1
666       1
960       1
1776      1
17        1
27        1
45        1
99        1
121       1
204       1
Name: rating_numerator, dtype: int64

In [91]:
tw_arch.rating_denominator.value_counts()

10     2333
11        3
50        3
80        2
20        2
2         1
16        1
40        1
70        1
15        1
90        1
110       1
120       1
130       1
150       1
170       1
7         1
0         1
Name: rating_denominator, dtype: int64

In [92]:
tw_arch[ tw_arch.rating_denominator > 10 ]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
342,832088576586297345,8.320875e+17,30582080.0,2017-02-16 04:45:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@docmisterio account started on 11/15/15,,,,,11,15,,,,,
433,820690176645140481,,,2017-01-15 17:52:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",The floofs have been released I repeat the flo...,,,,https://twitter.com/dog_rates/status/820690176...,84,70,,,,,
784,775096608509886464,,,2016-09-11 22:20:06 +0000,"<a href=""http://twitter.com/download/iphone"" r...","RT @dog_rates: After so many requests, this is...",7.403732e+17,4196984000.0,2016-06-08 02:41:38 +0000,https://twitter.com/dog_rates/status/740373189...,9,11,,,,,
902,758467244762497024,,,2016-07-28 01:00:57 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Why does this never happen at my front door......,,,,https://twitter.com/dog_rates/status/758467244...,165,150,,,,,
1068,740373189193256964,,,2016-06-08 02:41:38 +0000,"<a href=""http://twitter.com/download/iphone"" r...","After so many requests, this is Bretagne. She ...",,,,https://twitter.com/dog_rates/status/740373189...,9,11,,,,,
1120,731156023742988288,,,2016-05-13 16:15:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to this unbelievably well behaved sq...,,,,https://twitter.com/dog_rates/status/731156023...,204,170,this,,,,
1165,722974582966214656,,,2016-04-21 02:25:47 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Happy 4/20 from the squad! 13/10 for all https...,,,,https://twitter.com/dog_rates/status/722974582...,4,20,,,,,
1202,716439118184652801,,,2016-04-03 01:36:11 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Bluebert. He just saw that both #Final...,,,,https://twitter.com/dog_rates/status/716439118...,50,50,Bluebert,,,,
1228,713900603437621249,,,2016-03-27 01:29:02 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Happy Saturday here's 9 puppers on a bench. 99...,,,,https://twitter.com/dog_rates/status/713900603...,99,90,,,,,
1254,710658690886586372,,,2016-03-18 02:46:49 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here's a brigade of puppers. All look very pre...,,,,https://twitter.com/dog_rates/status/710658690...,80,80,,,,,


In [93]:
tw_arch.name.value_counts()

None            745
a                55
Charlie          12
Cooper           11
Lucy             11
Oliver           11
Lola             10
Tucker           10
Penny            10
Winston           9
Bo                9
Sadie             8
the               8
Buddy             7
an                7
Daisy             7
Toby              7
Bailey            7
Koda              6
Jax               6
Dave              6
Jack              6
Bella             6
Rusty             6
Leo               6
Oscar             6
Scout             6
Milo              6
Stanley           6
Finn              5
               ... 
Ambrose           1
Pluto             1
Herb              1
Moofasa           1
Jiminus           1
Sprinkles         1
Crumpet           1
Doobert           1
Buddah            1
Tobi              1
Bauer             1
Rinna             1
Cleopatricia      1
Jareld            1
Gabby             1
Godzilla          1
Blipson           1
Snoop             1
Genevieve         1


In [94]:
tw_arch[ tw_arch.name.str.islower() ].name.value_counts()

a               55
the              8
an               7
very             5
one              4
just             4
quite            4
not              2
actually         2
getting          2
mad              2
infuriating      1
such             1
his              1
old              1
incredibly       1
unacceptable     1
all              1
life             1
my               1
officially       1
by               1
light            1
this             1
space            1
Name: name, dtype: int64

In [95]:
# For each stage column, count the rows where that stage is set AND at least
# one of the other three stage columns is set too.
stage_cols = ['pupper', 'floofer', 'puppo', 'doggo']
has_stage = tw_arch[stage_cols] != 'None'
several_stages = has_stage.sum(axis=1) > 1
for stage in stage_cols:
    print(stage, len(tw_arch[has_stage[stage] & several_stages]))

pupper 12
floofer 1
puppo 1
doggo 14


### Quality
* in_reply_to_status_id/user_id stored as floats, mostly NaN
* timestamp stored as a string instead of datetime
* source data is buried in HTML
* retweeted_status_id/user_id stored as floats
* retweeted_status_timestamp stored as string
* expanded_urls contains comma-separated values
* some extremely high/low values in rating_numerator/denominator
* some names are ordinary lowercase words (e.g. 'a', 'the', 'an') rather than actual dog names
* 'None' string instead of NaN in pupper, doggo, floofer, puppo columns
* pupper, doggo, floofer, puppo columns are not mutually exclusive

### Tidiness
* "doggo", "floofer", "pupper", and "puppo" should be values for a single column
* rating_numerator and rating_denominator should be combined

## Assessing Twitter JSON

In [96]:
api_df.head(10)

Unnamed: 0,contributors,coordinates,created_at,display_text_range,favorite_count,favorited,full_text,geo,id,id_str,...,user_name,user_notifications,user_protected,user_screen_name,user_statuses_count,user_time_zone,user_translator_type,user_url,user_utc_offset,user_verified
0,,,Tue Aug 01 16:23:56 +0000 2017,"[0, 85]",39467,False,This is Phineas. He's a mystical boy. Only eve...,,892420643555336193,892420643555336193,...,WeRateDogs™ (author),False,False,dog_rates,5288,,none,https://t.co/N7sNNHAEXS,,True
1,,,Tue Aug 01 00:17:27 +0000 2017,"[0, 138]",33819,False,This is Tilly. She's just checking pup on you....,,892177421306343426,892177421306343426,...,WeRateDogs™ (author),False,False,dog_rates,5288,,none,https://t.co/N7sNNHAEXS,,True
2,,,Mon Jul 31 00:18:03 +0000 2017,"[0, 121]",25461,False,This is Archie. He is a rare Norwegian Pouncin...,,891815181378084864,891815181378084864,...,WeRateDogs™ (author),False,False,dog_rates,5288,,none,https://t.co/N7sNNHAEXS,,True
3,,,Sun Jul 30 15:58:51 +0000 2017,"[0, 79]",42908,False,This is Darla. She commenced a snooze mid meal...,,891689557279858688,891689557279858688,...,WeRateDogs™ (author),False,False,dog_rates,5288,,none,https://t.co/N7sNNHAEXS,,True
4,,,Sat Jul 29 16:00:24 +0000 2017,"[0, 138]",41048,False,This is Franklin. He would like you to stop ca...,,891327558926688256,891327558926688256,...,WeRateDogs™ (author),False,False,dog_rates,5288,,none,https://t.co/N7sNNHAEXS,,True
5,,,Sat Jul 29 00:08:17 +0000 2017,"[0, 138]",20562,False,Here we have a majestic great white breaching ...,,891087950875897856,891087950875897856,...,WeRateDogs™ (author),False,False,dog_rates,5288,,none,https://t.co/N7sNNHAEXS,,True
6,,,Fri Jul 28 16:27:12 +0000 2017,"[0, 140]",12041,False,Meet Jax. He enjoys ice cream so much he gets ...,,890971913173991426,890971913173991426,...,WeRateDogs™ (author),False,False,dog_rates,5288,,none,https://t.co/N7sNNHAEXS,,True
7,,,Fri Jul 28 00:22:40 +0000 2017,"[0, 118]",56848,False,When you watch your owner call another dog a g...,,890729181411237888,890729181411237888,...,WeRateDogs™ (author),False,False,dog_rates,5288,,none,https://t.co/N7sNNHAEXS,,True
8,,,Thu Jul 27 16:25:51 +0000 2017,"[0, 122]",28226,False,This is Zoey. She doesn't want to be one of th...,,890609185150312448,890609185150312448,...,WeRateDogs™ (author),False,False,dog_rates,5288,,none,https://t.co/N7sNNHAEXS,,True
9,,,Wed Jul 26 15:59:51 +0000 2017,"[0, 133]",32467,False,This is Cassie. She is a college pup. Studying...,,890240255349198849,890240255349198849,...,WeRateDogs™ (author),False,False,dog_rates,5288,,none,https://t.co/N7sNNHAEXS,,True


In [97]:
api_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 53 columns):
contributors                     0 non-null object
coordinates                      0 non-null object
created_at                       2354 non-null object
display_text_range               2354 non-null object
favorite_count                   2354 non-null int64
favorited                        2354 non-null bool
full_text                        2354 non-null object
geo                              0 non-null object
id                               2354 non-null int64
id_str                           2354 non-null object
in_reply_to_screen_name          78 non-null object
in_reply_to_status_id            78 non-null float64
in_reply_to_status_id_str        78 non-null object
in_reply_to_user_id              78 non-null float64
in_reply_to_user_id_str          78 non-null object
is_quote_status                  2354 non-null bool
lang                             2354 non-null objec

In [98]:
api_df.user_id_str.value_counts()

4196983835    2354
Name: user_id_str, dtype: int64

In [99]:
api_df.user_following.value_counts()

True    2354
Name: user_following, dtype: int64

In [100]:
api_df.favorite_count.value_counts()

0        179
610        3
345        3
2918       3
1691       3
2176       3
2768       3
1339       3
2706       3
522        2
3134       2
1618       2
250        2
2250       2
2660       2
2262       2
2305       2
1111       2
784        2
4878       2
346        2
14685      2
780        2
6923       2
6515       2
2433       2
3603       2
13518      2
3593       2
1536       2
        ... 
4681       1
523        1
559        1
802        1
527        1
27154      1
6676       1
535        1
537        1
6682       1
8731       1
23074      1
21029      1
667        1
6696       1
2608       1
35400      1
21041      1
4659       1
10804      1
4099       1
68152      1
10812      1
573        1
6718       1
33345      1
814        1
23108      1
2630       1
8143       1
Name: favorite_count, Length: 2007, dtype: int64

In [101]:
api_df.favorited.value_counts()

False    2346
True        8
Name: favorited, dtype: int64

In [102]:
api_df.retweet_count.value_counts()

1972     5
3652     5
83       5
146      4
61       4
748      4
2243     4
336      4
183      4
179      4
1207     4
265      4
115      4
71       4
1124     4
542      4
819      4
577      4
516      4
397      3
619      3
661      3
2511     3
261      3
431      3
482      3
403      3
557      3
572      3
576      3
        ..
2088     1
1271     1
2030     1
43       1
5365     1
4143     1
3316     1
1263     1
16439    1
2104     1
4125     1
27       1
4121     1
4119     1
4079     1
1285     1
10226    1
8183     1
2042     1
11524    1
6148     1
7        1
1281     1
2060     1
1825     1
8209     1
19       1
2068     1
30742    1
0        1
Name: retweet_count, Length: 1724, dtype: int64

In [103]:
api_df.retweeted.value_counts()

False    2354
Name: retweeted, dtype: int64

### Quality
* contributors, coordinates, geo, user_time_zone, user_utc_offset all empty; place with 1 value
* in_reply_to_status_id, in_reply_to_user_id, quoted_status_id all stored as floats
* user data all the same
* favorite_count and favorited do not match
* retweet_count and retweeted do not match

### Tidiness
* user data should be a separate table

## Assessing Image Prediction

In [104]:
img_prd = pd.read_csv('data/image_predictions.tsv', sep='\t')

In [105]:
img_prd.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [106]:
img_prd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [107]:
img_prd.tweet_id.nunique()

2075

In [108]:
img_prd.jpg_url.nunique()

2009

In [109]:
img_prd[ img_prd.jpg_url.duplicated(keep=False)].jpg_url.value_counts()

https://pbs.twimg.com/media/C4bTH6nWMAAX_bJ.jpg                                            2
https://pbs.twimg.com/media/CYLDikFWEAAIy1y.jpg                                            2
https://pbs.twimg.com/ext_tw_video_thumb/815965888126062592/pu/img/JleSw4wRhgKDWQj5.jpg    2
https://pbs.twimg.com/media/C12x-JTVIAAzdfl.jpg                                            2
https://pbs.twimg.com/media/Cq9guJ5WgAADfpF.jpg                                            2
https://pbs.twimg.com/media/C3nygbBWQAAjwcW.jpg                                            2
https://pbs.twimg.com/media/CZhn-QAWwAASQan.jpg                                            2
https://pbs.twimg.com/ext_tw_video_thumb/807106774843039744/pu/img/8XZg1xW35Xp2J6JW.jpg    2
https://pbs.twimg.com/media/CvJCabcWgAIoUxW.jpg                                            2
https://pbs.twimg.com/media/CsrjryzWgAAZY00.jpg                                            2
https://pbs.twimg.com/media/Ct72q9jWcAAhlnw.jpg                       

In [110]:
img_prd[ img_prd.jpg_url.duplicated(keep=False)].sort_values(by='jpg_url')

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
480,675354435921575936,https://pbs.twimg.com/ext_tw_video_thumb/67535...,1,upright,0.303415,False,golden_retriever,0.181351,True,Brittany_spaniel,0.162084,True
1297,752309394570878976,https://pbs.twimg.com/ext_tw_video_thumb/67535...,1,upright,0.303415,False,golden_retriever,0.181351,True,Brittany_spaniel,0.162084,True
1864,842892208864923648,https://pbs.twimg.com/ext_tw_video_thumb/80710...,1,Chihuahua,0.505370,True,Pomeranian,0.120358,True,toy_terrier,0.077008,True
1641,807106840509214720,https://pbs.twimg.com/ext_tw_video_thumb/80710...,1,Chihuahua,0.505370,True,Pomeranian,0.120358,True,toy_terrier,0.077008,True
1703,817181837579653120,https://pbs.twimg.com/ext_tw_video_thumb/81596...,1,Tibetan_mastiff,0.506312,True,Tibetan_terrier,0.295690,True,otterhound,0.036251,True
1691,815966073409433600,https://pbs.twimg.com/ext_tw_video_thumb/81596...,1,Tibetan_mastiff,0.506312,True,Tibetan_terrier,0.295690,True,otterhound,0.036251,True
1705,817423860136083457,https://pbs.twimg.com/ext_tw_video_thumb/81742...,1,ice_bear,0.336200,False,Samoyed,0.201358,True,Eskimo_dog,0.186789,True
1858,841833993020538882,https://pbs.twimg.com/ext_tw_video_thumb/81742...,1,ice_bear,0.336200,False,Samoyed,0.201358,True,Eskimo_dog,0.186789,True
1715,819004803107983360,https://pbs.twimg.com/media/C12whDoVEAALRxa.jpg,1,standard_poodle,0.351308,True,toy_poodle,0.271929,True,Tibetan_terrier,0.094759,True
1718,819015337530290176,https://pbs.twimg.com/media/C12whDoVEAALRxa.jpg,1,standard_poodle,0.351308,True,toy_poodle,0.271929,True,Tibetan_terrier,0.094759,True


In [111]:
img_prd.img_num.value_counts()

1    1780
2     198
3      66
4      31
Name: img_num, dtype: int64

In [112]:
# Rows where none of the three predictions is flagged as a dog.
img_prd[ ~img_prd[['p1_dog', 'p2_dog', 'p3_dog']].any(axis=1) ]

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
6,666051853826850816,https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg,1,box_turtle,0.933012,False,mud_turtle,4.588540e-02,False,terrapin,1.788530e-02,False
17,666104133288665088,https://pbs.twimg.com/media/CT56LSZWoAAlJj2.jpg,1,hen,0.965932,False,cock,3.391940e-02,False,partridge,5.206580e-05,False
18,666268910803644416,https://pbs.twimg.com/media/CT8QCd1WEAADXws.jpg,1,desktop_computer,0.086502,False,desk,8.554740e-02,False,bookcase,7.947970e-02,False
21,666293911632134144,https://pbs.twimg.com/media/CT8mx7KW4AEQu8N.jpg,1,three-toed_sloth,0.914671,False,otter,1.525000e-02,False,great_grey_owl,1.320720e-02,False
25,666362758909284353,https://pbs.twimg.com/media/CT9lXGsUcAAyUFt.jpg,1,guinea_pig,0.996496,False,skunk,2.402450e-03,False,hamster,4.608630e-04,False
29,666411507551481857,https://pbs.twimg.com/media/CT-RugiWIAELEaq.jpg,1,coho,0.404640,False,barracouta,2.714850e-01,False,gar,1.899450e-01,False
45,666786068205871104,https://pbs.twimg.com/media/CUDmZIkWcAAIPPe.jpg,1,snail,0.999888,False,slug,5.514170e-05,False,acorn,2.625800e-05,False
50,666837028449972224,https://pbs.twimg.com/media/CUEUva1WsAA2jPb.jpg,1,triceratops,0.442113,False,armadillo,1.140710e-01,False,common_iguana,4.325530e-02,False
51,666983947667116034,https://pbs.twimg.com/media/CUGaXDhW4AY9JUH.jpg,1,swab,0.589446,False,chain_saw,1.901420e-01,False,wig,3.450970e-02,False
53,667012601033924608,https://pbs.twimg.com/media/CUG0bC0U8AAw2su.jpg,1,hyena,0.987230,False,African_hunting_dog,1.260080e-02,False,coyote,5.735010e-05,False


In [113]:
img_prd[ (img_prd.p1_conf + img_prd.p2_conf + img_prd.p3_conf > 1) ]

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
106,667866724293877760,https://pbs.twimg.com/media/CUS9PlUWwAANeAD.jpg,1,jigsaw_puzzle,1.0,False,prayer_rug,1.0113e-08,False,doormat,1.74017e-10,False


In [114]:
img_prd[ (img_prd.p1_conf >= 1) | (img_prd.p2_conf >= 1) | (img_prd.p3_conf >= 1) ]

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
106,667866724293877760,https://pbs.twimg.com/media/CUS9PlUWwAANeAD.jpg,1,jigsaw_puzzle,1.0,False,prayer_rug,1.0113e-08,False,doormat,1.74017e-10,False


### Quality
* 66 repeated jpg_urls with different tweet ids
* some rows have only non-dog predictions (p1_dog, p2_dog and p3_dog are all False)
* 1 row has 100% confidence (p1_conf = 1.0) for a non-dog prediction (jigsaw_puzzle)

### Tidiness
* Each row should correspond with a single prediction

## Clean

In [115]:
tw_copy = tw_arch.copy()

#### Define
Replace 'None' values with NaN in "doggo", "floofer", "pupper", "puppo" columns

#### Code

In [116]:
# Swap the literal string 'None' for a real missing value in every stage column.
stage_columns = ['pupper', 'floofer', 'puppo', 'doggo']
tw_copy[stage_columns] = tw_copy[stage_columns].replace('None', np.nan)

#### Test

In [117]:
# Each stage column should now contain only NaN and its own stage name.
for stage in ('pupper', 'floofer', 'puppo', 'doggo'):
    print(tw_copy[stage].unique())

[nan 'pupper']
[nan 'floofer']
[nan 'puppo']
[nan 'doggo']


#### Define
Reduce the "doggo", "floofer", "pupper" and "puppo" columns into a single "dog_stage" column. Drop extra columns.

#### Code

In [118]:
# https://stackoverflow.com/questions/48517405/chaining-multiple-combine-first
from functools import reduce

# Columns are merged in this priority order: each combine_first() only fills
# rows that are still missing, so a row tagged with more than one stage keeps
# just the first stage found in this order.
stage_priority = ['pupper', 'puppo', 'doggo', 'floofer']
dog_stages = [tw_copy[column] for column in stage_priority]
tw_copy['dog_stage'] = reduce(pd.Series.combine_first, dog_stages)

# The four source columns are now redundant.
tw_copy = tw_copy.drop(columns=stage_priority)

#### Test

In [119]:
# Count, per stage, the rows of the original archive that carry that stage.
# Each line is labelled with the stage it actually counts.
for stage in ('pupper', 'doggo', 'puppo', 'floofer'):
    print(stage, len(tw_arch[tw_arch[stage] != 'None']))

pupper 257
pupper 97
pupper 30
pupper 10


In [120]:
# Frequency of each dog_stage value (missing values are not counted)
tw_copy['dog_stage'].value_counts()

pupper     257
doggo       84
puppo       30
floofer      9
Name: dog_stage, dtype: int64

#### Define
Convert timestamp and retweeted_status_timestamp into datetime objects. Add columns with better names,
and drop old columns.

#### Code

In [121]:
# Parse both timestamp strings into datetimes under clearer column names,
# then drop the original string columns.
tw_copy = (
    tw_copy
    .assign(
        tweet_time=pd.to_datetime(tw_copy['timestamp']),
        retweeted_time=pd.to_datetime(tw_copy['retweeted_status_timestamp']),
    )
    .drop(columns=['timestamp', 'retweeted_status_timestamp'])
)

#### Test

In [122]:
# Inspect column names, non-null counts and dtypes after the timestamp conversion
tw_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 14 columns):
tweet_id                    2356 non-null int64
in_reply_to_status_id       78 non-null float64
in_reply_to_user_id         78 non-null float64
source                      2356 non-null object
text                        2356 non-null object
retweeted_status_id         181 non-null float64
retweeted_status_user_id    181 non-null float64
expanded_urls               2297 non-null object
rating_numerator            2356 non-null int64
rating_denominator          2356 non-null int64
name                        2356 non-null object
dog_stage                   380 non-null object
tweet_time                  2356 non-null datetime64[ns, UTC]
retweeted_time              181 non-null datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](2), float64(4), int64(3), object(5)
memory usage: 257.8+ KB


#### Define
Map source HTML to readable values

#### Code

In [123]:
# Raw HTML anchor as stored in the archive -> short label used for analysis
sources = dict([
    ('<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'iphone'),
    ('<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>', 'vine'),
    ('<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>', 'twitter_web'),
    ('<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>', 'tweetdeck'),
])

# Values missing from the mapping become NaN; existing NaNs are passed through
tw_copy['source'] = tw_copy['source'].map(sources, na_action='ignore')

#### Test

In [124]:
# Frequency of each mapped source label
tw_copy['source'].value_counts()

iphone         2221
vine             91
twitter_web      33
tweetdeck        11
Name: source, dtype: int64

#### Define
Replace names that start with a lowercase letter, and the placeholder 'None', with NaN

#### Code

In [125]:
# Values that start with a lowercase letter (matched by the regex) and the
# literal 'None' placeholder are both turned into missing values.
# The result is assigned back to the column rather than using
# replace(inplace=True) on the attribute-accessed Series: that form is chained
# assignment, which does not reliably update tw_copy and is a no-op under
# pandas copy-on-write.
tw_copy['name'] = (
    tw_copy['name']
    .replace(regex=r'^[a-z].*$', value='None')
    .replace('None', np.nan)
)

#### Test

In [126]:
# Frequency of each remaining name (missing values are not counted)
tw_copy['name'].value_counts()

Charlie         12
Oliver          11
Lucy            11
Cooper          11
Penny           10
Lola            10
Tucker          10
Bo               9
Winston          9
Sadie            8
Bailey           7
Toby             7
Daisy            7
Buddy            7
Koda             6
Bella            6
Stanley          6
Leo              6
Rusty            6
Jax              6
Oscar            6
Milo             6
Scout            6
Jack             6
Dave             6
Sunny            5
Phil             5
Oakley           5
Louis            5
Alfie            5
                ..
Herb             1
Moofasa          1
Nida             1
Bruno            1
Aja              1
Stormy           1
Sprinkles        1
Tobi             1
Ebby             1
Bauer            1
Rinna            1
Cleopatricia     1
Jareld           1
Gabby            1
Godzilla         1
Blipson          1
Snoop            1
Genevieve        1
Monster          1
Crumpet          1
Derby            1
Ralphson    

In [127]:
# Number of names still starting with a lowercase letter; NaN names count as no match
int(tw_copy['name'].str.contains(r'^[a-z].*$', regex=True, na=False).sum())

0

#### Define
Add a new column, rating_score, holding rating_numerator / rating_denominator (non-finite results are stored as NaN).

#### Code

In [157]:
# Ratio of numerator to denominator; any non-finite result (inf or NaN)
# is stored as NaN.
tw_copy['rating_score'] = (
    tw_copy['rating_numerator']
    .div(tw_copy['rating_denominator'])
    .where(lambda score: np.isfinite(score))
)

#### Test

In [158]:
# Inspect non-null counts and dtypes now that rating_score has been added
tw_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 15 columns):
tweet_id                    2356 non-null int64
in_reply_to_status_id       78 non-null Int64
in_reply_to_user_id         78 non-null Int64
source                      2356 non-null object
text                        2356 non-null object
retweeted_status_id         181 non-null Int64
retweeted_status_user_id    181 non-null Int64
expanded_urls               2297 non-null object
rating_numerator            2356 non-null int64
rating_denominator          2356 non-null int64
name                        1502 non-null object
dog_stage                   380 non-null object
tweet_time                  2356 non-null datetime64[ns, UTC]
retweeted_time              181 non-null datetime64[ns, UTC]
rating_score                2355 non-null float64
dtypes: Int64(4), datetime64[ns, UTC](2), float64(1), int64(3), object(5)
memory usage: 285.4+ KB


In [159]:
# Distribution of the computed rating scores
tw_copy['rating_score'].value_counts()

1.200000      562
1.100000      469
1.000000      463
1.300000      351
0.900000      157
0.800000      102
0.700000       54
1.400000       54
0.500000       38
0.600000       32
0.300000       19
0.400000       15
0.200000       11
0.100000        8
42.000000       2
0.818182        2
1.500000        2
0.000000        2
7.500000        2
1.250000        1
66.600000       1
2.600000        1
177.600000      1
0.636364        1
18.200000       1
0.733333        1
1.700000        1
3.428571        1
2.700000        1
Name: rating_score, dtype: int64

#### Define
Convert retweeted_status_id, retweeted_status_user_id, in_reply_to_status_id, in_reply_to_user_id from float to int

#### Code

In [160]:
# Cast the four ID columns to pandas' nullable Int64 dtype in one call,
# so they can be integers while still holding missing values.
tw_copy = tw_copy.astype({
    'retweeted_status_id': 'Int64',
    'retweeted_status_user_id': 'Int64',
    'in_reply_to_status_id': 'Int64',
    'in_reply_to_user_id': 'Int64',
})

#### Test

In [161]:
# Inspect dtypes after casting the four ID columns
tw_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 15 columns):
tweet_id                    2356 non-null int64
in_reply_to_status_id       78 non-null Int64
in_reply_to_user_id         78 non-null Int64
source                      2356 non-null object
text                        2356 non-null object
retweeted_status_id         181 non-null Int64
retweeted_status_user_id    181 non-null Int64
expanded_urls               2297 non-null object
rating_numerator            2356 non-null int64
rating_denominator          2356 non-null int64
name                        1502 non-null object
dog_stage                   380 non-null object
tweet_time                  2356 non-null datetime64[ns, UTC]
retweeted_time              181 non-null datetime64[ns, UTC]
rating_score                2355 non-null float64
dtypes: Int64(4), datetime64[ns, UTC](2), float64(1), int64(3), object(5)
memory usage: 285.4+ KB


#### Define
Rather than cleaning the image prediction data, grab only the first (most confident) prediction,
and join to enhanced data.

Copy image prediction into new data, filter non-dog predictions, drop dog/not-dog column, and merge
with working dataframe.

#### Code

In [162]:
# Keep only the first prediction, and only rows where p1_dog is True;
# p1_dog itself is not carried over.
first_pred = img_prd.loc[img_prd['p1_dog'].eq(True), ['tweet_id', 'p1', 'p1_conf']]

# Outer join keeps rows from both frames, so tweets without a matching
# prediction row stay in the result with NaN in p1 / p1_conf.
tw_mrg1 = tw_copy.merge(first_pred, how='outer', on='tweet_id')

#### Test

In [163]:
# Preview the first rows of the archive merged with the first prediction
tw_mrg1.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,source,text,retweeted_status_id,retweeted_status_user_id,expanded_urls,rating_numerator,rating_denominator,name,dog_stage,tweet_time,retweeted_time,rating_score,p1,p1_conf
0,892420643555336193,,,iphone,This is Phineas. He's a mystical boy. Only eve...,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,2017-08-01 16:23:56+00:00,NaT,1.3,,
1,892177421306343426,,,iphone,This is Tilly. She's just checking pup on you....,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,2017-08-01 00:17:27+00:00,NaT,1.3,Chihuahua,0.323581
2,891815181378084864,,,iphone,This is Archie. He is a rare Norwegian Pouncin...,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,2017-07-31 00:18:03+00:00,NaT,1.2,Chihuahua,0.716012
3,891689557279858688,,,iphone,This is Darla. She commenced a snooze mid meal...,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,2017-07-30 15:58:51+00:00,NaT,1.3,,
4,891327558926688256,,,iphone,This is Franklin. He would like you to stop ca...,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,2017-07-29 16:00:24+00:00,NaT,1.2,basset,0.555712


#### Define
Rather than cleaning Twitter API data, extract only the favorite_count and retweet_count data,
and merge with working dataframe.

Copy relevant columns into new dataframe, rename id column for joining. Merge, recast joined
columns from floats to ints.

#### Code

In [164]:
# Take just the two engagement counts from the API data and rename its
# key column to match the working frame's join key.
api_counts = (
    api_df[['id', 'favorite_count', 'retweet_count']]
    .rename(columns={'id': 'tweet_id'})
)

# Outer join on tweet_id, then cast the two joined count columns to
# nullable Int64 so they are integers that can still hold missing values.
tw_merged = (
    tw_mrg1
    .merge(api_counts, how='outer', on='tweet_id')
    .astype({'favorite_count': 'Int64', 'retweet_count': 'Int64'})
)

#### Test

In [165]:
# Preview the first rows after adding favorite_count and retweet_count
tw_merged.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,source,text,retweeted_status_id,retweeted_status_user_id,expanded_urls,rating_numerator,rating_denominator,name,dog_stage,tweet_time,retweeted_time,rating_score,p1,p1_conf,favorite_count,retweet_count
0,892420643555336193,,,iphone,This is Phineas. He's a mystical boy. Only eve...,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,2017-08-01 16:23:56+00:00,NaT,1.3,,,39467,8853
1,892177421306343426,,,iphone,This is Tilly. She's just checking pup on you....,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,2017-08-01 00:17:27+00:00,NaT,1.3,Chihuahua,0.323581,33819,6514
2,891815181378084864,,,iphone,This is Archie. He is a rare Norwegian Pouncin...,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,2017-07-31 00:18:03+00:00,NaT,1.2,Chihuahua,0.716012,25461,4328
3,891689557279858688,,,iphone,This is Darla. She commenced a snooze mid meal...,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,2017-07-30 15:58:51+00:00,NaT,1.3,,,42908,8964
4,891327558926688256,,,iphone,This is Franklin. He would like you to stop ca...,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,2017-07-29 16:00:24+00:00,NaT,1.2,basset,0.555712,41048,9774


#### Define
Rename and reorder columns for easier analysis. Drop columns we don't plan on using right now.

#### Code

In [166]:
# Old column name -> name used in the final dataset
col_names = dict(
    source='tweet_source',
    text='tweet_text',
    p1='predicted_breed',
    p1_conf='breed_conf',
    name='dog_name',
    rating_numerator='dog_numer',
    rating_denominator='dog_denom',
    rating_score='dog_rating',
)

# Final column layout: tweet fields, engagement counts, reply/retweet
# references, then dog attributes and the breed prediction.
col_order = (
    ['tweet_id', 'tweet_time', 'tweet_text', 'tweet_source']
    + ['favorite_count', 'retweet_count']
    + ['in_reply_to_status_id', 'in_reply_to_user_id']
    + ['retweeted_status_id', 'retweeted_status_user_id', 'retweeted_time']
    + ['dog_name', 'dog_stage', 'dog_numer', 'dog_denom', 'dog_rating']
    + ['predicted_breed', 'breed_conf']
)

# Drop the unused URL column, apply the new names, then arrange the columns
tw_merged = (
    tw_merged
    .drop(columns='expanded_urls')
    .rename(columns=col_names)
    .reindex(columns=col_order)
)

#### Test

In [167]:
# Preview the renamed and reordered frame
tw_merged.head()

Unnamed: 0,tweet_id,tweet_time,tweet_text,tweet_source,favorite_count,retweet_count,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,retweeted_time,dog_name,dog_stage,dog_numer,dog_denom,dog_rating,predicted_breed,breed_conf
0,892420643555336193,2017-08-01 16:23:56+00:00,This is Phineas. He's a mystical boy. Only eve...,iphone,39467,8853,,,,,NaT,Phineas,,13,10,1.3,,
1,892177421306343426,2017-08-01 00:17:27+00:00,This is Tilly. She's just checking pup on you....,iphone,33819,6514,,,,,NaT,Tilly,,13,10,1.3,Chihuahua,0.323581
2,891815181378084864,2017-07-31 00:18:03+00:00,This is Archie. He is a rare Norwegian Pouncin...,iphone,25461,4328,,,,,NaT,Archie,,12,10,1.2,Chihuahua,0.716012
3,891689557279858688,2017-07-30 15:58:51+00:00,This is Darla. She commenced a snooze mid meal...,iphone,42908,8964,,,,,NaT,Darla,,13,10,1.3,,
4,891327558926688256,2017-07-29 16:00:24+00:00,This is Franklin. He would like you to stop ca...,iphone,41048,9774,,,,,NaT,Franklin,,12,10,1.2,basset,0.555712


In [168]:
# Check non-null counts and dtypes of the final frame before writing it out
tw_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 18 columns):
tweet_id                    2356 non-null int64
tweet_time                  2356 non-null datetime64[ns, UTC]
tweet_text                  2356 non-null object
tweet_source                2356 non-null object
favorite_count              2354 non-null Int64
retweet_count               2354 non-null Int64
in_reply_to_status_id       78 non-null Int64
in_reply_to_user_id         78 non-null Int64
retweeted_status_id         181 non-null Int64
retweeted_status_user_id    181 non-null Int64
retweeted_time              181 non-null datetime64[ns, UTC]
dog_name                    1502 non-null object
dog_stage                   380 non-null object
dog_numer                   2356 non-null int64
dog_denom                   2356 non-null int64
dog_rating                  2355 non-null float64
predicted_breed             1532 non-null object
breed_conf                  1532 non-null float64


### Output cleaned data

In [169]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file
tw_merged.to_csv('data/tw_clean.csv', index=False)