# Gather

#### Download the file 'twitter-archive-enhanced.csv' from the link [twitter_archive_enhanced.csv](https://d17h27t6h515a5.cloudfront.net/topher/2017/August/59a4e958_twitter-archive-enhanced/twitter-archive-enhanced.csv)

In [1]:
import pandas as pd

In [2]:
# Load data frame with twitter-archive-enhanced.csv
tweet_archive_data = pd.read_csv('twitter-archive-enhanced.csv')
tweet_archive_data.head(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,


#### Download the image_predictions.tsv programmatically from this [url](https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv)

In [3]:
import requests
import os

In [4]:
# download the file from the below url
url ='https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
response

<Response [200]>

In [5]:
with open(url.split('/')[-1], mode='wb') as file:

    file.write(response.content)

In [6]:
os.listdir(os.curdir)

['.DS_Store',
 '.git',
 '.ipynb_checkpoints',
 'act_report.html',
 'act_report.ipynb',
 'image-predictions.tsv',
 'README.md',
 'tweet_archive_data_clean.csv',
 'tweet_count_data_clean.csv',
 'tweet_image_data_clean.csv',
 'tweet_json.txt',
 'twitter-archive-enhanced.csv',
 'twitter_archive_master.csv',
 'wrangle_act.html',
 'wrangle_act.ipynb']

In [7]:
# Load dataframe with image-predictions.tsv 
tweet_image_data = pd.read_csv('image-predictions.tsv', sep = '\t')
tweet_image_data.head(2)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True


#### Gather tweet and retweet counts  for the tweet ids in the archive dataset

In [8]:
# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# Keys are hidden to comply with Twitter's API terms and conditions
# Two rerun this replace the hidden with your own keys after subscribing to twitter developer account
import tweepy

consumer_key = 'Hidden-Key'
consumer_secret = 'Hidden-Key'
access_token = 'Hidden-Key'
access_secret = 'Hidden-Key'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True)

In [9]:
import json
from timeit import default_timer as timer

In [10]:
# Tweet ids in the twitter-archive-enhanced.csv file
tweet_ids = tweet_archive_data.tweet_id.values
len(tweet_ids)

2356

In [11]:
# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
count = 0
fails_dict = {}
start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    for tweet_id in tweet_ids:
        count += 1
        # print(str(count) + ": " + str(tweet_id))
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            # print("Success")
            json.dump(tweet._json, outfile)
            outfile.write('\n')
        except tweepy.TweepError as e:
            # print("Fail")
            fails_dict[tweet_id] = e
            pass
end = timer()
print(end - start)
print(fails_dict)

1891.9851507429994
{888202515573088257: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 873697596434513921: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 872668790621863937: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 869988702071779329: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 866816280283807744: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 861769973181624320: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 845459076796616705: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 842892208864923648: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 837012587749474308: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 827228250799742977: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 812747805718642688: TweepError([{'code':

In [12]:
# how many twitter ids the data retreival failed
len(fails_dict)

16

In [12]:
# read json data from tweet_json.txt and load into a dataframe
tweet_count_list = []
with open('tweet_json.txt') as json_file:
    for line in json_file:
        json_data = json.loads(line)
        tweet_id = json_data['id']
        retweet_count = json_data['retweet_count']
        favorite_count = json_data['favorite_count']
        tweet_count_list.append({'tweet_id': tweet_id,
                        'retweet_count': int(retweet_count),
                        'favorite_count': int(favorite_count)
                        })



tweet_count_data = pd.DataFrame(tweet_count_list, columns = ['tweet_id', 'retweet_count', 'favorite_count'])
        

In [13]:
tweet_count_data.head(2)

Unnamed: 0,tweet_id,retweet_count,favorite_count
0,892420643555336193,8304,37995
1,892177421306343426,6136,32620


# Assess

In [14]:
tweet_archive_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

##### tweet_archive_data has 17 columns and 2356 observations, tweet_id seems to be the unique Id
##### in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id , retweeted_status_user_id, retweeted_status_timestamp columns has very few non null values.

##### timestamp is not represented as date.

##### Lets see if there are any duplicates

In [15]:
tweet_archive_data[tweet_archive_data.tweet_id.duplicated()]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


In [16]:
tweet_archive_data.tweet_id.value_counts

<bound method IndexOpsMixin.value_counts of 0       892420643555336193
1       892177421306343426
2       891815181378084864
3       891689557279858688
4       891327558926688256
5       891087950875897856
6       890971913173991426
7       890729181411237888
8       890609185150312448
9       890240255349198849
10      890006608113172480
11      889880896479866881
12      889665388333682689
13      889638837579907072
14      889531135344209921
15      889278841981685760
16      888917238123831296
17      888804989199671297
18      888554962724278272
19      888202515573088257
20      888078434458587136
21      887705289381826560
22      887517139158093824
23      887473957103951883
24      887343217045368832
25      887101392804085760
26      886983233522544640
27      886736880519319552
28      886680336477933568
29      886366144734445568
               ...        
2326    666411507551481857
2327    666407126856765440
2328    666396247373291520
2329    666373753744588802
2330    666

##### tweet_id is the unique id and there are 2356 unique tweet_ids for 2356 observations in the tweet_archive_data 

In [17]:
tweet_archive_data.head(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [18]:
tweet_archive_data.tail()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
2351,666049248165822465,,,2015-11-16 00:24:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a 1949 1st generation vulpix. Enj...,,,,https://twitter.com/dog_rates/status/666049248...,5,10,,,,,
2352,666044226329800704,,,2015-11-16 00:04:52 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a purebred Piers Morgan. Loves to Netf...,,,,https://twitter.com/dog_rates/status/666044226...,6,10,a,,,,
2353,666033412701032449,,,2015-11-15 23:21:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here is a very happy pup. Big fan of well-main...,,,,https://twitter.com/dog_rates/status/666033412...,9,10,a,,,,
2354,666029285002620928,,,2015-11-15 23:05:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a western brown Mitsubishi terrier. Up...,,,,https://twitter.com/dog_rates/status/666029285...,7,10,a,,,,
2355,666020888022790149,,,2015-11-15 22:32:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a Japanese Irish Setter. Lost eye...,,,,https://twitter.com/dog_rates/status/666020888...,8,10,,,,,


#####  Upon just visual observation we can see there are NaN values in in_reply_to_status_id , in_reply_to_user_id , retweeted_status_id, retweeted_status_user_id and retweeted_status_timestamp.
##### Lets take a closer look at these NaN values

In [19]:
tweet_archive_data.isnull().sum()

tweet_id                         0
in_reply_to_status_id         2278
in_reply_to_user_id           2278
timestamp                        0
source                           0
text                             0
retweeted_status_id           2175
retweeted_status_user_id      2175
retweeted_status_timestamp    2175
expanded_urls                   59
rating_numerator                 0
rating_denominator               0
name                             0
doggo                            0
floofer                          0
pupper                           0
puppo                            0
dtype: int64

##### in_reply_to_status_id  has 2278 NaN
##### in_reply_to_user_id has 2278 NaN
##### retweeted_status_id has 2175 NaN
##### retweeted_status_user_id 2175 NaN
##### retweeted_status_timestamp 2175 NaN
##### Since these columns have majority values as Null these columns can be deleted from the data frame.


In [21]:
tweet_archive_data.name.sample(2)

3       Darla
1132     None
Name: name, dtype: object

In [22]:
tweet_archive_data.name.value_counts()

None            745
a                55
Charlie          12
Lucy             11
Cooper           11
Oliver           11
Tucker           10
Lola             10
Penny            10
Bo                9
Winston           9
the               8
Sadie             8
Daisy             7
Bailey            7
Toby              7
Buddy             7
an                7
Jack              6
Jax               6
Oscar             6
Leo               6
Koda              6
Dave              6
Scout             6
Rusty             6
Milo              6
Bella             6
Stanley           6
Oakley            5
               ... 
Cecil             1
Yukon             1
Ziva              1
Sonny             1
Enchilada         1
Burt              1
Asher             1
Jazz              1
Strudel           1
Taco              1
Lance             1
Brutus            1
Alexanderson      1
Cannon            1
Pupcasso          1
Farfle            1
Boston            1
Kallie            1
Ike               1


##### Some of the values in the name column are "None","the","such" ,"a" and "an". These dont look like real names of the dogs. Majority of the rows ( 745) have None populated. Does this mean these rows dont have a Dog name available and data is entered as None.

##### All the valid values for the name are starting with Uppercase letter.
##### All the invalid values may be starting with lowercase
##### lets take a closer look at the invalid name values

In [23]:
tweet_archive_data.loc[tweet_archive_data['name'].str.islower()].name.value_counts()

a               55
the              8
an               7
very             5
quite            4
just             4
one              4
getting          2
mad              2
actually         2
not              2
unacceptable     1
light            1
all              1
such             1
officially       1
life             1
space            1
incredibly       1
this             1
by               1
my               1
infuriating      1
old              1
his              1
Name: name, dtype: int64

In [24]:
tweet_archive_data.loc[tweet_archive_data['name'].str.islower()].name.count()

109

##### There are 109 rows that have invalid values for name
##### There are 745 rows that have name value as None


In [25]:
tweet_archive_data.source.value_counts()

<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>     2221
<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>                          91
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                       33
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>      11
Name: source, dtype: int64

In [26]:
tweet_archive_data.text.nunique

<bound method IndexOpsMixin.nunique of 0       This is Phineas. He's a mystical boy. Only eve...
1       This is Tilly. She's just checking pup on you....
2       This is Archie. He is a rare Norwegian Pouncin...
3       This is Darla. She commenced a snooze mid meal...
4       This is Franklin. He would like you to stop ca...
5       Here we have a majestic great white breaching ...
6       Meet Jax. He enjoys ice cream so much he gets ...
7       When you watch your owner call another dog a g...
8       This is Zoey. She doesn't want to be one of th...
9       This is Cassie. She is a college pup. Studying...
10      This is Koda. He is a South Australian decksha...
11      This is Bruno. He is a service shark. Only get...
12      Here's a puppo that seems to be on the fence a...
13      This is Ted. He does his best. Sometimes that'...
14      This is Stuart. He's sporting his favorite fan...
15      This is Oliver. You're witnessing one of his m...
16      This is Jim. He found a f

In [27]:
tweet_archive_data[['name','text']]

Unnamed: 0,name,text
0,Phineas,This is Phineas. He's a mystical boy. Only eve...
1,Tilly,This is Tilly. She's just checking pup on you....
2,Archie,This is Archie. He is a rare Norwegian Pouncin...
3,Darla,This is Darla. She commenced a snooze mid meal...
4,Franklin,This is Franklin. He would like you to stop ca...
5,,Here we have a majestic great white breaching ...
6,Jax,Meet Jax. He enjoys ice cream so much he gets ...
7,,When you watch your owner call another dog a g...
8,Zoey,This is Zoey. She doesn't want to be one of th...
9,Cassie,This is Cassie. She is a college pup. Studying...


##### It looks like name is extracted from the text value.

In [28]:
tweet_archive_data.expanded_urls.sample()

2002    https://twitter.com/dog_rates/status/672481316...
Name: expanded_urls, dtype: object

In [29]:
tweet_archive_data.rating_denominator.value_counts()

10     2333
11        3
50        3
80        2
20        2
2         1
16        1
40        1
70        1
15        1
90        1
110       1
120       1
130       1
150       1
170       1
7         1
0         1
Name: rating_denominator, dtype: int64

In [30]:
tweet_archive_data.query("rating_denominator != 10")

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
313,835246439529840640,8.35246e+17,26259580.0,2017-02-24 21:54:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@jonnysun @Lin_Manuel ok jomny I know you're e...,,,,,960,0,,,,,
342,832088576586297345,8.320875e+17,30582080.0,2017-02-16 04:45:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@docmisterio account started on 11/15/15,,,,,11,15,,,,,
433,820690176645140481,,,2017-01-15 17:52:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",The floofs have been released I repeat the flo...,,,,https://twitter.com/dog_rates/status/820690176...,84,70,,,,,
516,810984652412424192,,,2016-12-19 23:06:23 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Sam. She smiles 24/7 &amp; secretly aspir...,,,,"https://www.gofundme.com/sams-smile,https://tw...",24,7,Sam,,,,
784,775096608509886464,,,2016-09-11 22:20:06 +0000,"<a href=""http://twitter.com/download/iphone"" r...","RT @dog_rates: After so many requests, this is...",7.403732e+17,4196984000.0,2016-06-08 02:41:38 +0000,https://twitter.com/dog_rates/status/740373189...,9,11,,,,,
902,758467244762497024,,,2016-07-28 01:00:57 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Why does this never happen at my front door......,,,,https://twitter.com/dog_rates/status/758467244...,165,150,,,,,
1068,740373189193256964,,,2016-06-08 02:41:38 +0000,"<a href=""http://twitter.com/download/iphone"" r...","After so many requests, this is Bretagne. She ...",,,,https://twitter.com/dog_rates/status/740373189...,9,11,,,,,
1120,731156023742988288,,,2016-05-13 16:15:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to this unbelievably well behaved sq...,,,,https://twitter.com/dog_rates/status/731156023...,204,170,this,,,,
1165,722974582966214656,,,2016-04-21 02:25:47 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Happy 4/20 from the squad! 13/10 for all https...,,,,https://twitter.com/dog_rates/status/722974582...,4,20,,,,,
1202,716439118184652801,,,2016-04-03 01:36:11 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Bluebert. He just saw that both #Final...,,,,https://twitter.com/dog_rates/status/716439118...,50,50,Bluebert,,,,


##### There are  23 rows where rating denominator is not equal to 10

##### Lets take a closer look at the rating numerator

In [31]:
tweet_archive_data.rating_numerator.value_counts()

12      558
11      464
10      461
13      351
9       158
8       102
7        55
14       54
5        37
6        32
3        19
4        17
1         9
2         9
420       2
0         2
15        2
75        2
80        1
20        1
24        1
26        1
44        1
50        1
60        1
165       1
84        1
88        1
144       1
182       1
143       1
666       1
960       1
1776      1
17        1
27        1
45        1
99        1
121       1
204       1
Name: rating_numerator, dtype: int64

##### These ratings almost always have a denominator of 10. The numerators, though? Almost always greater than 10. 11/10, 12/10, 13/10, etc. Why? Because ["they're good dogs Brent."](https://knowyourmeme.com/memes/theyre-good-dogs-brent) WeRateDogs has over 4 million followers and has received international media coverage.

In [32]:
tweet_archive_data.query("rating_numerator > 14").sort_values('rating_numerator')

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
285,838916489579200512,,,2017-03-07 00:57:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @KibaDva: I collected all the good dogs!! 1...,8.38906e+17,811740800.0,2017-03-07 00:15:46 +0000,https://twitter.com/KibaDva/status/83890598062...,15,10,,,,,
291,838085839343206401,8.380855e+17,2894131000.0,2017-03-04 17:56:49 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@bragg6of8 @Andy_Pace_ we are still looking fo...,,,,,15,10,,,,,
55,881633300179243008,8.81607e+17,47384430.0,2017-07-02 21:58:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@roushfenway These are good dogs but 17/10 is ...,,,,,17,10,,,,,
1663,682808988178739200,6.827884e+17,4196984000.0,2016-01-01 06:22:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...","I'm aware that I could've said 20/16, but here...",,,,,20,16,,,,,
516,810984652412424192,,,2016-12-19 23:06:23 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Sam. She smiles 24/7 &amp; secretly aspir...,,,,"https://www.gofundme.com/sams-smile,https://tw...",24,7,Sam,,,,
1712,680494726643068929,,,2015-12-25 21:06:00 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have uncovered an entire battalion of ...,,,,https://twitter.com/dog_rates/status/680494726...,26,10,,,,,
763,778027034220126208,,,2016-09-20 00:24:34 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Sophie. She's a Jubilant Bush Pupper. ...,,,,https://twitter.com/dog_rates/status/778027034...,27,10,Sophie,,,pupper,
1433,697463031882764288,,,2016-02-10 16:51:59 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Happy Wednesday here's a bucket of pups. 44/40...,,,,https://twitter.com/dog_rates/status/697463031...,44,40,,,,,
1274,709198395643068416,,,2016-03-14 02:04:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...","From left to right:\nCletus, Jerome, Alejandro...",,,,https://twitter.com/dog_rates/status/709198395...,45,50,,,,,
1202,716439118184652801,,,2016-04-03 01:36:11 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Bluebert. He just saw that both #Final...,,,,https://twitter.com/dog_rates/status/716439118...,50,50,Bluebert,,,,


##### There are 28 rows with ratings numerator greater than 14 ..... are these exceeding the ratings for "They are good dog brent" these may be erroneous.

##### Lets take a look at the rows where ratings numerator is greater than 14 and rating denominator is not equal to 10

In [33]:
tweet_archive_data.query("rating_numerator > 14 & rating_denominator != 10").sort_values('rating_numerator')

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
1663,682808988178739200,6.827884e+17,4196984000.0,2016-01-01 06:22:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...","I'm aware that I could've said 20/16, but here...",,,,,20,16,,,,,
516,810984652412424192,,,2016-12-19 23:06:23 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Sam. She smiles 24/7 &amp; secretly aspir...,,,,"https://www.gofundme.com/sams-smile,https://tw...",24,7,Sam,,,,
1433,697463031882764288,,,2016-02-10 16:51:59 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Happy Wednesday here's a bucket of pups. 44/40...,,,,https://twitter.com/dog_rates/status/697463031...,44,40,,,,,
1274,709198395643068416,,,2016-03-14 02:04:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...","From left to right:\nCletus, Jerome, Alejandro...",,,,https://twitter.com/dog_rates/status/709198395...,45,50,,,,,
1202,716439118184652801,,,2016-04-03 01:36:11 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Bluebert. He just saw that both #Final...,,,,https://twitter.com/dog_rates/status/716439118...,50,50,Bluebert,,,,
1351,704054845121142784,,,2016-02-28 21:25:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here is a whole flock of puppers. 60/50 I'll ...,,,,https://twitter.com/dog_rates/status/704054845...,60,50,a,,,,
1254,710658690886586372,,,2016-03-18 02:46:49 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here's a brigade of puppers. All look very pre...,,,,https://twitter.com/dog_rates/status/710658690...,80,80,,,,,
433,820690176645140481,,,2017-01-15 17:52:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",The floofs have been released I repeat the flo...,,,,https://twitter.com/dog_rates/status/820690176...,84,70,,,,,
1843,675853064436391936,,,2015-12-13 01:41:41 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have an entire platoon of puppers. Tot...,,,,https://twitter.com/dog_rates/status/675853064...,88,80,,,,,
1228,713900603437621249,,,2016-03-27 01:29:02 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Happy Saturday here's 9 puppers on a bench. 99...,,,,https://twitter.com/dog_rates/status/713900603...,99,90,,,,,


#####  There are 16 records that have ratings numerator greater than 14 and ratings denominator not equal to 10
##### These records definitely look errorneous as the ratings do not make any sense

##### Lets take a look at the last four columns of tweet_archive_data dataframe

In [34]:
tweet_archive_data[['doggo','floofer','pupper','puppo']].head(10)

Unnamed: 0,doggo,floofer,pupper,puppo
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
5,,,,
6,,,,
7,,,,
8,,,,
9,doggo,,,


##### Looks like these four columns represent data for dog breed , which can be represented in one column called dog_breed

In [35]:
tweet_image_data.head(5)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [36]:
tweet_image_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


###### There are 12 columns and 2075 observations
###### Column names are not very descriptive.
######  None of the columns have NaN values

###### lets see if there are any duplicate tweet_ids

In [37]:
tweet_image_data[tweet_image_data.tweet_id.duplicated()]

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog


###### There are no duplicate tweet_ids so these are all unique tweets or retweets may be
###### Lets see if there are duplicates in image urls, i.e if a image is used my multiple tweets.

In [38]:
tweet_image_data[tweet_image_data.jpg_url.duplicated(keep=False)].sort_values('jpg_url')

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
480,675354435921575936,https://pbs.twimg.com/ext_tw_video_thumb/67535...,1,upright,0.303415,False,golden_retriever,0.181351,True,Brittany_spaniel,0.162084,True
1297,752309394570878976,https://pbs.twimg.com/ext_tw_video_thumb/67535...,1,upright,0.303415,False,golden_retriever,0.181351,True,Brittany_spaniel,0.162084,True
1864,842892208864923648,https://pbs.twimg.com/ext_tw_video_thumb/80710...,1,Chihuahua,0.505370,True,Pomeranian,0.120358,True,toy_terrier,0.077008,True
1641,807106840509214720,https://pbs.twimg.com/ext_tw_video_thumb/80710...,1,Chihuahua,0.505370,True,Pomeranian,0.120358,True,toy_terrier,0.077008,True
1703,817181837579653120,https://pbs.twimg.com/ext_tw_video_thumb/81596...,1,Tibetan_mastiff,0.506312,True,Tibetan_terrier,0.295690,True,otterhound,0.036251,True
1691,815966073409433600,https://pbs.twimg.com/ext_tw_video_thumb/81596...,1,Tibetan_mastiff,0.506312,True,Tibetan_terrier,0.295690,True,otterhound,0.036251,True
1705,817423860136083457,https://pbs.twimg.com/ext_tw_video_thumb/81742...,1,ice_bear,0.336200,False,Samoyed,0.201358,True,Eskimo_dog,0.186789,True
1858,841833993020538882,https://pbs.twimg.com/ext_tw_video_thumb/81742...,1,ice_bear,0.336200,False,Samoyed,0.201358,True,Eskimo_dog,0.186789,True
1715,819004803107983360,https://pbs.twimg.com/media/C12whDoVEAALRxa.jpg,1,standard_poodle,0.351308,True,toy_poodle,0.271929,True,Tibetan_terrier,0.094759,True
1718,819015337530290176,https://pbs.twimg.com/media/C12whDoVEAALRxa.jpg,1,standard_poodle,0.351308,True,toy_poodle,0.271929,True,Tibetan_terrier,0.094759,True


###### Interesting to see image url is shared by more than one tweet, what does this mean?
###### Since image predictions data is created for the twitter ids in tweet_archive_data, lets take a look at the tweet archive data for a sample of these tweets sharing the same image url.

In [39]:
tweet_archive_data.query("tweet_id in (675354435921575936,752309394570878976)")

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
949,752309394570878976,,,2016-07-11 01:11:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: Everyone needs to watch this. 1...,6.753544e+17,4196984000.0,2015-12-11 16:40:19 +0000,https://twitter.com/dog_rates/status/675354435...,13,10,,,,,
1865,675354435921575936,,,2015-12-11 16:40:19 +0000,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Everyone needs to watch this. 13/10 https://t....,,,,https://twitter.com/dog_rates/status/675354435...,13,10,,,,,


In [40]:
tweet_archive_data.query("tweet_id in (711694788429553666,761371037149827077)")

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
872,761371037149827077,,,2016-08-05 01:19:35 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: Oh. My. God. 13/10 magical af h...,7.116948e+17,4196984000.0,2016-03-20 23:23:54 +0000,https://twitter.com/dog_rates/status/711694788...,13,10,,,,,
1246,711694788429553666,,,2016-03-20 23:23:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Oh. My. God. 13/10 magical af https://t.co/Ezu...,,,,https://twitter.com/dog_rates/status/711694788...,13,10,,,,,


###### If you look at the retweeted_status_id , it holds the original tweet_id if it is a retweet. This is one way to recognize retweets. 

In [41]:
tweet_count_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2340 entries, 0 to 2339
Data columns (total 3 columns):
tweet_id          2340 non-null int64
retweet_count     2340 non-null int64
favorite_count    2340 non-null int64
dtypes: int64(3)
memory usage: 54.9 KB


###### There are 2340 observations and three columns. There are no NaN values

### Quality

#### twitter-archive-enhanced.csv

###### 1. in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id , retweeted_status_user_id, retweeted_status_timestamp columns have majority null values.
###### 2. retweeted_status_id  is populated for retweets. To prepare a simple dataset to join with tweet counts lets remove the retweets.
###### 3. in_reply_to_status_id is populated for tweet replies.Lets remove these tweet replies.
###### 4. name column has majority of the rows ( 745) have None populated.There are also 109 rows that have invalid values for name with values like ( 'the','a','an','such' etc).
###### 5. Invalid values in ratings numerator and rating denominator. There are 16 records that have ratings numerator greater than 14 and ratings denominator not equal to 10.
###### 6. timestamp column is not in date type.
###### 7. When one of the columns in columns doggo, floofer, pupper, puppo is populated, other columns are populated with a value "None" . This should be represented as a Null value.
###### 8. Source column data is not very useful for the analysis. Delete this column.

#### image-predictions.tsv

###### 9. column names must be renamed to something more descriptive about the data it holds
###### 10. Some breed name values populated in p1_dog are starting with upper case and some are starting with lower case letter.


### Tidiness 

#### twitter-archive-enhanced.csv

###### 1. Dog breed value can be restructured to be stored in one column instead of four different columns.

#### image-predictions.tsv

###### 2.  Re structure  or drop the columns that are unnecessary. such as img_num,p2, p2_conf,p2_dog,p3,p3_conf,p3_dog 

###### 3. twitter_archive_data has 2356 rows and twitter_image_data has 2075 rows. After removing the retweets we need to merge the data in these datasets with tweet_counts to have the data in single data frame



# Clean

#### Define
1. Select all the rows that are retweets
2. Delete all rows that are retweets
3. select the rows that are tweet replies
4. Delete the rows that are tweet replies
3. Delete columns in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id , retweeted_status_user_id, retweeted_status_timestamp after removing the retweets

#### Code

In [42]:
# Create a copy of the data frame
# we do not want to loose the original data set
tweet_archive_data_clean = tweet_archive_data.copy()


In [43]:
tweet_archive_data_clean.head(1)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,


In [44]:
# creating boolean series True for retweeted_status_id isnull 
# original tweets have retweeted_status_id as null
retweet_is_null = pd.isnull(tweet_archive_data_clean['retweeted_status_id'])
tweet_archive_data_clean = tweet_archive_data_clean[retweet_is_null]

In [45]:
# creating boolean series True for reply_status_id isnull 
# original tweets have retweeted_status_id as null
reply_is_null = pd.isnull(tweet_archive_data_clean['in_reply_to_status_id'])
tweet_archive_data_clean = tweet_archive_data_clean[reply_is_null]

In [46]:
# Drop columns in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,
# retweeted_status_user_id, retweeted_status_timestamp 

tweet_archive_data_clean.drop(['in_reply_to_status_id', 'in_reply_to_user_id',
                            'retweeted_status_id','retweeted_status_user_id',
                            'retweeted_status_timestamp'],inplace = True, axis=1)

#### Test

In [47]:
tweet_archive_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 12 columns):
tweet_id              2097 non-null int64
timestamp             2097 non-null object
source                2097 non-null object
text                  2097 non-null object
expanded_urls         2094 non-null object
rating_numerator      2097 non-null int64
rating_denominator    2097 non-null int64
name                  2097 non-null object
doggo                 2097 non-null object
floofer               2097 non-null object
pupper                2097 non-null object
puppo                 2097 non-null object
dtypes: int64(3), object(9)
memory usage: 213.0+ KB


#### Define
name column has majority of the rows ( 745) have None populated.There are also 109 rows that have invalid values for name with values like ( 'the','a','an','such' etc).
1. Gather the values in the name column that start with lowercase. These seem to be invalid values. Store them in a list.
2. Replace these values with null values
3. Replace None with Null values


#### Code

In [48]:
# Create a boolean series for name starting with lowercase letter
name_lower_case = tweet_archive_data_clean['name'].str[0].str.islower()

In [49]:
import numpy as np

In [50]:
# store these name values into an array
invalid_names = tweet_archive_data_clean[name_lower_case].name.values

In [51]:
invalid_names

array(['such', 'a', 'quite', 'quite', 'not', 'one', 'incredibly', 'a', 'a',
       'very', 'my', 'one', 'not', 'his', 'one', 'a', 'a', 'a', 'an',
       'very', 'actually', 'a', 'just', 'getting', 'mad', 'very', 'this',
       'unacceptable', 'all', 'a', 'old', 'a', 'infuriating', 'a', 'a',
       'a', 'an', 'a', 'a', 'very', 'getting', 'just', 'a', 'the', 'the',
       'actually', 'by', 'a', 'officially', 'a', 'the', 'the', 'a', 'a',
       'a', 'a', 'life', 'a', 'one', 'a', 'a', 'a', 'light', 'just',
       'space', 'a', 'the', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a',
       'an', 'a', 'the', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a',
       'a', 'quite', 'a', 'an', 'a', 'an', 'the', 'the', 'a', 'a', 'an',
       'a', 'a', 'a', 'a'], dtype=object)

In [52]:
# replace the above values in the name column with null values
tweet_archive_data_clean.name.replace(invalid_names,np.nan, inplace = True)

In [53]:
# replace the "None" value name column with null values
tweet_archive_data_clean.name.replace('None',np.nan, inplace = True)

#### Test

In [54]:
# Create a boolean series for name starting with lowercase letter
tweet_archive_data_clean.name.unique()

array(['Phineas', 'Tilly', 'Archie', 'Darla', 'Franklin', nan, 'Jax',
       'Zoey', 'Cassie', 'Koda', 'Bruno', 'Ted', 'Stuart', 'Oliver', 'Jim',
       'Zeke', 'Ralphus', 'Gerald', 'Jeffrey', 'Canela', 'Maya', 'Mingus',
       'Derek', 'Roscoe', 'Waffles', 'Jimbo', 'Maisey', 'Earl', 'Lola',
       'Kevin', 'Yogi', 'Noah', 'Bella', 'Grizzwald', 'Rusty', 'Gus',
       'Stanley', 'Alfy', 'Koko', 'Rey', 'Gary', 'Elliot', 'Louis',
       'Jesse', 'Romeo', 'Bailey', 'Duddles', 'Jack', 'Steven', 'Beau',
       'Snoopy', 'Shadow', 'Emmy', 'Aja', 'Penny', 'Dante', 'Nelly',
       'Ginger', 'Benedict', 'Venti', 'Goose', 'Nugget', 'Cash', 'Jed',
       'Sebastian', 'Sierra', 'Monkey', 'Harry', 'Kody', 'Lassie', 'Rover',
       'Napolean', 'Boomer', 'Cody', 'Rumble', 'Clifford', 'Dewey',
       'Scout', 'Gizmo', 'Walter', 'Cooper', 'Harold', 'Shikha', 'Lili',
       'Jamesy', 'Coco', 'Sammy', 'Meatball', 'Paisley', 'Albus',
       'Neptune', 'Belle', 'Quinn', 'Zooey', 'Dave', 'Jersey', 'Hobbes',


#### Define
1. Change the timestamp column to date data type

#### Code

In [55]:
tweet_archive_data_clean.timestamp = pd.to_datetime( tweet_archive_data_clean.timestamp,infer_datetime_format=True)

In [56]:
tweet_archive_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 12 columns):
tweet_id              2097 non-null int64
timestamp             2097 non-null datetime64[ns]
source                2097 non-null object
text                  2097 non-null object
expanded_urls         2094 non-null object
rating_numerator      2097 non-null int64
rating_denominator    2097 non-null int64
name                  1390 non-null object
doggo                 2097 non-null object
floofer               2097 non-null object
pupper                2097 non-null object
puppo                 2097 non-null object
dtypes: datetime64[ns](1), int64(3), object(8)
memory usage: 213.0+ KB


#### Define
There are 23 rows where rating denominator is not equal to 10
 For consistency purposes I onyl want to keep the records where denominator is  equal to 10.

1. Filter out the rows where ratings denominator is not equal to 10

#### Code

In [57]:
tweet_archive_data_clean = tweet_archive_data_clean[tweet_archive_data_clean.rating_denominator == 10]

#### Test

In [58]:
tweet_archive_data_clean.rating_denominator.unique()

array([10])

#### Define
1. Filter out extreme outliers in Ratings numerator
i.e delete rows where ratings numerator is ( 1776,420,75) any ratings greater than equal to 74

In [59]:
tweet_archive_data_clean.rating_numerator.unique()

array([  13,   12,   14,    5,   11,    6,   10,    0,   75,   27,    3,
          7,    8,    9,    4, 1776,   26,    2,    1,  420])

#### Code

In [60]:
tweet_archive_data_clean = tweet_archive_data_clean[tweet_archive_data_clean.rating_numerator < 74]

#### Test

In [61]:
tweet_archive_data_clean.rating_numerator.unique()

array([13, 12, 14,  5, 11,  6, 10,  0, 27,  3,  7,  8,  9,  4, 26,  2,  1])

#### Define
Delete source and expanded_urls column

#### Code

In [62]:
tweet_archive_data_clean.drop(['source','expanded_urls'],inplace = True, axis=1)

#### Test

In [63]:
tweet_archive_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2077 entries, 0 to 2355
Data columns (total 10 columns):
tweet_id              2077 non-null int64
timestamp             2077 non-null datetime64[ns]
text                  2077 non-null object
rating_numerator      2077 non-null int64
rating_denominator    2077 non-null int64
name                  1385 non-null object
doggo                 2077 non-null object
floofer               2077 non-null object
pupper                2077 non-null object
puppo                 2077 non-null object
dtypes: datetime64[ns](1), int64(3), object(6)
memory usage: 178.5+ KB


##### Tidiness for twitter_archive_data

#### Define

1. Create a new columns called breed_type and populated it using the data in the columns doggo,floofer,pupper and puppo. 


#### Code

In [64]:
# replace None value with Null
columns = ['doggo', 'floofer','pupper', 'puppo']
for col in columns:
    tweet_archive_data_clean[col].replace('None',np.nan, inplace = True)

In [65]:
tweet_archive_data_clean[columns].head(5)

Unnamed: 0,doggo,floofer,pupper,puppo
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,


In [66]:
def breed_type(in_cols):
    for col in in_cols:
        if col in columns:
            return col
        else:
            pass
    

In [67]:
tweet_archive_data_clean['breed'] = tweet_archive_data_clean[columns].apply(breed_type,axis=1 )

In [68]:
tweet_archive_data_clean[['breed','doggo', 'floofer','pupper', 'puppo']]

Unnamed: 0,breed,doggo,floofer,pupper,puppo
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,doggo,doggo,,,


In [69]:
tweet_archive_data_clean.drop(['doggo', 'floofer', 'pupper', 'puppo'], 
    inplace = True, axis = 1)

In [70]:
tweet_archive_data_clean.breed.unique()

array([None, 'doggo', 'puppo', 'pupper', 'floofer'], dtype=object)

#### Test

In [71]:
tweet_archive_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2077 entries, 0 to 2355
Data columns (total 7 columns):
tweet_id              2077 non-null int64
timestamp             2077 non-null datetime64[ns]
text                  2077 non-null object
rating_numerator      2077 non-null int64
rating_denominator    2077 non-null int64
name                  1385 non-null object
breed                 336 non-null object
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 129.8+ KB


#### Quality for  tweet_image_data

#### Define
1. Drop the columns img_num,p1_dog,p2, p2_conf, p2_dog, p3, p3_conf,p3_dog
2. Rename the columns jpg_url, p1, p1_conf and p1_dog to descriptive column names

#### Code

In [72]:
# Create a copy of the data frame
# we do not want to update the original data set
tweet_image_data_clean = tweet_image_data.copy()



In [73]:
tweet_image_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [74]:
tweet_image_data_clean.drop(['img_num','p1_dog','p2', 'p2_conf', 'p2_dog', 'p3', 
                  'p3_conf', 'p3_dog'], inplace = True, axis = 1)

In [75]:
tweet_image_data_clean.rename(columns = {'jpg_url': 'image_url', 
                               'p1': 'prediction', 
                               'p1_conf': 'confidence_level'}, inplace = True)

In [76]:
tweet_image_data_clean.head()

Unnamed: 0,tweet_id,image_url,prediction,confidence_level
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,Welsh_springer_spaniel,0.465074
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,redbone,0.506826
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,German_shepherd,0.596461
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,Rhodesian_ridgeback,0.408143
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,miniature_pinscher,0.560311


#### Tidiness
Join the three data sets into one single data frame

In [77]:
tweet_archive_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2077 entries, 0 to 2355
Data columns (total 7 columns):
tweet_id              2077 non-null int64
timestamp             2077 non-null datetime64[ns]
text                  2077 non-null object
rating_numerator      2077 non-null int64
rating_denominator    2077 non-null int64
name                  1385 non-null object
breed                 336 non-null object
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 129.8+ KB


In [78]:
tweet_image_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 4 columns):
tweet_id            2075 non-null int64
image_url           2075 non-null object
prediction          2075 non-null object
confidence_level    2075 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 64.9+ KB


In [79]:
tweet_count_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2340 entries, 0 to 2339
Data columns (total 3 columns):
tweet_id          2340 non-null int64
retweet_count     2340 non-null int64
favorite_count    2340 non-null int64
dtypes: int64(3)
memory usage: 54.9 KB


#### Code

In [80]:
# Merge the three data frames
twitter_archive_master = tweet_archive_data_clean.merge(tweet_count_data,
                                                   on = 'tweet_id', how = 'inner')
twitter_archive_master = twitter_archive_master.merge(tweet_image_data_clean, 
                                            on = 'tweet_id', how = 'left')

##### Test

In [81]:
twitter_archive_master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id              2075 non-null int64
timestamp             2075 non-null datetime64[ns]
text                  2075 non-null object
rating_numerator      2075 non-null int64
rating_denominator    2075 non-null int64
name                  1383 non-null object
breed                 336 non-null object
retweet_count         2075 non-null int64
favorite_count        2075 non-null int64
image_url             1949 non-null object
prediction            1949 non-null object
confidence_level      1949 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(5), object(5)
memory usage: 210.7+ KB


In [82]:
twitter_archive_master.query("tweet_id in ('890729181411237888','890240255349198849')")

Unnamed: 0,tweet_id,timestamp,text,rating_numerator,rating_denominator,name,breed,retweet_count,favorite_count,image_url,prediction,confidence_level
7,890729181411237888,2017-07-28 00:22:40,When you watch your owner call another dog a g...,13,10,,,18422,64098,https://pbs.twimg.com/media/DFyBahAVwAAhUTd.jpg,Pomeranian,0.566142
9,890240255349198849,2017-07-26 15:59:51,This is Cassie. She is a college pup. Studying...,14,10,Cassie,doggo,7209,31290,https://pbs.twimg.com/media/DFrEyVuW0AAO3t9.jpg,Pembroke,0.511319


In [83]:
tweet_image_data_clean.query("tweet_id in ('890729181411237888','890240255349198849')")

Unnamed: 0,tweet_id,image_url,prediction,confidence_level
2065,890240255349198849,https://pbs.twimg.com/media/DFrEyVuW0AAO3t9.jpg,Pembroke,0.511319
2067,890729181411237888,https://pbs.twimg.com/media/DFyBahAVwAAhUTd.jpg,Pomeranian,0.566142


In [84]:
tweet_count_data.query("tweet_id in ('890729181411237888','890240255349198849')")

Unnamed: 0,tweet_id,retweet_count,favorite_count
7,890729181411237888,18422,64098
9,890240255349198849,7209,31290


# Store 

In [85]:
# Store tables to files
tweet_count_data.to_csv('tweet_count_data_clean.csv', encoding='utf-8', index=False)
tweet_image_data_clean.to_csv('tweet_image_data_clean.csv', encoding='utf-8', index=False)
tweet_archive_data_clean.to_csv('tweet_archive_data_clean.csv', encoding='utf-8', index=False)
twitter_archive_master.to_csv('twitter_archive_master.csv', encoding='utf-8', index=False)

### Code for data Analysis and Visualization is in act_report.ipynb