In [4]:
# Importing all important statements
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import requests
import seaborn as sns
import tweepy
from tweepy import OAuthHandler
from timeit import default_timer as timer

from datetime import datetime
from functools import reduce

% matplotlib inline

# Gathering Data

### Twitter Archive (Local File)

In [5]:
# Load the enhanced Twitter archive from the local CSV file into a DataFrame
twitter1 = pd.read_csv('twitter-archive-enhanced.csv')

### Image prediction URL

In [7]:
# Download the image predictions file from the Udacity website and save it
# next to the notebook as image_predictions.tsv.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
# Stop here on an HTTP error instead of silently saving an error page as the TSV
response.raise_for_status()
with open('image_predictions.tsv', mode = 'wb') as file:
    file.write(response.content)

In [8]:
# Load the downloaded image predictions file (tab-separated) into a DataFrame
images = pd.read_csv('image_predictions.tsv', sep = '\t')

### Tweets (Twitter API)

In [5]:
# Credentials for accessing the API.
# The values are read from environment variables so that real keys never have
# to be typed into (and saved with) the notebook. The 'hidden' placeholder is
# kept as the fallback when a variable is not set.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'hidden')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'hidden')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'hidden')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', 'hidden')


# OAuth 1a handler: application (consumer) credentials plus the user access token
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

In [6]:
# Build the API client from the OAuth handler. With wait_on_rate_limit the
# client waits for the rate limit to replenish instead of failing, and
# wait_on_rate_limit_notify prints a notice while it is waiting.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [9]:
# Tweet IDs for which to gather additional data via Twitter's API
# (taken as a NumPy array from the archive's tweet_id column)
tweet_ids = twitter1.tweet_id.values
# Number of IDs that will be queried
len(tweet_ids)

2356

In [8]:
# Using the tweet IDs in the Twitter archive, query the Twitter API for each tweet's JSON.
# Every tweet that can be fetched is written as one JSON document per line to
# tweet_json.txt; every ID that fails is logged with its reason to getstatus_error.txt.

start = timer() # start timer (default_timer() returns seconds)

valid_ids = 0
err_ids = 0
tweet_ids = twitter1.tweet_id

with open('getstatus_error.txt', 'w', encoding='utf-8') as errfile, \
        open('tweet_json.txt', 'w', encoding='utf-8') as outfile:
    # Series.items() yields (index, value) pairs and exists in old and new pandas alike
    for i, tweet_id in tweet_ids.items():
        print("%s# %s" % (str(i+1), tweet_id))
        try:
            # Get tweet data using Twitter API
            tweet = api.get_status(tweet_id, tweet_mode='extended')
        except tweepy.TweepError as e:
            err_ids += 1
            # Prefer the API's own error message; fall back to the exception
            # text when there is no HTTP response (e.g. a connection failure)
            # or the body is not the expected JSON structure.
            try:
                err_msg = e.response.json()['errors'][0]['message']
            except (AttributeError, ValueError, KeyError, IndexError, TypeError):
                err_msg = str(e)
            err_str = '%s: %s\n' % (tweet_id, err_msg)
            errfile.write(err_str)
            print(err_str)
            continue

        # Write each tweet's JSON data to its own line in a file
        json_content = tweet._json
        json.dump(json_content, outfile)
        outfile.write('\n')
        valid_ids += 1

    print("%s %s" % ('Valid tweets:', valid_ids))
    print("%s %s" % ('Error tweets:', err_ids))

end = timer() # end timer
# Elapsed wall-clock time in minutes (seconds / 60)
print((end - start) / 60)

1# 892420643555336193
2# 892177421306343426
3# 891815181378084864
4# 891689557279858688
5# 891327558926688256
6# 891087950875897856
7# 890971913173991426
8# 890729181411237888
9# 890609185150312448
10# 890240255349198849
11# 890006608113172480
12# 889880896479866881
13# 889665388333682689
14# 889638837579907072
15# 889531135344209921
16# 889278841981685760
17# 888917238123831296
18# 888804989199671297
19# 888554962724278272
20# 888202515573088257
888202515573088257: No status found with that ID.

21# 888078434458587136
22# 887705289381826560
23# 887517139158093824
24# 887473957103951883
25# 887343217045368832
26# 887101392804085760
27# 886983233522544640
28# 886736880519319552
29# 886680336477933568
30# 886366144734445568
31# 886267009285017600
32# 886258384151887873
33# 886054160059072513
34# 885984800019947520
35# 885528943205470208
36# 885518971528720385
37# 885311592912609280
38# 885167619883638784
39# 884925521741709313
40# 884876753390489601
41# 884562892145688576
42# 88444180538

In [10]:
# Read the tweets' JSON data line by line (one JSON document per line) into a
# list of dictionaries, keeping only the fields needed later, then convert the
# list to a DataFrame.
df_list = []
# The file was written as UTF-8, so read it back with the same encoding
with open('tweet_json.txt', 'r', encoding='utf-8') as json_file:
    for line in json_file:
        # Skip blank lines (e.g. a stray trailing newline) instead of crashing in json.loads
        if not line.strip():
            continue
        status = json.loads(line)

        # Append to list of dictionaries
        df_list.append({'tweet_id': status['id'],
                        'retweet_count': status['retweet_count'],
                        'favorite_count': status['favorite_count'],
                        'display_text_range': status['display_text_range']
                       })

# Create a DataFrame with tweet ID, retweet count, favorite count and display_text_range
twitter2 = pd.DataFrame(df_list, columns = ['tweet_id', 'retweet_count', 'favorite_count', 'display_text_range'])

# Assessing Data

In [11]:
# Visual assessment: three randomly chosen rows of the archive (differs on every run)
twitter1.sample(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
135,866450705531457537,,,2017-05-22 00:28:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Jamesy. He gives a kiss to every other...,,,,https://twitter.com/dog_rates/status/866450705...,13,10,Jamesy,,,pupper,
1153,725729321944506368,,,2016-04-28 16:52:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Lola. She's a Butternut Splishnsplash....,,,,https://twitter.com/dog_rates/status/725729321...,12,10,Lola,,,,
542,806219024703037440,,,2016-12-06 19:29:28 +0000,"<a href=""http://twitter.com/download/iphone"" r...",We only rate dogs. Please stop sending in non-...,,,,https://twitter.com/dog_rates/status/806219024...,11,10,incredibly,,,,


In [11]:
# Column dtypes and non-null counts of the archive
twitter1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [12]:
# Occurrences of each tweet_id; a count above 1 would indicate a duplicated tweet
twitter1['tweet_id'].value_counts()

749075273010798592    1
741099773336379392    1
798644042770751489    1
825120256414846976    1
769212283578875904    1
700462010979500032    1
780858289093574656    1
699775878809702401    1
880095782870896641    1
760521673607086080    1
776477788987613185    1
691820333922455552    1
715696743237730304    1
714606013974974464    1
760539183865880579    1
813157409116065792    1
676430933382295552    1
743510151680958465    1
837012587749474308    1
833722901757046785    1
818259473185828864    1
670704688707301377    1
667160273090932737    1
674394782723014656    1
672082170312290304    1
670093938074779648    1
759923798737051648    1
809920764300447744    1
805487436403003392    1
838085839343206401    1
                     ..
763956972077010945    1
870308999962521604    1
720775346191278080    1
785927819176054784    1
783347506784731136    1
775733305207554048    1
834209720923721728    1
825026590719483904    1
758405701903519748    1
668986018524233728    1
6909388994772213

In [13]:
# Distinct source values and how often each occurs
twitter1['source'].value_counts()

<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>     2221
<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>                          91
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                       33
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>      11
Name: source, dtype: int64

In [14]:
# Occurrences of each tweet text; a count above 1 would indicate repeated text
twitter1['text'].value_counts()

This is Comet. He's a Wild Estonian Poofer. Surprised they caught him. 12/10 would pet well https://t.co/tlfuZ25IMi                                                   1
RT @dog_rates: This is Sampson. He just graduated. Ready to be a doggo now. Time for the real world. 12/10 have fun with taxes https://t.co…                          1
RT @dog_rates: We normally don't rate marshmallows but this one appears to be flawlessly toasted so I'll make an exception. 10/10 https://t…                          1
Meet Winston. He's trapped in a cup of coffee. Poor pupper. 10/10 someone free him https://t.co/2e6cUtKUuc                                                            1
This is Lucy. She destroyed not one, but two remotes trying to turn off the debate. 11/10 relatable af https://t.co/3BXh073tDm                                        1
This is Fwed. He is a Canadian Asian Taylormade. Was having a blast until pink spiky football attacked. 8/10 https://t.co/A37eGLz5WS                            

In [13]:
# Occurrences of each retweeted_status_id (value_counts ignores NaN, so only rows that have one are counted)
twitter1['retweeted_status_id'].value_counts()

7.757333e+17    1
7.507196e+17    1
6.742918e+17    1
6.833919e+17    1
8.269587e+17    1
8.780576e+17    1
7.320056e+17    1
7.186315e+17    1
6.732953e+17    1
7.914070e+17    1
8.447048e+17    1
7.862340e+17    1
8.685523e+17    1
7.504293e+17    1
8.327664e+17    1
6.690004e+17    1
6.873173e+17    1
7.638376e+17    1
7.815247e+17    1
8.092201e+17    1
8.000650e+17    1
8.174239e+17    1
8.001414e+17    1
7.909461e+17    1
7.867091e+17    1
8.406323e+17    1
8.688804e+17    1
7.869631e+17    1
7.733088e+17    1
8.222448e+17    1
               ..
6.816941e+17    1
7.899865e+17    1
7.939622e+17    1
6.800555e+17    1
7.128090e+17    1
8.164506e+17    1
6.769365e+17    1
6.675487e+17    1
8.083449e+17    1
7.626999e+17    1
6.678667e+17    1
8.479710e+17    1
6.820881e+17    1
8.352641e+17    1
7.761133e+17    1
7.902771e+17    1
6.675484e+17    1
7.677549e+17    1
8.782815e+17    1
8.663350e+17    1
7.399792e+17    1
7.403732e+17    1
8.395493e+17    1
7.001438e+17    1
8.482894e+

In [14]:
# Occurrences of each retweeted_status_user_id (NaN rows are ignored by value_counts)
twitter1['retweeted_status_user_id'].value_counts()

4.196984e+09    156
4.296832e+09      2
5.870972e+07      1
6.669901e+07      1
4.119842e+07      1
7.475543e+17      1
7.832140e+05      1
7.266347e+08      1
4.871977e+08      1
5.970642e+08      1
4.466750e+07      1
1.228326e+09      1
7.992370e+07      1
2.488557e+07      1
7.874618e+17      1
3.638908e+08      1
5.128045e+08      1
8.117408e+08      1
1.732729e+09      1
1.960740e+07      1
1.547674e+08      1
3.410211e+08      1
7.124572e+17      1
2.804798e+08      1
1.950368e+08      1
Name: retweeted_status_user_id, dtype: int64

In [17]:
# Occurrences of each retweeted_status_timestamp (NaN rows are ignored by value_counts)
twitter1['retweeted_status_timestamp'].value_counts()

2016-09-01 16:14:48 +0000    1
2016-10-27 16:06:04 +0000    1
2016-05-16 00:31:53 +0000    1
2016-06-07 00:36:02 +0000    1
2015-11-19 00:32:12 +0000    1
2017-06-23 01:10:23 +0000    1
2017-02-24 23:04:14 +0000    1
2016-12-17 22:43:27 +0000    1
2017-01-20 00:50:15 +0000    1
2017-05-06 17:49:42 +0000    1
2016-07-05 20:41:01 +0000    1
2015-12-12 02:23:01 +0000    1
2015-12-16 01:27:03 +0000    1
2016-10-23 00:27:05 +0000    1
2015-12-02 03:40:57 +0000    1
2015-12-21 22:15:18 +0000    1
2017-04-26 02:37:47 +0000    1
2017-07-15 02:44:07 +0000    1
2016-06-08 02:41:38 +0000    1
2015-12-06 00:17:55 +0000    1
2016-09-28 00:46:20 +0000    1
2017-03-16 00:00:07 +0000    1
2016-08-08 01:44:46 +0000    1
2016-10-06 15:49:14 +0000    1
2016-09-25 00:06:08 +0000    1
2016-10-13 23:23:56 +0000    1
2016-12-17 00:38:52 +0000    1
2016-01-08 05:00:14 +0000    1
2016-09-09 18:31:54 +0000    1
2015-12-11 16:40:19 +0000    1
                            ..
2016-06-10 00:39:48 +0000    1
2016-10-

In [18]:
# Occurrences of each expanded_urls value; a count above 1 means the same URL string appears in several rows
twitter1['expanded_urls'].value_counts()

https://twitter.com/dog_rates/status/859196978902773760/video/1                                                                                                                                                                                                                                              2
https://twitter.com/dog_rates/status/844704788403113984/photo/1                                                                                                                                                                                                                                              2
https://twitter.com/dog_rates/status/740373189193256964/photo/1,https://twitter.com/dog_rates/status/740373189193256964/photo/1,https://twitter.com/dog_rates/status/740373189193256964/photo/1,https://twitter.com/dog_rates/status/740373189193256964/photo/1                                              2
https://twitter.com/dog_rates/status/739979191639244800/photo/1                            

In [19]:
# Distribution of rating numerators, used to spot unusual values
twitter1['rating_numerator'].value_counts()

12      558
11      464
10      461
13      351
9       158
8       102
7        55
14       54
5        37
6        32
3        19
4        17
1         9
2         9
420       2
0         2
15        2
75        2
80        1
20        1
24        1
26        1
44        1
50        1
60        1
165       1
84        1
88        1
144       1
182       1
143       1
666       1
960       1
1776      1
17        1
27        1
45        1
99        1
121       1
204       1
Name: rating_numerator, dtype: int64

In [20]:
# Distribution of rating denominators, used to spot values other than the usual one
twitter1['rating_denominator'].value_counts()

10     2333
11        3
50        3
80        2
20        2
2         1
16        1
40        1
70        1
15        1
90        1
110       1
120       1
130       1
150       1
170       1
7         1
0         1
Name: rating_denominator, dtype: int64

In [21]:
# Frequency of each extracted dog name, used to spot placeholder or non-name values
twitter1['name'].value_counts()

None          745
a              55
Charlie        12
Cooper         11
Oliver         11
Lucy           11
Tucker         10
Lola           10
Penny          10
Winston         9
Bo              9
Sadie           8
the             8
Toby            7
Buddy           7
Daisy           7
Bailey          7
an              7
Bella           6
Scout           6
Jax             6
Jack            6
Milo            6
Leo             6
Oscar           6
Stanley         6
Rusty           6
Koda            6
Dave            6
Sunny           5
             ... 
Ridley          1
Mairi           1
Cedrick         1
Opie            1
Walker          1
Karl            1
Brandonald      1
Pepper          1
Chef            1
Arlen           1
Dex             1
Jazzy           1
Crouton         1
Goose           1
Bloop           1
Gin             1
Marvin          1
Mo              1
Flash           1
Sobe            1
Roscoe          1
officially      1
Grizz           1
Randall         1
Meera     

In [22]:
# Values present in the doggo stage column and their frequencies
twitter1['doggo'].value_counts()

None     2259
doggo      97
Name: doggo, dtype: int64

In [23]:
# Values present in the floofer stage column and their frequencies
twitter1['floofer'].value_counts()

None       2346
floofer      10
Name: floofer, dtype: int64

In [24]:
# Values present in the pupper stage column and their frequencies
twitter1['pupper'].value_counts()

None      2099
pupper     257
Name: pupper, dtype: int64

In [25]:
# Values present in the puppo stage column and their frequencies
twitter1['puppo'].value_counts()

None     2326
puppo      30
Name: puppo, dtype: int64

In [15]:
# Rows in which more than one of the four dog-stage columns holds something
# other than the string 'None', i.e. tweets tagged with several stages at once
twitter1.loc[
    twitter1[['doggo', 'floofer', 'pupper', 'puppo']]
    .ne('None')
    .sum(axis=1)
    .gt(1)
]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
191,855851453814013952,,,2017-04-22 18:31:02 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here's a puppo participating in the #ScienceMa...,,,,https://twitter.com/dog_rates/status/855851453...,13,10,,doggo,,,puppo
200,854010172552949760,,,2017-04-17 16:34:26 +0000,"<a href=""http://twitter.com/download/iphone"" r...","At first I thought this was a shy doggo, but i...",,,,https://twitter.com/dog_rates/status/854010172...,11,10,,doggo,floofer,,
460,817777686764523521,,,2017-01-07 16:59:28 +0000,"<a href=""http://twitter.com/download/iphone"" r...","This is Dido. She's playing the lead role in ""...",,,,https://twitter.com/dog_rates/status/817777686...,13,10,Dido,doggo,,pupper,
531,808106460588765185,,,2016-12-12 00:29:28 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have Burke (pupper) and Dexter (doggo)...,,,,https://twitter.com/dog_rates/status/808106460...,12,10,,doggo,,pupper,
565,802265048156610565,7.331095e+17,4196984000.0,2016-11-25 21:37:47 +0000,"<a href=""http://twitter.com/download/iphone"" r...","Like doggo, like pupper version 2. Both 11/10 ...",,,,https://twitter.com/dog_rates/status/802265048...,11,10,,doggo,,pupper,
575,801115127852503040,,,2016-11-22 17:28:25 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Bones. He's being haunted by another d...,,,,https://twitter.com/dog_rates/status/801115127...,12,10,Bones,doggo,,pupper,
705,785639753186217984,,,2016-10-11 00:34:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Pinot. He's a sophisticated doggo. You...,,,,https://twitter.com/dog_rates/status/785639753...,10,10,Pinot,doggo,,pupper,
733,781308096455073793,,,2016-09-29 01:42:20 +0000,"<a href=""http://vine.co"" rel=""nofollow"">Vine -...","Pupper butt 1, Doggo 0. Both 12/10 https://t.c...",,,,https://vine.co/v/5rgu2Law2ut,12,10,,doggo,,pupper,
778,775898661951791106,,,2016-09-14 03:27:11 +0000,"<a href=""http://twitter.com/download/iphone"" r...","RT @dog_rates: Like father (doggo), like son (...",7.331095e+17,4196984000.0,2016-05-19 01:38:16 +0000,https://twitter.com/dog_rates/status/733109485...,12,10,,doggo,,pupper,
822,770093767776997377,,,2016-08-29 03:00:36 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is just downright precious...,7.410673e+17,4196984000.0,2016-06-10 00:39:48 +0000,https://twitter.com/dog_rates/status/741067306...,12,10,just,doggo,,pupper,


In [16]:
# Rows whose extracted name is written entirely in uppercase (str.isupper)
twitter1.loc[twitter1['name'].str.isupper()]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
775,776201521193218049,,,2016-09-14 23:30:38 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is O'Malley. That is how he sleeps. Doesn...,,,,https://twitter.com/dog_rates/status/776201521...,10,10,O,,,,
2041,671542985629241344,,,2015-12-01 04:14:59 +0000,"<a href=""http://twitter.com/download/iphone"" r...","This is JD (stands for ""just dog""). He's like ...",,,,https://twitter.com/dog_rates/status/671542985...,10,10,JD,,,,


In [17]:
# Visual assessment: five randomly chosen rows of the API data (differs on every run)
twitter2.sample(5)

Unnamed: 0,tweet_id,retweet_count,favorite_count,display_text_range
7,890729181411237888,18209,63409,"[0, 118]"
1215,712809025985978368,7038,19137,"[0, 63]"
709,782747134529531904,1506,7893,"[0, 89]"
1550,687818504314159109,1002,2584,"[0, 75]"
1720,679511351870550016,1332,3490,"[0, 133]"


In [11]:
# Column dtypes and non-null counts of the API data
twitter2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2337 entries, 0 to 2336
Data columns (total 4 columns):
tweet_id              2337 non-null int64
retweet_count         2337 non-null int64
favorite_count        2337 non-null int64
display_text_range    2337 non-null object
dtypes: int64(3), object(1)
memory usage: 73.1+ KB


In [12]:
# Occurrences of each tweet_id in the API data; a count above 1 would indicate a duplicate
twitter2['tweet_id'].value_counts()

749075273010798592    1
741099773336379392    1
798644042770751489    1
825120256414846976    1
769212283578875904    1
700462010979500032    1
780858289093574656    1
699775878809702401    1
880095782870896641    1
760521673607086080    1
776477788987613185    1
691820333922455552    1
715696743237730304    1
714606013974974464    1
760539183865880579    1
813157409116065792    1
676430933382295552    1
743510151680958465    1
833722901757046785    1
805487436403003392    1
759923798737051648    1
674742531037511680    1
670704688707301377    1
667160273090932737    1
674394782723014656    1
672082170312290304    1
670093938074779648    1
818259473185828864    1
842765311967449089    1
809920764300447744    1
                     ..
720775346191278080    1
879492040517615616    1
785927819176054784    1
834209720923721728    1
857062103051644929    1
825026590719483904    1
758405701903519748    1
668986018524233728    1
690938899477221376    1
678800283649069056    1
6815798356684554

In [13]:
# How often each favorite_count value occurs
twitter2['favorite_count'].value_counts()

0        166
2163       3
1432       3
4119       3
2364       3
1193       3
235        3
2738       3
1121       3
1429       3
121        3
3038       3
2871       3
2008       3
16348      2
2688       2
1653       2
2276       2
2314       2
243        2
897        2
251        2
2803       2
3212       2
3214       2
1604       2
304        2
11489      2
8554       2
615        2
        ... 
1564       1
4739       1
1741       1
27258      1
19010      1
21059      1
2628       1
2630       1
2632       1
7306       1
589        1
6734       1
6736       1
4689       1
17183      1
2822       1
4697       1
35418      1
603        1
2652       1
4701       1
4045       1
2267       1
4713       1
2668       1
4721       1
4723       1
2114       1
12919      1
8231       1
Name: favorite_count, Length: 1997, dtype: int64

In [14]:
# How often each retweet_count value occurs
twitter2['retweet_count'].value_counts()

477     6
245     5
501     5
373     4
57      4
1038    4
2775    4
443     4
2112    4
555     4
486     4
464     4
530     4
88      4
222     4
46      4
925     3
3618    3
399     3
379     3
4459    3
347     3
1451    3
137     3
217     3
127     3
113     3
105     3
4195    3
97      3
       ..
1874    1
7270    1
497     1
1890    1
8039    1
1123    1
1902    1
3951    1
707     1
3963    1
3965    1
3907    1
1133    1
1850    1
1824    1
3841    1
3845    1
7949    1
1137    1
435     1
1135    1
3873    1
1846    1
1826    1
1830    1
3879    1
5928    1
1838    1
1842    1
3929    1
Name: retweet_count, Length: 1707, dtype: int64

In [15]:
# Visual assessment: five randomly chosen rows of the image predictions (differs on every run)
images.sample(5)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
372,672984142909456390,https://pbs.twimg.com/media/CVbrcZyVAAA5Wpq.jpg,1,wombat,0.73878,False,beaver,0.133368,False,wallaby,0.03237,False
257,670778058496974848,https://pbs.twimg.com/media/CU8VFhuVAAAQW8B.jpg,1,pug,0.776612,True,Brabancon_griffon,0.112032,True,boxer,0.039051,True
346,672466075045466113,https://pbs.twimg.com/media/CVUUU_EWoAAxABV.jpg,1,cocker_spaniel,0.150424,True,toy_poodle,0.088605,True,Welsh_springer_spaniel,0.072014,True
812,692530551048294401,https://pbs.twimg.com/media/CZxc3G7WEAAM4Mv.jpg,1,Siberian_husky,0.486428,True,Eskimo_dog,0.448518,True,white_wolf,0.041506,False
64,667171260800061440,https://pbs.twimg.com/media/CUJEuRIXIAAPDLt.jpg,1,giant_schnauzer,0.841265,True,Lakeland_terrier,0.052744,True,Irish_water_spaniel,0.034402,True


In [16]:
# Column dtypes and non-null counts of the image predictions
images.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [17]:
# Occurrences of each tweet_id in the image predictions; a count above 1 would indicate a duplicate
images['tweet_id'].value_counts()

685532292383666176    1
826598365270007810    1
692158366030913536    1
714606013974974464    1
715696743237730304    1
776477788987613185    1
772114945936949249    1
699775878809702401    1
780858289093574656    1
700462010979500032    1
732726085725589504    1
738883359779196928    1
798644042770751489    1
743510151680958465    1
837012587749474308    1
833722901757046785    1
668620235289837568    1
842765311967449089    1
685315239903100929    1
673686845050527744    1
680473011644985345    1
666051853826850816    1
675853064436391936    1
693231807727280129    1
705475953783398401    1
829449946868879360    1
759923798737051648    1
667160273090932737    1
680934982542561280    1
743545585370791937    1
                     ..
794926597468000259    1
776113305656188928    1
825026590719483904    1
834209720923721728    1
775733305207554048    1
669564461267722241    1
879492040517615616    1
720775346191278080    1
666362758909284353    1
750506206503038976    1
6931556864910008

In [18]:
# Occurrences of each image URL; a count above 1 means the same image is attached to several rows
images['jpg_url'].value_counts()

https://pbs.twimg.com/media/CzG425nWgAAnP7P.jpg                                            2
https://pbs.twimg.com/media/C12x-JTVIAAzdfl.jpg                                            2
https://pbs.twimg.com/media/CxqsX-8XUAAEvjD.jpg                                            2
https://pbs.twimg.com/tweet_video_thumb/CeBym7oXEAEWbEg.jpg                                2
https://pbs.twimg.com/media/CtVAvX-WIAAcGTf.jpg                                            2
https://pbs.twimg.com/media/C2oRbOuWEAAbVSl.jpg                                            2
https://pbs.twimg.com/media/CdHwZd0VIAA4792.jpg                                            2
https://pbs.twimg.com/media/CmoPdmHW8AAi8BI.jpg                                            2
https://pbs.twimg.com/media/Cwx99rpW8AMk_Ie.jpg                                            2
https://pbs.twimg.com/ext_tw_video_thumb/675354114423808004/pu/img/qL1R_nGLqa6lmkOx.jpg    2
https://pbs.twimg.com/media/CW88XN4WsAAlo8r.jpg                       

In [19]:
# Distribution of the img_num values
images['img_num'].value_counts()

1    1780
2     198
3      66
4      31
Name: img_num, dtype: int64

In [18]:
# Frequency of each label in the p1 (first prediction) column
images['p1'].value_counts()

golden_retriever             150
Labrador_retriever           100
Pembroke                      89
Chihuahua                     83
pug                           57
chow                          44
Samoyed                       43
toy_poodle                    39
Pomeranian                    38
malamute                      30
cocker_spaniel                30
French_bulldog                26
Chesapeake_Bay_retriever      23
miniature_pinscher            23
seat_belt                     22
Staffordshire_bullterrier     20
German_shepherd               20
Siberian_husky                20
web_site                      19
Cardigan                      19
teddy                         18
Maltese_dog                   18
beagle                        18
Eskimo_dog                    18
Shetland_sheepdog             18
Lakeland_terrier              17
Shih-Tzu                      17
Rottweiler                    17
Italian_greyhound             16
kuvasz                        16
          

In [21]:
# How often each p1_conf value occurs
images['p1_conf'].value_counts()

0.366248    2
0.713293    2
0.375098    2
0.636169    2
0.611525    2
0.420463    2
0.581403    2
0.403698    2
0.530104    2
0.254856    2
0.346545    2
0.721188    2
0.677408    2
0.907083    2
0.243529    2
0.505370    2
0.593858    2
0.372202    2
0.274637    2
0.600276    2
0.506312    2
0.615163    2
0.556595    2
0.995143    2
0.809197    2
0.964929    2
0.777468    2
0.336200    2
0.617389    2
0.786089    2
           ..
0.483228    1
0.556524    1
0.176423    1
0.318981    1
0.733025    1
0.730152    1
0.436023    1
0.479008    1
0.162935    1
0.897162    1
0.320420    1
0.999833    1
0.995873    1
0.523206    1
0.942911    1
0.537652    1
0.672791    1
0.952258    1
0.855959    1
0.665578    1
0.841265    1
0.668164    1
0.946828    1
0.714719    1
0.352946    1
0.713102    1
0.765266    1
0.491022    1
0.905334    1
1.000000    1
Name: p1_conf, Length: 2006, dtype: int64

In [22]:
# Number of rows with p1_dog True versus False
images['p1_dog'].value_counts()

True     1532
False     543
Name: p1_dog, dtype: int64

In [23]:
# Frequency of each label in the p2 (second prediction) column
images['p2'].value_counts()

Labrador_retriever                104
golden_retriever                   92
Cardigan                           73
Chihuahua                          44
Pomeranian                         42
French_bulldog                     41
Chesapeake_Bay_retriever           41
toy_poodle                         37
cocker_spaniel                     34
miniature_poodle                   33
Siberian_husky                     33
beagle                             28
Eskimo_dog                         27
collie                             27
Pembroke                           27
kuvasz                             26
Italian_greyhound                  22
Pekinese                           21
American_Staffordshire_terrier     21
toy_terrier                        20
Samoyed                            20
chow                               20
malinois                           20
miniature_pinscher                 20
Boston_bull                        19
Norwegian_elkhound                 19
Staffordshir

In [24]:
# How often each p2_conf value occurs
images['p2_conf'].value_counts()

0.069362    3
0.027907    2
0.193654    2
0.271929    2
0.003143    2
0.197021    2
0.347609    2
0.151047    2
0.052724    2
0.153126    2
0.119256    2
0.227150    2
0.057091    2
0.149950    2
0.025119    2
0.165930    2
0.190503    2
0.012763    2
0.181351    2
0.325106    2
0.020089    2
0.172844    2
0.142204    2
0.152445    2
0.052956    2
0.099984    2
0.093940    2
0.252706    2
0.140798    2
0.130611    2
           ..
0.083513    1
0.100988    1
0.038062    1
0.317368    1
0.256433    1
0.057883    1
0.098354    1
0.250014    1
0.088474    1
0.178088    1
0.053008    1
0.052396    1
0.165655    1
0.182538    1
0.074962    1
0.120530    1
0.169758    1
0.119745    1
0.090938    1
0.071665    1
0.139346    1
0.036575    1
0.073101    1
0.118181    1
0.000077    1
0.138331    1
0.254884    1
0.090644    1
0.219323    1
0.016301    1
Name: p2_conf, Length: 2004, dtype: int64

In [25]:
# Number of rows with p2_dog True versus False
images['p2_dog'].value_counts()

True     1553
False     522
Name: p2_dog, dtype: int64

In [19]:
# Frequency of each label in the p3 (third prediction) column
images['p3'].value_counts()

Labrador_retriever                79
Chihuahua                         58
golden_retriever                  48
Eskimo_dog                        38
kelpie                            35
kuvasz                            34
chow                              32
Staffordshire_bullterrier         32
cocker_spaniel                    31
beagle                            31
Pomeranian                        29
toy_poodle                        29
Pekinese                          29
Chesapeake_Bay_retriever          27
Great_Pyrenees                    27
Pembroke                          27
malamute                          26
French_bulldog                    26
American_Staffordshire_terrier    24
Cardigan                          23
pug                               23
basenji                           21
toy_terrier                       20
bull_mastiff                      20
Siberian_husky                    19
Boston_bull                       17
Shetland_sheepdog                 17
b

In [None]:
# How often each p3_conf value occurs
images['p3_conf'].value_counts()

In [None]:
# Number of rows with p3_dog True versus False
images['p3_dog'].value_counts()

### Listed Quality Issues:

#### Dataframe twitter1:<br>
1.Data contains retweets (i.e. rows where retweeted_status_id and retweeted_status_user_id have a number instead of NaN)<br>
2.tweet_id is an integer<br>
3.timestamp and retweeted_status_timestamp are currently of type 'object'<br>
4.name has values that are the string "None" instead of NaN<br>
5.Some names are inaccurate such as "a", "an", "the", "very", "by", etc. Looking visually in Excel, I was able to find more names that are inaccurate including "actually", "quite", "unacceptable", "mad", "not" and "old". It seems like the method used to extract the names took the word that followed "This is..." and "Here is...", which leads to some inaccuracies.<br>
6.Found an instance of a name being "O" instead of "O'Malley"<br>
7.doggo, floofer, pupper, and puppo have values that are the string "None" instead of NaN<br>
8.Upon visual inspection in Excel, there are ratings that are incorrect. I ordered the ratings from low to high and looked only at the extremes for incorrect ratings; therefore there are likely more that I missed, and it will be difficult to find them all programmatically. Examples of where things may have gone wrong are the use of decimals, or when two instances of numbers separated by a slash are present in 1 text and I assume the first was chosen. Also, ratings with decimals such as 13.5/10 and 9.5/10 have been incorrectly extracted as 5/10 (in addition to other numbers with decimals such as 11.26 and 11.27). There are instances of 1/2 and 50/50, which signify "half" rather than a rating, that have been treated as ratings. Finally, uses of 4/20 and 24/7 have been mistaken for ratings.<br>
9.There are many columns in this dataframe making it hard to read, and some will not be needed for analysis.<br>


#### Dataframe twitter2:<br>

There are 11 missing tweets compared to the twitter1 dataframe (they might have been deleted)

#### images Dataframe

1.There are 2356 tweets in the twitter1 dataframe and 2075 rows in the images dataframe. This could mean that there is missing data, or that not all 2356 of the tweets had pictures.<br>
2.tweet_id is an integer<br>
3.p1, p2, and p3 contain underscores instead of spaces in the labels<br>

### Listed Tidiness Issues:

#### Dataframe twitter1: 


1 variable (dog stage) in 4 different columns (doggo, floofer, pupper, and puppo)

#### Dataframe twitter2:

twitter2 data should be combined with the twitter1 data since they are information about the same tweet

#### Images Dataframe:

images data could be combined with the twitter1 data as well since it is all information about 1 tweet

# Cleaning Data

In [20]:
# Work on independent copies so the originally gathered dataframes stay untouched
twitter1_clean, twitter2_clean, images_clean = (
    source.copy() for source in (twitter1, twitter2, images)
)

### Tidiness issue 1 - Merge all dataframes

#### Issue: 
twitter2 and images data should be combined with the twitter1 data since they are information about the same tweet<br>
#### Solution:
Merge the 3 datasets together on 'tweet_id' using the reduce function.

#### Code:

In [23]:
# Join the three cleaned dataframes one after another on 'tweet_id'
# (pd.merge's default inner join: only tweets present in every frame remain)
dfs = [twitter1_clean, twitter2_clean, images_clean]
twitter = dfs[0]
for other in dfs[1:]:
    twitter = pd.merge(twitter, other, on='tweet_id')

#### Test:

In [24]:
# Show the first two rows of the merged dataframe to confirm that the
# columns from all sources are present
twitter.iloc[:2]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,...,1,orange,0.097049,False,bagel,0.085851,False,banana,0.07611,False
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,...,1,Chihuahua,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True


In [25]:
# Print the merged dataframe's summary (row count, column names, non-null
# counts and dtypes) to confirm the columns have merged
twitter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2065 entries, 0 to 2064
Data columns (total 31 columns):
tweet_id                      2065 non-null int64
in_reply_to_status_id         23 non-null float64
in_reply_to_user_id           23 non-null float64
timestamp                     2065 non-null object
source                        2065 non-null object
text                          2065 non-null object
retweeted_status_id           75 non-null float64
retweeted_status_user_id      75 non-null float64
retweeted_status_timestamp    75 non-null object
expanded_urls                 2065 non-null object
rating_numerator              2065 non-null int64
rating_denominator            2065 non-null int64
name                          2065 non-null object
doggo                         2065 non-null object
floofer                       2065 non-null object
pupper                        2065 non-null object
puppo                         2065 non-null object
retweet_count                 2065 

### Tidiness issue 2 - One column for dog stages

#### Define:
Merging the doggo, floofer, pupper and puppo columns to a stage column. Convert the datatype from string to categorical as it helps with analysis.

#### Code:

#### Issue:
1 variable (dog stage) in 4 different columns (doggo, floofer, pupper, and puppo)<br> 
#### Solution: 
doggo, floofer, pupper and puppo columns in twitter table should be merged into one column named "stage"

#### Code:

In [26]:

# Merge the doggo, floofer, pupper and puppo columns into one 'stage' column.
# Each source column holds either its own stage name or the placeholder string
# 'None'. Taking the string max() of a row only works by accident ('None'
# sorts before the lowercase stage names) and silently discards all but the
# alphabetically last stage when a tweet mentions several. Instead, collect
# every stage present in the row, join multiples with ', ', and use NaN when
# no stage is given.
stage_columns = ['doggo', 'floofer', 'pupper', 'puppo']
twitter['stage'] = (
    twitter[stage_columns]
    .apply(lambda row: ', '.join(value for value in row
                                 if isinstance(value, str) and value != 'None'),
           axis=1)
    .replace('', np.nan)
)

In [27]:
# Store 'stage' as a categorical column
twitter['stage'] = twitter['stage'].astype('category')

In [28]:
# The four per-stage columns are now redundant with 'stage'; remove them
redundant_stage_columns = ['doggo', 'floofer', 'pupper', 'puppo']
twitter = twitter.drop(redundant_stage_columns, axis=1)

#### Test:

In [29]:
# Check for presence of 'stage' and absence of 'doggo', 'floofer',
# 'pupper', and 'puppo'
twitter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2065 entries, 0 to 2064
Data columns (total 28 columns):
tweet_id                      2065 non-null int64
in_reply_to_status_id         23 non-null float64
in_reply_to_user_id           23 non-null float64
timestamp                     2065 non-null object
source                        2065 non-null object
text                          2065 non-null object
retweeted_status_id           75 non-null float64
retweeted_status_user_id      75 non-null float64
retweeted_status_timestamp    75 non-null object
expanded_urls                 2065 non-null object
rating_numerator              2065 non-null int64
rating_denominator            2065 non-null int64
name                          2065 non-null object
retweet_count                 2065 non-null int64
favorite_count                2065 non-null int64
display_text_range            2065 non-null object
jpg_url                       2065 non-null object
img_num                       2065 no

### Quality issue 1 - Remove retweets:

#### Define:

#### Issue: 
Data contains retweets<br>
#### Solution: 
Rows where 'retweeted_status_id' is a NaN will be kept (ie. if it has a value it will be removed), then the 'retweeted_status_id', 'retweeted_status_user_id' and 'retweeted_status_timestamp' columns will be removed.

#### Code:

In [None]:
# Keep only the rows where 'retweeted_status_id' is NaN, i.e. drop retweets
twitter = twitter[twitter['retweeted_status_id'].isnull()]

# Show info to confirm there are no non-null entries left for
# 'retweeted_status_id', 'retweeted_status_user_id' and
# 'retweeted_status_timestamp'. info() prints its report itself and returns
# None, so it is called directly instead of being wrapped in print()
# (which would add a stray "None" line to the output).
twitter.info()

# Remove the now-empty 'retweeted_status_id', 'retweeted_status_user_id' and
# 'retweeted_status_timestamp' columns
twitter = twitter.drop(['retweeted_status_id',
                        'retweeted_status_user_id',
                        'retweeted_status_timestamp'],
                       axis=1)

#### Test:

In [None]:
# Print the dataframe summary to confirm that the 'retweeted_status_id',
# 'retweeted_status_user_id' and 'retweeted_status_timestamp' columns
# have been dropped
twitter.info()

### Quality issue 2 - tweet_id is an integer:

#### Define:

#### Issue: 
tweet_id is an integer<br>
#### Solution: 
Change tweet_id to a string/object

#### Code:

In [None]:
# Tweet IDs are identifiers, not quantities: store them as strings
twitter = twitter.astype({'tweet_id': str})

#### Test:

In [None]:
# Print the dataframe summary to confirm that 'tweet_id' has been changed
# from an integer to a string/object column
twitter.info()

### Quality issue 3 - 'timestamp' type:

#### Define:

#### Issue: 
'timestamp' and 'retweeted_status_timestamp' are currently of type 'object'<br>
#### Solution:
Convert 'timestamp' to a datetime object ('retweeted_status_timestamp' was deleted)

#### Code:

In [None]:
# Remove the time zone information from the 'timestamp' column.
# The raw values look like '2017-08-01 16:23:56 +0000'; the date-time part is
# always the first 19 characters. Slicing to a fixed length (rather than
# cutting the last 6 characters) gives the same result on the raw data and
# keeps this cell idempotent: re-running it does not chop off the seconds.
twitter['timestamp'] = twitter['timestamp'].str.slice(stop=19)

In [None]:
# Parse the 'timestamp' strings into datetime values
twitter = twitter.assign(
    timestamp=pd.to_datetime(twitter['timestamp'], format="%Y-%m-%d %H:%M:%S")
)

#### Test:

In [None]:
# Print the dataframe summary to confirm that the 'timestamp' column is now
# a datetime column
twitter.info()

### Quality issue 4/5/6 - Various issues with 'name':

#### Define:

#### Issue:

1.name has values that are the string "None" instead of NaN<br>
2.Looking programmatically, some names are inaccurate such as "a", "an", "the", "very", "by", etc. Looking visually in Excel, I was able to find more names that are inaccurate including "actually", "quite", "unacceptable", "mad", "not" and "old". It seems like the method used to extract the names took the word that followed "This is..." or "Here is...", which leads to some inaccuracies.<br>
3.I also found an instance of a name being "O" instead of "O'Malley"<br> 
#### Solution: 
Replace all inaccurate names with NaNs, and the "O" with "O'Malley".<br>

#### Code:

In [None]:
# Find all distinct names that start with a lowercase letter, in order of
# first appearance. Non-string entries (e.g. NaN) and empty strings are
# skipped so the cell also works when re-run after the names were cleaned;
# dict.fromkeys de-duplicates while preserving order without the repeated
# list-membership scan.
lowercase_names = list(dict.fromkeys(
    name for name in twitter['name']
    if isinstance(name, str) and name[:1].islower()
))
print(lowercase_names)

In [None]:
# Replace all names that start with a lowercase letter with a NaN
twitter['name'].replace(lowercase_names, 
                        np.nan,
                       inplace = True)

# Replace all 'None's with a NaN
twitter['name'].replace('None', 
                        np.nan,
                       inplace = True)

# Replace the name 'O' with "O'Malley" 
twitter['name'].replace('O', 
                        "O'Malley",
                       inplace = True)

#### Test:

In [None]:
# Tabulate the remaining names to verify that 'None' and the names starting
# with a lowercase letter are gone
twitter.loc[:, 'name'].value_counts()

### Quality issue 7 - Inaccurate ratings:

#### Define:

#### Issue:
Some ratings with decimals such as 13.5/10 and 9.5/10 have been incorrectly extracted as 5/10 (in addition to other numbers with decimals such as 11.26 and 11.27).<br>

#### Solution: 
Find all instances of ratings that contained decimals and replace the numerator values with the correct values.

#### Code:

In [None]:
# Obtain all text, indices, and ratings for tweets that contain a decimal
# in the numerator of the rating.
# The pattern matches "<digits>.<digits>/<digits>" (e.g. "13.5/10") and
# captures the numerator in group 1. Reading the numerator from the same
# match guarantees it belongs to the rating; searching the text separately
# for the first decimal number could pick up an unrelated value that appears
# earlier in the tweet.
decimal_rating_pattern = re.compile(r'(\d+\.\d+)/\d+')

ratings_with_decimals_text = []
ratings_with_decimals_index = []
ratings_with_decimals = []

for i, text in twitter['text'].items():
    rating_match = decimal_rating_pattern.search(text)
    if rating_match:
        ratings_with_decimals_text.append(text)
        ratings_with_decimals_index.append(i)
        ratings_with_decimals.append(rating_match.group(1))

# Display the text to confirm presence of ratings with decimals
ratings_with_decimals_text


In [None]:
# Display the index labels of the tweets whose text contains a decimal rating
ratings_with_decimals_index

In [None]:
# Change contents of 'rating_numerator' based on info found in 
# preceeding 2 cells
twitter.loc[ratings_with_decimals_index[0],'rating_numerator'] = float(ratings_with_decimals[0])
twitter.loc[ratings_with_decimals_index[1],'rating_numerator'] = float(ratings_with_decimals[1])
twitter.loc[ratings_with_decimals_index[2],'rating_numerator'] = float(ratings_with_decimals[2])
twitter.loc[ratings_with_decimals_index[3],'rating_numerator'] = float(ratings_with_decimals[3])

#### Test:

In [None]:
# Show the text next to the stored rating for every tweet that had a decimal
# rating, so each correction can be checked (a single hard-coded row label
# covers only one of them and fails if that label is not in the dataframe)
twitter.loc[ratings_with_decimals_index,
            ['text', 'rating_numerator', 'rating_denominator']]

### Quality issue 8 - Unstandardized ratings:

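#### Define:

#### Issue:
Ratings do not all share the same denominator, so the numerators alone cannot be compared across tweets.<br>
#### Solution:
Add a 'rating' column equal to 'rating_numerator' divided by 'rating_denominator'.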

#### Code:

In [None]:
# Calculate the standardized 'rating' as numerator divided by denominator
twitter['rating'] = twitter['rating_numerator'].div(twitter['rating_denominator'])

#### Test:

In [None]:
# Show the first row to confirm the new 'rating' column exists and holds the
# expected value
twitter.iloc[:1]

### Storing Dataframe:

In [None]:
# Persist the cleaned dataframe (including its index) as a CSV file
twitter.to_csv(path_or_buf='twitter_archive_master.csv')

### Analysis and Visualization:

In [None]:
# Scatterplot of favorite count against retweet count with a fitted
# regression line; points are semi-transparent so dense regions stand out
sns.lmplot(data=twitter,
           x="retweet_count",
           y="favorite_count",
           size=5,
           aspect=1.3,
           scatter_kws=dict(alpha=1/5))
plt.xlabel('Retweet Count')
plt.ylabel('Favorite Count')
plt.title('Favorite vs. Retweet Count');

Favorite and retweet counts are highly positively correlated. For about every 4 favorites there is 1 retweet. The majority of the data falls below 40000 favorites and 10000 retweets.

### Retweet Counts:

In [None]:
# Summary statistics of the retweet counts
twitter['retweet_count'].describe()

In [None]:
def set_my_palette(palette=None):
    """Reset seaborn's styling and activate a custom colour palette.

    Parameters
    ----------
    palette : sequence of colours, optional
        Colours to pass to ``sns.color_palette``. When omitted, the
        module-level ``my_palette`` is used, so existing calls without
        arguments behave as before (and still require ``my_palette`` to be
        defined before the call).
    """
    if palette is None:
        # Fall back to the notebook-level palette for backward compatibility
        palette = my_palette
    sns.set()
    sns.set_palette(sns.color_palette(palette))

In [None]:
# Define the notebook's colour palette, activate it, then plot the overall
# distribution of retweet counts
my_palette = [
    '#66b3ff', '#00cc99', '#ff6666',
    '#ffff66', '#8c66ff', '#66ffd9',
]
set_my_palette()
twitter['retweet_count'].hist();

In [None]:
# Distribution of retweet counts, restricted to tweets with at most 20000 retweets
twitter.loc[twitter['retweet_count'] <= 20000, 'retweet_count'].hist();

In [None]:
# Distribution of retweet counts, restricted to tweets with at most 2500 retweets
twitter.loc[twitter['retweet_count'] <= 2500, 'retweet_count'].hist();

### Favorites Count:

In [None]:
# Summary statistics of the favorite counts
twitter['favorite_count'].describe()

In [None]:
# Overall distribution of favorite counts
twitter['favorite_count'].hist();

In [None]:
# Distribution of favorite counts, restricted to tweets with at most 40000 favorites
twitter.loc[twitter['favorite_count'] <= 40000, 'favorite_count'].hist();

In [None]:
# Distribution of favorite counts, restricted to tweets with at most 5000 favorites
twitter.loc[twitter['favorite_count'] <= 5000, 'favorite_count'].hist();

In [None]:
# Three tweets have a standardized rating above 2.
# List them to judge whether they are extraction errors or genuine outliers.
twitter[twitter['rating'].gt(2)]

The first instance is a rating of 24/7, which is inaccurate; I found it when looking for quality errors but decided not to fix it. If we were required to fix all problems then this tweet could be removed, but I will ignore it in this case. The other 2 tweets are accurate ratings that are jokes. I am more interested in whether the tweets with a standardized rating below 1 decrease over time, so I will limit the y axis to the range 0 to 2.

In [None]:
# Plot the standardized rating over time: the mean rating of the tweets that
# share a timestamp, drawn as a line. The y axis is limited to 0-2 so the few
# extreme outliers do not flatten the rest of the curve.
twitter.groupby('timestamp')['rating'].mean().plot(kind='line', figsize=[15,6])
plt.ylim(0, 2)
plt.title('Rating over Time')
plt.xlabel('Time')
plt.ylabel('Standardized Rating')
# plt.show must be called; the bare attribute access `plt.show` does nothing
plt.show();

Indeed, it appears that over time the frequency of ratings below 1 decreases. Before 2016-11 there were many ratings below 1, while after that time there are barely any.