# Gather

#### Download the file 'twitter-archive-enhanced.csv' from the link [twitter_archive_enhanced.csv](https://d17h27t6h515a5.cloudfront.net/topher/2017/August/59a4e958_twitter-archive-enhanced/twitter-archive-enhanced.csv)

In [1]:
import pandas as pd

In [2]:
# Load data frame with twitter-archive-enhanced.csv
tweet_archive_data = pd.read_csv('twitter-archive-enhanced.csv')
tweet_archive_data.head(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,


#### Download the image_predictions.tsv programmatically from this [url](https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv)

In [3]:
import requests
import os

In [4]:
# download the file from the below url
url ='https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
response

<Response [200]>

In [5]:
with open(url.split('/')[-1], mode='wb') as file:

    file.write(response.content)

In [6]:
os.listdir(os.curdir)

['.DS_Store',
 '.git',
 '.ipynb_checkpoints',
 'act_report.html',
 'act_report.ipynb',
 'image-predictions.tsv',
 'README.md',
 'tweet_archive_data_clean.csv',
 'tweet_count_data_clean.csv',
 'tweet_image_data_clean.csv',
 'tweet_json.txt',
 'twitter-archive-enhanced.csv',
 'twitter_archive_master.csv',
 'wrangle_act.html',
 'wrangle_act.ipynb']

In [4]:
# Load dataframe with image-predictions.tsv 
tweet_image_data = pd.read_csv('image-predictions.tsv', sep = '\t')
tweet_image_data.head(2)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True


#### Gather tweet and retweet counts  for the tweet ids in the archive dataset

In [8]:
# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# Keys are hidden to comply with Twitter's API terms and conditions
# Two rerun this replace the hidden with your own keys after subscribing to twitter developer account
import tweepy

consumer_key = 'Hidden-Key'
consumer_secret = 'Hidden-Key'
access_token = 'Hidden-Key'
access_secret = 'Hidden-Key'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True)

In [5]:
import json
from timeit import default_timer as timer

In [6]:
# Tweet ids in the twitter-archive-enhanced.csv file
tweet_ids = tweet_archive_data.tweet_id.values
len(tweet_ids)

2356

In [11]:
# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
count = 0
fails_dict = {}
start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    for tweet_id in tweet_ids:
        count += 1
        # print(str(count) + ": " + str(tweet_id))
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            # print("Success")
            json.dump(tweet._json, outfile)
            outfile.write('\n')
        except tweepy.TweepError as e:
            # print("Fail")
            fails_dict[tweet_id] = e
            pass
end = timer()
print(end - start)
print(fails_dict)

1891.9851507429994
{888202515573088257: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 873697596434513921: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 872668790621863937: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 869988702071779329: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 866816280283807744: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 861769973181624320: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 845459076796616705: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 842892208864923648: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 837012587749474308: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 827228250799742977: TweepError([{'code': 144, 'message': 'No status found with that ID.'}],), 812747805718642688: TweepError([{'code':

In [12]:
# how many twitter ids the data retreival failed
len(fails_dict)

16

In [7]:
# read json data from tweet_json.txt and load into a dataframe
tweet_count_list = []
with open('tweet_json.txt') as json_file:
    for line in json_file:
        json_data = json.loads(line)
        tweet_id = json_data['id']
        retweet_count = json_data['retweet_count']
        favorite_count = json_data['favorite_count']
        tweet_count_list.append({'tweet_id': tweet_id,
                        'retweet_count': int(retweet_count),
                        'favorite_count': int(favorite_count)
                        })



tweet_count_data = pd.DataFrame(tweet_count_list, columns = ['tweet_id', 'retweet_count', 'favorite_count'])
        

In [8]:
tweet_count_data.head(2)

Unnamed: 0,tweet_id,retweet_count,favorite_count
0,892420643555336193,8257,37848
1,892177421306343426,6101,32523


# Assess

### Visual Assessment
#### For tweet_archive_data

In [9]:
tweet_archive_data

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,
5,891087950875897856,,,2017-07-29 00:08:17 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a majestic great white breaching ...,,,,https://twitter.com/dog_rates/status/891087950...,13,10,,,,,
6,890971913173991426,,,2017-07-28 16:27:12 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Jax. He enjoys ice cream so much he gets ...,,,,"https://gofundme.com/ydvmve-surgery-for-jax,ht...",13,10,Jax,,,,
7,890729181411237888,,,2017-07-28 00:22:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",When you watch your owner call another dog a g...,,,,https://twitter.com/dog_rates/status/890729181...,13,10,,,,,
8,890609185150312448,,,2017-07-27 16:25:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Zoey. She doesn't want to be one of th...,,,,https://twitter.com/dog_rates/status/890609185...,13,10,Zoey,,,,
9,890240255349198849,,,2017-07-26 15:59:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Cassie. She is a college pup. Studying...,,,,https://twitter.com/dog_rates/status/890240255...,14,10,Cassie,doggo,,,


In [10]:
tweet_archive_data.head(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [11]:
tweet_archive_data.tail(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
2351,666049248165822465,,,2015-11-16 00:24:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a 1949 1st generation vulpix. Enj...,,,,https://twitter.com/dog_rates/status/666049248...,5,10,,,,,
2352,666044226329800704,,,2015-11-16 00:04:52 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a purebred Piers Morgan. Loves to Netf...,,,,https://twitter.com/dog_rates/status/666044226...,6,10,a,,,,
2353,666033412701032449,,,2015-11-15 23:21:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here is a very happy pup. Big fan of well-main...,,,,https://twitter.com/dog_rates/status/666033412...,9,10,a,,,,
2354,666029285002620928,,,2015-11-15 23:05:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a western brown Mitsubishi terrier. Up...,,,,https://twitter.com/dog_rates/status/666029285...,7,10,a,,,,
2355,666020888022790149,,,2015-11-15 22:32:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a Japanese Irish Setter. Lost eye...,,,,https://twitter.com/dog_rates/status/666020888...,8,10,,,,,


In [12]:
tweet_archive_data.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
1782,677687604918272002,,,2015-12-18 03:11:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This was Cindy's face when she heard Susan for...,,,,https://twitter.com/dog_rates/status/677687604...,11,10,,,,,
944,752682090207055872,,,2016-07-12 01:52:49 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Nothing better than a doggo and a sunset. 10/1...,,,,https://twitter.com/dog_rates/status/752682090...,10,10,,doggo,,,
507,812709060537683968,,,2016-12-24 17:18:34 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Brandi and Harley. They are practicing...,,,,https://twitter.com/dog_rates/status/812709060...,12,10,Brandi,,,,
461,817536400337801217,,,2017-01-07 01:00:41 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to Eugene &amp; Patti Melt. No matte...,,,,https://twitter.com/dog_rates/status/817536400...,12,10,Eugene,,,,
1926,674053186244734976,,,2015-12-08 02:29:37 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Stanley. Yes he is aware of the spoon'...,,,,https://twitter.com/dog_rates/status/674053186...,10,10,Stanley,,,,


##### Upon just visual observation we can see that
##### 1. we have 17 observations in this data set
##### 2. There are NaN values in in_reply_to_status_id , in_reply_to_user_id , retweeted_status_id, retweeted_status_user_id and retweeted_status_timestamp columns. 
##### 3. Column "name" seem to represent "None" values and incorrect values like " a,the, an, etc"
##### 4. Some records have a denominator value different from 10
##### 5. Last four columns seem to be holding dog breed type and this is structurally untidy. 

### Programmatic Assessment
#### For tweet_archive_data

In [13]:
tweet_archive_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

##### tweet_archive_data has 17 columns and 2356 observations, tweet_id seems to be the unique Id
##### in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id , retweeted_status_user_id, retweeted_status_timestamp columns has very few non null values.

##### timestamp column is not represented as date.

##### Let's see if there are any duplicate records based on tweet_id

In [14]:
tweet_archive_data[tweet_archive_data.tweet_id.duplicated()]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


#### No duplicates in the table for tweet_id

In [15]:
tweet_archive_data.tweet_id.nunique()

2356

##### tweet_id is the unique id and there are 2356 unique tweet_ids for 2356 observations in the tweet_archive_data 


##### Let's take a closer look at columns where we have majority NaN Values

In [16]:
tweet_archive_data.isnull().sum()

tweet_id                         0
in_reply_to_status_id         2278
in_reply_to_user_id           2278
timestamp                        0
source                           0
text                             0
retweeted_status_id           2175
retweeted_status_user_id      2175
retweeted_status_timestamp    2175
expanded_urls                   59
rating_numerator                 0
rating_denominator               0
name                             0
doggo                            0
floofer                          0
pupper                           0
puppo                            0
dtype: int64

##### in_reply_to_status_id  has 2278 NaN
##### in_reply_to_user_id has 2278 NaN
##### retweeted_status_id has 2175 NaN
##### retweeted_status_user_id 2175 NaN
##### retweeted_status_timestamp 2175 NaN
##### Since these columns have majority values as Null these columns and we currently do  not have a way to find the missing values we will be  deleting these columns from the data frame.


##### Let's take a closer look at the values populated in the "name" column

In [17]:
tweet_archive_data.name.sample(2)

847     Colby
1526    Phred
Name: name, dtype: object

In [18]:
tweet_archive_data.name.value_counts()

None        745
a            55
Charlie      12
Lucy         11
Cooper       11
Oliver       11
Lola         10
Tucker       10
Penny        10
Bo            9
Winston       9
Sadie         8
the           8
Daisy         7
an            7
Buddy         7
Toby          7
Bailey        7
Rusty         6
Stanley       6
Koda          6
Dave          6
Jax           6
Bella         6
Leo           6
Scout         6
Oscar         6
Jack          6
Milo          6
Oakley        5
           ... 
Harvey        1
Callie        1
Ozzie         1
Pluto         1
Clarq         1
Lorelei       1
Tom           1
Godi          1
Iroh          1
Sonny         1
Jazz          1
Opie          1
Meatball      1
Harrison      1
Tycho         1
Keet          1
Chadrick      1
Darla         1
Wishes        1
Tonks         1
Shooter       1
Olaf          1
Rudy          1
Glacier       1
Bloo          1
Sobe          1
Chesney       1
Emmie         1
Farfle        1
Chubbs        1
Name: name, Length: 957,

##### Some of the values in the name column are "None","the","such" ,"a" and "an". These dont look like real names of the dogs. Majority of the rows ( 745) have None populated. Does this mean these rows dont have a Dog name available and data is entered as None.

##### All the valid values for the name are starting with Uppercase letter.
##### All the invalid values are starting with lowercase
##### let's take a closer look at the invalid name values and their counts

In [19]:
tweet_archive_data.loc[tweet_archive_data['name'].str.islower()].name.value_counts()

a               55
the              8
an               7
very             5
quite            4
one              4
just             4
actually         2
getting          2
mad              2
not              2
space            1
infuriating      1
life             1
all              1
his              1
officially       1
light            1
my               1
old              1
this             1
such             1
unacceptable     1
incredibly       1
by               1
Name: name, dtype: int64

In [20]:
tweet_archive_data.loc[tweet_archive_data['name'].str.islower()].name.count()

109

##### There are 109 rows that have invalid values for name
##### There are 745 rows that have name value as None


##### Let's  see where the tweets are originated from

In [21]:
tweet_archive_data.source.value_counts()

<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>     2221
<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>                          91
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                       33
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>      11
Name: source, dtype: int64

##### Looks like the majority of the tweets are originated from an iphone

##### Let's look at the tweet text data

In [22]:
tweet_archive_data.text.nunique()

2356

In [48]:
tweet_archive_data.text.value_counts

<bound method IndexOpsMixin.value_counts of 0       This is Phineas. He's a mystical boy. Only eve...
1       This is Tilly. She's just checking pup on you....
2       This is Archie. He is a rare Norwegian Pouncin...
3       This is Darla. She commenced a snooze mid meal...
4       This is Franklin. He would like you to stop ca...
5       Here we have a majestic great white breaching ...
6       Meet Jax. He enjoys ice cream so much he gets ...
7       When you watch your owner call another dog a g...
8       This is Zoey. She doesn't want to be one of th...
9       This is Cassie. She is a college pup. Studying...
10      This is Koda. He is a South Australian decksha...
11      This is Bruno. He is a service shark. Only get...
12      Here's a puppo that seems to be on the fence a...
13      This is Ted. He does his best. Sometimes that'...
14      This is Stuart. He's sporting his favorite fan...
15      This is Oliver. You're witnessing one of his m...
16      This is Jim. He foun

In [23]:
tweet_archive_data[['name','text']]

Unnamed: 0,name,text
0,Phineas,This is Phineas. He's a mystical boy. Only eve...
1,Tilly,This is Tilly. She's just checking pup on you....
2,Archie,This is Archie. He is a rare Norwegian Pouncin...
3,Darla,This is Darla. She commenced a snooze mid meal...
4,Franklin,This is Franklin. He would like you to stop ca...
5,,Here we have a majestic great white breaching ...
6,Jax,Meet Jax. He enjoys ice cream so much he gets ...
7,,When you watch your owner call another dog a g...
8,Zoey,This is Zoey. She doesn't want to be one of th...
9,Cassie,This is Cassie. She is a college pup. Studying...


##### It looks like name is extracted from the text value.

#### let's see sample records for the text value where the name is populated a None

In [24]:
tweet_archive_data.query('name=="None"')[['text']].sample(5)

Unnamed: 0,text
101,RT @loganamnosis: Penelope here is doing me qu...
568,"RT @ChinoChinako: They're good products, Brent..."
1837,"""Yes hello I'ma just snag this here toasted ba..."
1678,We normally don't rate bears but this one seem...
37,Here we have a corgi undercover as a malamute....


##### It looks like the text does not contain the name of the dog, hence the parser couldnt not pick up any value for the dog name

##### Let's take a look at expanded_urls data

In [25]:
tweet_archive_data.expanded_urls.sample(5)

2146    https://twitter.com/dog_rates/status/669923323...
1837    https://twitter.com/dog_rates/status/676089483...
897     https://twitter.com/dog_rates/status/759047813...
1691    https://twitter.com/dog_rates/status/681320187...
810     https://twitter.com/dog_rates/status/771380798...
Name: expanded_urls, dtype: object

In [26]:
tweet_archive_data.expanded_urls.nunique()

2218

##### There are 2218 unique expanded urls!

In [27]:
tweet_archive_data.expanded_urls.value_counts()

https://twitter.com/dog_rates/status/740373189193256964/photo/1,https://twitter.com/dog_rates/status/740373189193256964/photo/1,https://twitter.com/dog_rates/status/740373189193256964/photo/1,https://twitter.com/dog_rates/status/740373189193256964/photo/1                                              2
https://twitter.com/dog_rates/status/786963064373534720/photo/1                                                                                                                                                                                                                                              2
https://twitter.com/dog_rates/status/809920764300447744/photo/1                                                                                                                                                                                                                                              2
https://twitter.com/dog_rates/status/819227688460238848/photo/1                            

#### Let's look at the rating_denominator and rating_numerator values

In [28]:
tweet_archive_data.rating_denominator.value_counts()

10     2333
11        3
50        3
80        2
20        2
2         1
16        1
40        1
70        1
15        1
90        1
110       1
120       1
130       1
150       1
170       1
7         1
0         1
Name: rating_denominator, dtype: int64

##### Majority of the data , 2333 rows have a valid denominator of 10

In [29]:
tweet_archive_data.query("rating_denominator != 10")

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
313,835246439529840640,8.35246e+17,26259580.0,2017-02-24 21:54:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@jonnysun @Lin_Manuel ok jomny I know you're e...,,,,,960,0,,,,,
342,832088576586297345,8.320875e+17,30582080.0,2017-02-16 04:45:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@docmisterio account started on 11/15/15,,,,,11,15,,,,,
433,820690176645140481,,,2017-01-15 17:52:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",The floofs have been released I repeat the flo...,,,,https://twitter.com/dog_rates/status/820690176...,84,70,,,,,
516,810984652412424192,,,2016-12-19 23:06:23 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Sam. She smiles 24/7 &amp; secretly aspir...,,,,"https://www.gofundme.com/sams-smile,https://tw...",24,7,Sam,,,,
784,775096608509886464,,,2016-09-11 22:20:06 +0000,"<a href=""http://twitter.com/download/iphone"" r...","RT @dog_rates: After so many requests, this is...",7.403732e+17,4196984000.0,2016-06-08 02:41:38 +0000,https://twitter.com/dog_rates/status/740373189...,9,11,,,,,
902,758467244762497024,,,2016-07-28 01:00:57 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Why does this never happen at my front door......,,,,https://twitter.com/dog_rates/status/758467244...,165,150,,,,,
1068,740373189193256964,,,2016-06-08 02:41:38 +0000,"<a href=""http://twitter.com/download/iphone"" r...","After so many requests, this is Bretagne. She ...",,,,https://twitter.com/dog_rates/status/740373189...,9,11,,,,,
1120,731156023742988288,,,2016-05-13 16:15:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to this unbelievably well behaved sq...,,,,https://twitter.com/dog_rates/status/731156023...,204,170,this,,,,
1165,722974582966214656,,,2016-04-21 02:25:47 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Happy 4/20 from the squad! 13/10 for all https...,,,,https://twitter.com/dog_rates/status/722974582...,4,20,,,,,
1202,716439118184652801,,,2016-04-03 01:36:11 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Bluebert. He just saw that both #Final...,,,,https://twitter.com/dog_rates/status/716439118...,50,50,Bluebert,,,,


##### There are  23 rows where rating_denominator is not equal to 10

##### Lets take a closer look at the rating_numerator

In [30]:
tweet_archive_data.rating_numerator.value_counts()

12      558
11      464
10      461
13      351
9       158
8       102
7        55
14       54
5        37
6        32
3        19
4        17
1         9
2         9
420       2
0         2
15        2
75        2
80        1
20        1
24        1
26        1
44        1
50        1
60        1
165       1
84        1
88        1
144       1
182       1
143       1
666       1
960       1
1776      1
17        1
27        1
45        1
99        1
121       1
204       1
Name: rating_numerator, dtype: int64

#####  According to WeRateDogs,dog ratings almost always have a denominator of 10. The numerators, though? Almost always greater than 10. 11/10, 12/10, 13/10, etc. Why? Because ["they're good dogs Brent."](https://knowyourmeme.com/memes/theyre-good-dogs-brent) WeRateDogs has over 4 million followers and has received international media coverage.

In [31]:
tweet_archive_data.query("rating_numerator > 14").sort_values('rating_numerator')

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
285,838916489579200512,,,2017-03-07 00:57:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @KibaDva: I collected all the good dogs!! 1...,8.38906e+17,811740800.0,2017-03-07 00:15:46 +0000,https://twitter.com/KibaDva/status/83890598062...,15,10,,,,,
291,838085839343206401,8.380855e+17,2894131000.0,2017-03-04 17:56:49 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@bragg6of8 @Andy_Pace_ we are still looking fo...,,,,,15,10,,,,,
55,881633300179243008,8.81607e+17,47384430.0,2017-07-02 21:58:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@roushfenway These are good dogs but 17/10 is ...,,,,,17,10,,,,,
1663,682808988178739200,6.827884e+17,4196984000.0,2016-01-01 06:22:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...","I'm aware that I could've said 20/16, but here...",,,,,20,16,,,,,
516,810984652412424192,,,2016-12-19 23:06:23 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Sam. She smiles 24/7 &amp; secretly aspir...,,,,"https://www.gofundme.com/sams-smile,https://tw...",24,7,Sam,,,,
1712,680494726643068929,,,2015-12-25 21:06:00 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have uncovered an entire battalion of ...,,,,https://twitter.com/dog_rates/status/680494726...,26,10,,,,,
763,778027034220126208,,,2016-09-20 00:24:34 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Sophie. She's a Jubilant Bush Pupper. ...,,,,https://twitter.com/dog_rates/status/778027034...,27,10,Sophie,,,pupper,
1433,697463031882764288,,,2016-02-10 16:51:59 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Happy Wednesday here's a bucket of pups. 44/40...,,,,https://twitter.com/dog_rates/status/697463031...,44,40,,,,,
1274,709198395643068416,,,2016-03-14 02:04:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...","From left to right:\nCletus, Jerome, Alejandro...",,,,https://twitter.com/dog_rates/status/709198395...,45,50,,,,,
1202,716439118184652801,,,2016-04-03 01:36:11 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Bluebert. He just saw that both #Final...,,,,https://twitter.com/dog_rates/status/716439118...,50,50,Bluebert,,,,


##### There are 28 rows with rating_numerator greater than 14 ..... are these exceeding the ratings for "They are good dog brent" these may be erroneous.

##### Lets take a look at the rows where rating_numerator is greater than 14 and rating_denominator is not equal to 10

In [32]:
tweet_archive_data.query("rating_numerator > 14 & rating_denominator != 10").sort_values('rating_numerator')

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
1663,682808988178739200,6.827884e+17,4196984000.0,2016-01-01 06:22:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...","I'm aware that I could've said 20/16, but here...",,,,,20,16,,,,,
516,810984652412424192,,,2016-12-19 23:06:23 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Sam. She smiles 24/7 &amp; secretly aspir...,,,,"https://www.gofundme.com/sams-smile,https://tw...",24,7,Sam,,,,
1433,697463031882764288,,,2016-02-10 16:51:59 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Happy Wednesday here's a bucket of pups. 44/40...,,,,https://twitter.com/dog_rates/status/697463031...,44,40,,,,,
1274,709198395643068416,,,2016-03-14 02:04:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...","From left to right:\nCletus, Jerome, Alejandro...",,,,https://twitter.com/dog_rates/status/709198395...,45,50,,,,,
1202,716439118184652801,,,2016-04-03 01:36:11 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Bluebert. He just saw that both #Final...,,,,https://twitter.com/dog_rates/status/716439118...,50,50,Bluebert,,,,
1351,704054845121142784,,,2016-02-28 21:25:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here is a whole flock of puppers. 60/50 I'll ...,,,,https://twitter.com/dog_rates/status/704054845...,60,50,a,,,,
1254,710658690886586372,,,2016-03-18 02:46:49 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here's a brigade of puppers. All look very pre...,,,,https://twitter.com/dog_rates/status/710658690...,80,80,,,,,
433,820690176645140481,,,2017-01-15 17:52:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",The floofs have been released I repeat the flo...,,,,https://twitter.com/dog_rates/status/820690176...,84,70,,,,,
1843,675853064436391936,,,2015-12-13 01:41:41 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have an entire platoon of puppers. Tot...,,,,https://twitter.com/dog_rates/status/675853064...,88,80,,,,,
1228,713900603437621249,,,2016-03-27 01:29:02 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Happy Saturday here's 9 puppers on a bench. 99...,,,,https://twitter.com/dog_rates/status/713900603...,99,90,,,,,


#####  There are 16 records that have rating_numerator greater than 14 and rating_denominator not equal to 10
##### These records definitely look errorneous as the ratings do not make any sense

#### Let's take a look at the records where rating_numerator is less than 10

In [33]:
tweet_archive_data.query("rating_numerator < 14").sort_values('rating_numerator')

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
1016,746906459439529985,7.468859e+17,4.196984e+09,2016-06-26 03:22:31 +0000,"<a href=""http://twitter.com/download/iphone"" r...","PUPDATE: can't see any. Even if I could, I cou...",,,,https://twitter.com/dog_rates/status/746906459...,0,10,,,,,
315,835152434251116546,,,2017-02-24 15:40:31 +0000,"<a href=""http://twitter.com/download/iphone"" r...",When you're so blinded by your systematic plag...,,,,https://twitter.com/dog_rates/status/835152434...,0,10,,,,,
2261,667549055577362432,,,2015-11-20 03:44:31 +0000,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Never seen dog like this. Breathes heavy. Tilt...,,,,https://twitter.com/dog_rates/status/667549055...,1,10,,,,,
2091,670783437142401025,,,2015-11-29 01:56:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Flamboyant pup here. Probably poisonous. Won't...,,,,https://twitter.com/dog_rates/status/670783437...,1,10,,,,,
2038,671550332464455680,6.715449e+17,4.196984e+09,2015-12-01 04:44:10 +0000,"<a href=""http://twitter.com/download/iphone"" r...",After 22 minutes of careful deliberation this ...,,,,,1,10,,,,,
1446,696490539101908992,6.964887e+17,4.196984e+09,2016-02-08 00:27:39 +0000,"<a href=""http://twitter.com/download/iphone"" r...",After reading the comments I may have overesti...,,,,,1,10,,,,,
2338,666104133288665088,,,2015-11-16 04:02:55 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Not familiar with this breed. No tail (weird)....,,,,https://twitter.com/dog_rates/status/666104133...,1,10,,,,,
2335,666287406224695296,,,2015-11-16 16:11:11 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is an Albanian 3 1/2 legged Episcopalian...,,,,https://twitter.com/dog_rates/status/666287406...,1,2,an,,,,
605,798576900688019456,,,2016-11-15 17:22:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: Not familiar with this breed. N...,6.661041e+17,4.196984e+09,2015-11-16 04:02:55 +0000,https://twitter.com/dog_rates/status/666104133...,1,10,,,,,
1869,675153376133427200,,,2015-12-11 03:21:23 +0000,"<a href=""http://twitter.com/download/iphone"" r...",What kind of person sends in a picture without...,,,,https://twitter.com/dog_rates/status/675153376...,1,10,,,,,


#### Upon observing the records manually in the file we can find that some of the rating_numerator values can also be found in text and they are extracted errorneously. These records can be found using the below code.

In [125]:
tweet_archive_data[tweet_archive_data.text.str.contains(r'(\d+\.\d*\/\d+)')][['text','rating_numerator']]

  if __name__ == '__main__':


Unnamed: 0,text,rating_numerator
45,This is Bella. She hopes her smile made you sm...,5
340,"RT @dog_rates: This is Logan, the Chow who liv...",75
695,"This is Logan, the Chow who lived. He solemnly...",75
763,This is Sophie. She's a Jubilant Bush Pupper. ...,27
1689,I've been told there's a slight possibility he...,5
1712,Here we have uncovered an entire battalion of ...,26


##### There are six records in the tweet_archive_data where rating numerator is extracted incorrectly

##### Let's take a look at the last four columns of tweet_archive_data dataframe

In [43]:
tweet_archive_data[['doggo','floofer','pupper','puppo']].head(10)

Unnamed: 0,doggo,floofer,pupper,puppo
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
5,,,,
6,,,,
7,,,,
8,,,,
9,doggo,,,


##### Looks like these four columns represent data for dog breed , which can be represented in one column called dog_breed

### Visual Assessment
#### For tweet_image_data

In [46]:
tweet_image_data

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.072010,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True
5,666050758794694657,https://pbs.twimg.com/media/CT5Jof1WUAEuVxN.jpg,1,Bernese_mountain_dog,0.651137,True,English_springer,0.263788,True,Greater_Swiss_Mountain_dog,0.016199,True
6,666051853826850816,https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg,1,box_turtle,0.933012,False,mud_turtle,0.045885,False,terrapin,0.017885,False
7,666055525042405380,https://pbs.twimg.com/media/CT5N9tpXIAAifs1.jpg,1,chow,0.692517,True,Tibetan_mastiff,0.058279,True,fur_coat,0.054449,False
8,666057090499244032,https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg,1,shopping_cart,0.962465,False,shopping_basket,0.014594,False,golden_retriever,0.007959,True
9,666058600524156928,https://pbs.twimg.com/media/CT5Qw94XAAA_2dP.jpg,1,miniature_poodle,0.201493,True,komondor,0.192305,True,soft-coated_wheaten_terrier,0.082086,True


##### Based on just visual assessmet we can see that there are 12 observations in this table and it holds the images url for the images in each tweet and the dog breed predictions using the image recognition software. It looks like this table is pretty clean, no NaN values found so far. Although the column names are not descriptive.

### Programmatic Assessment
#### For tweet_image_date

In [45]:
tweet_image_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


###### There are 12 columns and 2075 observations
###### Column names are not very descriptive.
######  None of the columns have NaN values

###### lets see if there are any duplicate tweet_ids

In [47]:
tweet_image_data[tweet_image_data.tweet_id.duplicated()]

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog


###### There are no duplicate tweet_ids so these are all unique tweets or retweets may be
###### Lets see if there are duplicates in image urls, i.e if a image is used my multiple tweets.

In [48]:
tweet_image_data[tweet_image_data.jpg_url.duplicated(keep=False)].sort_values('jpg_url')

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
480,675354435921575936,https://pbs.twimg.com/ext_tw_video_thumb/67535...,1,upright,0.303415,False,golden_retriever,0.181351,True,Brittany_spaniel,0.162084,True
1297,752309394570878976,https://pbs.twimg.com/ext_tw_video_thumb/67535...,1,upright,0.303415,False,golden_retriever,0.181351,True,Brittany_spaniel,0.162084,True
1864,842892208864923648,https://pbs.twimg.com/ext_tw_video_thumb/80710...,1,Chihuahua,0.505370,True,Pomeranian,0.120358,True,toy_terrier,0.077008,True
1641,807106840509214720,https://pbs.twimg.com/ext_tw_video_thumb/80710...,1,Chihuahua,0.505370,True,Pomeranian,0.120358,True,toy_terrier,0.077008,True
1703,817181837579653120,https://pbs.twimg.com/ext_tw_video_thumb/81596...,1,Tibetan_mastiff,0.506312,True,Tibetan_terrier,0.295690,True,otterhound,0.036251,True
1691,815966073409433600,https://pbs.twimg.com/ext_tw_video_thumb/81596...,1,Tibetan_mastiff,0.506312,True,Tibetan_terrier,0.295690,True,otterhound,0.036251,True
1705,817423860136083457,https://pbs.twimg.com/ext_tw_video_thumb/81742...,1,ice_bear,0.336200,False,Samoyed,0.201358,True,Eskimo_dog,0.186789,True
1858,841833993020538882,https://pbs.twimg.com/ext_tw_video_thumb/81742...,1,ice_bear,0.336200,False,Samoyed,0.201358,True,Eskimo_dog,0.186789,True
1715,819004803107983360,https://pbs.twimg.com/media/C12whDoVEAALRxa.jpg,1,standard_poodle,0.351308,True,toy_poodle,0.271929,True,Tibetan_terrier,0.094759,True
1718,819015337530290176,https://pbs.twimg.com/media/C12whDoVEAALRxa.jpg,1,standard_poodle,0.351308,True,toy_poodle,0.271929,True,Tibetan_terrier,0.094759,True


###### Interesting to see image url is shared by more than one tweet, what does this mean?
###### Since image predictions data is created for the twitter ids in tweet_archive_data, lets take a look at the tweet archive data for a sample of these tweets sharing the same image url.

In [49]:
tweet_archive_data.query("tweet_id in (675354435921575936,752309394570878976)")

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
949,752309394570878976,,,2016-07-11 01:11:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: Everyone needs to watch this. 1...,6.753544e+17,4196984000.0,2015-12-11 16:40:19 +0000,https://twitter.com/dog_rates/status/675354435...,13,10,,,,,
1865,675354435921575936,,,2015-12-11 16:40:19 +0000,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Everyone needs to watch this. 13/10 https://t....,,,,https://twitter.com/dog_rates/status/675354435...,13,10,,,,,


In [50]:
tweet_archive_data.query("tweet_id in (711694788429553666,761371037149827077)")

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
872,761371037149827077,,,2016-08-05 01:19:35 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: Oh. My. God. 13/10 magical af h...,7.116948e+17,4196984000.0,2016-03-20 23:23:54 +0000,https://twitter.com/dog_rates/status/711694788...,13,10,,,,,
1246,711694788429553666,,,2016-03-20 23:23:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Oh. My. God. 13/10 magical af https://t.co/Ezu...,,,,https://twitter.com/dog_rates/status/711694788...,13,10,,,,,


###### If you look at the retweeted_status_id , it holds the original tweet_id if it is a retweet. This is one way to recognize retweets. 

### Visual Assessment
#### For tweet_count_data

In [51]:
tweet_count_data

Unnamed: 0,tweet_id,retweet_count,favorite_count
0,892420643555336193,8257,37848
1,892177421306343426,6101,32523
2,891815181378084864,4039,24483
3,891689557279858688,8400,41202
4,891327558926688256,9106,39378
5,891087950875897856,3022,19795
6,890971913173991426,1999,11570
7,890729181411237888,18341,63867
8,890609185150312448,4157,27212
9,890240255349198849,7172,31193


##### Based on visual assement we can see that this data set consists for three observations tweet_id, retweet_count and favorite_count
##### Looks like there aren't any NaN values

### Programmatic Assessment
#### For tweet_count_data

In [52]:
tweet_count_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2340 entries, 0 to 2339
Data columns (total 3 columns):
tweet_id          2340 non-null int64
retweet_count     2340 non-null int64
favorite_count    2340 non-null int64
dtypes: int64(3)
memory usage: 54.9 KB


###### There are 2340 observations and three columns. There are no NaN values

## Quality

#### twitter-archive-enhanced.csv

###### 1. in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id , retweeted_status_user_id, retweeted_status_timestamp columns have majority null values.
###### 2. retweeted_status_id  is populated for retweets. To prepare a simple dataset to join with tweet counts lets remove the retweets.
###### 3. in_reply_to_status_id is populated for tweet replies.Lets remove these tweet replies.
###### 4. name column has majority of the rows ( 745) have None populated.There are also 109 rows that have invalid values for name with values like ( 'the','a','an','such' etc).
###### 5. Invalid values in ratings numerator and rating denominator. There are 16 records that have ratings numerator greater than 14 and ratings denominator not equal to 10.
###### 6. There are  some records with decimal values for rating_numerator. These values were incorrectly extracted from the text value.
###### 7. timestamp column is not in date type.
###### 8. When one of the columns in columns doggo, floofer, pupper, puppo is populated, other columns are populated with a value "None" . This should be represented as a Null value.
###### 9. Source column data is not very useful for the analysis. Delete this column.

#### image-predictions.tsv

###### 9. column names must be renamed to something more descriptive about the data it holds
###### 10. Some breed name values populated in p1_dog are starting with upper case and some are starting with lower case letter.


### Tidiness 

#### twitter-archive-enhanced.csv

###### 1. Dog breed value can be restructured to be stored in one column instead of four different columns.

#### image-predictions.tsv

###### 2.  Re structure  or drop the columns that are unnecessary. such as img_num,p2, p2_conf,p2_dog,p3,p3_conf,p3_dog 

###### 3. twitter_archive_data has 2356 rows and twitter_image_data has 2075 rows. After removing the retweets we need to merge the data in these datasets with tweet_counts to have the data in single data frame



# Clean

#### Define
1. Select all the rows that are retweets
2. Delete all rows that are retweets
3. select the rows that are tweet replies
4. Delete the rows that are tweet replies
3. Delete columns in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id , retweeted_status_user_id, retweeted_status_timestamp after removing the retweets

#### Code

In [266]:
# Create a copy of all the origina data frames and perform cleaning operations on the copies
# we do not want to loose the original data set
tweet_archive_data_clean = tweet_archive_data.copy()
tweet_image_data_clean = tweet_image_data.copy()
tweet_count_data_clean = tweet_count_data.copy()


In [267]:
tweet_archive_data_clean.head(1)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,


In [268]:
# creating boolean series True for retweeted_status_id isnull 
# original tweets have retweeted_status_id as null
retweet_is_null = pd.isnull(tweet_archive_data_clean['retweeted_status_id'])
tweet_archive_data_clean = tweet_archive_data_clean[retweet_is_null]

In [269]:
# creating boolean series True for reply_status_id isnull 
# original tweets have reply_status_id as null
reply_is_null = pd.isnull(tweet_archive_data_clean['in_reply_to_status_id'])
tweet_archive_data_clean = tweet_archive_data_clean[reply_is_null]

In [270]:
# Drop columns in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,
# retweeted_status_user_id, retweeted_status_timestamp 

tweet_archive_data_clean.drop(['in_reply_to_status_id', 'in_reply_to_user_id',
                            'retweeted_status_id','retweeted_status_user_id',
                            'retweeted_status_timestamp'],inplace = True, axis=1)

#### Test

In [271]:
tweet_archive_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 12 columns):
tweet_id              2097 non-null int64
timestamp             2097 non-null object
source                2097 non-null object
text                  2097 non-null object
expanded_urls         2094 non-null object
rating_numerator      2097 non-null int64
rating_denominator    2097 non-null int64
name                  2097 non-null object
doggo                 2097 non-null object
floofer               2097 non-null object
pupper                2097 non-null object
puppo                 2097 non-null object
dtypes: int64(3), object(9)
memory usage: 213.0+ KB


#### Define
name column has majority of the rows ( 745) have None populated.There are also 109 rows that have invalid values for name with values like ( 'the','a','an','such' etc).
1. Gather the values in the name column that start with lowercase. These seem to be invalid values. Store them in a list.
2. Replace these values with null values
3. Replace None with Null values


#### Code

In [272]:
# Create a boolean series for name starting with lowercase letter
name_lower_case = tweet_archive_data_clean['name'].str[0].str.islower()

In [273]:
import numpy as np

In [274]:
# store these name values into an array
invalid_names = tweet_archive_data_clean[name_lower_case].name.values

In [275]:
invalid_names

array(['such', 'a', 'quite', 'quite', 'not', 'one', 'incredibly', 'a', 'a',
       'very', 'my', 'one', 'not', 'his', 'one', 'a', 'a', 'a', 'an',
       'very', 'actually', 'a', 'just', 'getting', 'mad', 'very', 'this',
       'unacceptable', 'all', 'a', 'old', 'a', 'infuriating', 'a', 'a',
       'a', 'an', 'a', 'a', 'very', 'getting', 'just', 'a', 'the', 'the',
       'actually', 'by', 'a', 'officially', 'a', 'the', 'the', 'a', 'a',
       'a', 'a', 'life', 'a', 'one', 'a', 'a', 'a', 'light', 'just',
       'space', 'a', 'the', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a',
       'an', 'a', 'the', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a',
       'a', 'quite', 'a', 'an', 'a', 'an', 'the', 'the', 'a', 'a', 'an',
       'a', 'a', 'a', 'a'], dtype=object)

In [276]:
# replace the above values in the name column with null values
tweet_archive_data_clean.name.replace(invalid_names,np.nan, inplace = True)

In [277]:
# replace the "None" value name column with null values
tweet_archive_data_clean.name.replace('None',np.nan, inplace = True)

#### Test

In [278]:
tweet_archive_data_clean.name.unique()

array(['Phineas', 'Tilly', 'Archie', 'Darla', 'Franklin', nan, 'Jax',
       'Zoey', 'Cassie', 'Koda', 'Bruno', 'Ted', 'Stuart', 'Oliver', 'Jim',
       'Zeke', 'Ralphus', 'Gerald', 'Jeffrey', 'Canela', 'Maya', 'Mingus',
       'Derek', 'Roscoe', 'Waffles', 'Jimbo', 'Maisey', 'Earl', 'Lola',
       'Kevin', 'Yogi', 'Noah', 'Bella', 'Grizzwald', 'Rusty', 'Gus',
       'Stanley', 'Alfy', 'Koko', 'Rey', 'Gary', 'Elliot', 'Louis',
       'Jesse', 'Romeo', 'Bailey', 'Duddles', 'Jack', 'Steven', 'Beau',
       'Snoopy', 'Shadow', 'Emmy', 'Aja', 'Penny', 'Dante', 'Nelly',
       'Ginger', 'Benedict', 'Venti', 'Goose', 'Nugget', 'Cash', 'Jed',
       'Sebastian', 'Sierra', 'Monkey', 'Harry', 'Kody', 'Lassie', 'Rover',
       'Napolean', 'Boomer', 'Cody', 'Rumble', 'Clifford', 'Dewey',
       'Scout', 'Gizmo', 'Walter', 'Cooper', 'Harold', 'Shikha', 'Lili',
       'Jamesy', 'Coco', 'Sammy', 'Meatball', 'Paisley', 'Albus',
       'Neptune', 'Belle', 'Quinn', 'Zooey', 'Dave', 'Jersey', 'Hobbes',


#### Define
1. Change the timestamp column to date data type

#### Code

In [281]:
tweet_archive_data_clean.timestamp = pd.to_datetime( tweet_archive_data_clean.timestamp,infer_datetime_format=True)

In [282]:
tweet_archive_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 12 columns):
tweet_id              2097 non-null int64
timestamp             2097 non-null datetime64[ns]
source                2097 non-null object
text                  2097 non-null object
expanded_urls         2094 non-null object
rating_numerator      2097 non-null int64
rating_denominator    2097 non-null int64
name                  1390 non-null object
doggo                 2097 non-null object
floofer               2097 non-null object
pupper                2097 non-null object
puppo                 2097 non-null object
dtypes: datetime64[ns](1), int64(3), object(8)
memory usage: 213.0+ KB


#### Define
There are 23 rows where rating denominator is not equal to 10
 For consistency purposes I onyl want to keep the records where denominator is  equal to 10.

1. Filter out the rows where ratings denominator is not equal to 10

#### Code

In [283]:
tweet_archive_data_clean = tweet_archive_data_clean[tweet_archive_data_clean.rating_denominator == 10]

#### Test

In [284]:
tweet_archive_data_clean.rating_denominator.unique()

array([10])

#### Define
1. Extract decimal rating_numerator from the text column and update the rating_numerator to reflect correct values.


#### Code

In [285]:
tweet_archive_data_clean[tweet_archive_data_clean.text.str.contains(r"(\d+\.\d*\/\d+)")][['text', 'rating_numerator']]

  if __name__ == '__main__':


Unnamed: 0,text,rating_numerator
45,This is Bella. She hopes her smile made you sm...,5
695,"This is Logan, the Chow who lived. He solemnly...",75
763,This is Sophie. She's a Jubilant Bush Pupper. ...,27
1712,Here we have uncovered an entire battalion of ...,26


In [286]:
rows_to_update = tweet_archive_data_clean[tweet_archive_data_clean.text.str.contains(r"(\d+\.\d*\/\d+)")].index.values
cols_to_update = ['rating_numerator']
values = tweet_archive_data_clean[tweet_archive_data_clean.text.str.contains(r"(\d+\.\d*\/\d+)")]['text'].str.extract(r"(\d+\.\d+)").values
tweet_archive_data_clean.loc[rows_to_update, cols_to_update] = values

  if __name__ == '__main__':
  app.launch_new_instance()
  app.launch_new_instance()


#### Test

In [287]:
tweet_archive_data_clean[tweet_archive_data_clean.text.str.contains(r"(\d+\.\d*\/\d+)")][['text','rating_numerator','rating_denominator']]

  if __name__ == '__main__':


Unnamed: 0,text,rating_numerator,rating_denominator
45,This is Bella. She hopes her smile made you sm...,13.5,10
695,"This is Logan, the Chow who lived. He solemnly...",9.75,10
763,This is Sophie. She's a Jubilant Bush Pupper. ...,11.27,10
1712,Here we have uncovered an entire battalion of ...,11.26,10


In [288]:
tweet_archive_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2080 entries, 0 to 2355
Data columns (total 12 columns):
tweet_id              2080 non-null int64
timestamp             2080 non-null datetime64[ns]
source                2080 non-null object
text                  2080 non-null object
expanded_urls         2077 non-null object
rating_numerator      2080 non-null object
rating_denominator    2080 non-null int64
name                  1387 non-null object
doggo                 2080 non-null object
floofer               2080 non-null object
pupper                2080 non-null object
puppo                 2080 non-null object
dtypes: datetime64[ns](1), int64(2), object(9)
memory usage: 291.2+ KB


#### rating_numerator column got converted into string object type during the extract/population of decimal  ratings  from the  text column


#### Define
#### convert the rating_number to float data type

##### Code

In [289]:
tweet_archive_data_clean['rating_numerator'] = pd.to_numeric(tweet_archive_data_clean['rating_numerator'])

##### Test

In [290]:
tweet_archive_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2080 entries, 0 to 2355
Data columns (total 12 columns):
tweet_id              2080 non-null int64
timestamp             2080 non-null datetime64[ns]
source                2080 non-null object
text                  2080 non-null object
expanded_urls         2077 non-null object
rating_numerator      2080 non-null float64
rating_denominator    2080 non-null int64
name                  1387 non-null object
doggo                 2080 non-null object
floofer               2080 non-null object
pupper                2080 non-null object
puppo                 2080 non-null object
dtypes: datetime64[ns](1), float64(1), int64(2), object(8)
memory usage: 291.2+ KB


In [291]:
# set options to display floating values to two decimal precision
pd.options.display.float_format = "{:,.2f}".format

#### Define
1. Filter out extreme outliers in Ratings numerator
i.e delete rows where ratings numerator is ( 1776,420,75) any ratings greater than equal to 74

In [292]:
tweet_archive_data_clean.rating_numerator.unique()

array([  1.30000000e+01,   1.20000000e+01,   1.40000000e+01,
         1.35000000e+01,   1.10000000e+01,   6.00000000e+00,
         1.00000000e+01,   0.00000000e+00,   9.75000000e+00,
         5.00000000e+00,   1.12700000e+01,   3.00000000e+00,
         7.00000000e+00,   8.00000000e+00,   9.00000000e+00,
         4.00000000e+00,   1.77600000e+03,   1.12600000e+01,
         2.00000000e+00,   1.00000000e+00,   4.20000000e+02])

#### Code

In [293]:
tweet_archive_data_clean = tweet_archive_data_clean[tweet_archive_data_clean.rating_numerator < 4.20000000e+02]

#### Test

In [294]:
tweet_archive_data_clean.rating_numerator.value_counts()

12.00    486
10.00    436
11.00    413
13.00    287
9.00     152
8.00      98
7.00      51
14.00     38
5.00      33
6.00      32
3.00      19
4.00      15
2.00       9
1.00       4
0.00       1
11.27      1
13.50      1
11.26      1
9.75       1
Name: rating_numerator, dtype: int64

In [295]:
tweet_archive_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2078 entries, 0 to 2355
Data columns (total 12 columns):
tweet_id              2078 non-null int64
timestamp             2078 non-null datetime64[ns]
source                2078 non-null object
text                  2078 non-null object
expanded_urls         2075 non-null object
rating_numerator      2078 non-null float64
rating_denominator    2078 non-null int64
name                  1386 non-null object
doggo                 2078 non-null object
floofer               2078 non-null object
pupper                2078 non-null object
puppo                 2078 non-null object
dtypes: datetime64[ns](1), float64(1), int64(2), object(8)
memory usage: 211.0+ KB


#### Define
Delete source and expanded_urls column

#### Code

In [296]:
tweet_archive_data_clean.drop(['source','expanded_urls'],inplace = True, axis=1)

#### Test

In [297]:
tweet_archive_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2078 entries, 0 to 2355
Data columns (total 10 columns):
tweet_id              2078 non-null int64
timestamp             2078 non-null datetime64[ns]
text                  2078 non-null object
rating_numerator      2078 non-null float64
rating_denominator    2078 non-null int64
name                  1386 non-null object
doggo                 2078 non-null object
floofer               2078 non-null object
pupper                2078 non-null object
puppo                 2078 non-null object
dtypes: datetime64[ns](1), float64(1), int64(2), object(6)
memory usage: 178.6+ KB


##### Tidiness for twitter_archive_data

#### Define

1. Create a new columns called breed_type and populate it using the data in the columns doggo,floofer,pupper and puppo. 


#### Code

In [298]:
tweet_archive_data_clean[['doggo', 'floofer','pupper', 'puppo']].sample(5)

Unnamed: 0,doggo,floofer,pupper,puppo
1596,,,,
1321,,,pupper,
1544,,,,
1494,,,,
1851,,,,


In [299]:
# replace None value with Null
columns = ['doggo', 'floofer','pupper', 'puppo']
for col in columns:
    tweet_archive_data_clean[col].replace('None',np.nan, inplace = True)

In [300]:
tweet_archive_data_clean.query('tweet_id in (855851453814013952,892420643555336193,890240255349198849)')

Unnamed: 0,tweet_id,timestamp,text,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,2017-08-01 16:23:56,This is Phineas. He's a mystical boy. Only eve...,13.0,10,Phineas,,,,
9,890240255349198849,2017-07-26 15:59:51,This is Cassie. She is a college pup. Studying...,14.0,10,Cassie,doggo,,,
191,855851453814013952,2017-04-22 18:31:02,Here's a puppo participating in the #ScienceMa...,13.0,10,,doggo,,,puppo


In [301]:
def breed_type(in_cols):
    breed = []
    for col in in_cols:
        if col in in_cols:
            breed.append(col)
    if len(breed)>0:
        return ','.join(breed)
    else:
        return None
    
    

In [302]:
test_df=tweet_archive_data_clean.query('tweet_id in (855851453814013952,892420643555336193,890240255349198849)')

In [303]:
test_df[columns].apply(breed_type,axis=1 )

0             None
9            doggo
191    doggo,puppo
dtype: object

In [304]:
tweet_archive_data_clean['breed'] = tweet_archive_data_clean[columns].apply(breed_type,axis=1 )

In [305]:
tweet_archive_data_clean[['breed','doggo', 'floofer','pupper', 'puppo']]

Unnamed: 0,breed,doggo,floofer,pupper,puppo
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,doggo,doggo,,,


In [306]:
tweet_archive_data_clean.drop(['doggo', 'floofer', 'pupper', 'puppo'], 
    inplace = True, axis = 1)

In [307]:
tweet_archive_data_clean.breed.unique()

array([None, 'doggo', 'puppo', 'pupper', 'floofer', 'doggo,puppo',
       'doggo,floofer', 'doggo,pupper'], dtype=object)

#### Test

In [308]:
tweet_archive_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2078 entries, 0 to 2355
Data columns (total 7 columns):
tweet_id              2078 non-null int64
timestamp             2078 non-null datetime64[ns]
text                  2078 non-null object
rating_numerator      2078 non-null float64
rating_denominator    2078 non-null int64
name                  1386 non-null object
breed                 336 non-null object
dtypes: datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 129.9+ KB


#### Quality for  tweet_image_data

#### Define
1. Drop the columns img_num,p1_dog,p2, p2_conf, p2_dog, p3, p3_conf,p3_dog
2. Rename the columns jpg_url, p1, p1_conf and p1_dog to descriptive column names

#### Code

In [309]:
tweet_image_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [310]:
tweet_image_data_clean.drop(['img_num','p1_dog','p2', 'p2_conf', 'p2_dog', 'p3', 
                  'p3_conf', 'p3_dog'], inplace = True, axis = 1)

In [311]:
tweet_image_data_clean.rename(columns = {'jpg_url': 'image_url', 
                               'p1': 'prediction', 
                               'p1_conf': 'confidence_level'}, inplace = True)

In [312]:
tweet_image_data_clean.head()

Unnamed: 0,tweet_id,image_url,prediction,confidence_level
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,Welsh_springer_spaniel,0.47
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,redbone,0.51
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,German_shepherd,0.6
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,Rhodesian_ridgeback,0.41
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,miniature_pinscher,0.56


#### Tidiness
Join the three data sets into one single data frame

In [313]:
tweet_archive_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2078 entries, 0 to 2355
Data columns (total 7 columns):
tweet_id              2078 non-null int64
timestamp             2078 non-null datetime64[ns]
text                  2078 non-null object
rating_numerator      2078 non-null float64
rating_denominator    2078 non-null int64
name                  1386 non-null object
breed                 336 non-null object
dtypes: datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 129.9+ KB


In [314]:
tweet_image_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 4 columns):
tweet_id            2075 non-null int64
image_url           2075 non-null object
prediction          2075 non-null object
confidence_level    2075 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 64.9+ KB


In [315]:
tweet_count_data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2340 entries, 0 to 2339
Data columns (total 3 columns):
tweet_id          2340 non-null int64
retweet_count     2340 non-null int64
favorite_count    2340 non-null int64
dtypes: int64(3)
memory usage: 54.9 KB


#### Code

In [316]:
# Merge the three data frames
twitter_archive_master = tweet_archive_data_clean.merge(tweet_count_data_clean,
                                                   on = 'tweet_id', how = 'inner')
twitter_archive_master = twitter_archive_master.merge(tweet_image_data_clean, 
                                            on = 'tweet_id', how = 'left')

##### Test

In [317]:
twitter_archive_master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2076 entries, 0 to 2075
Data columns (total 12 columns):
tweet_id              2076 non-null int64
timestamp             2076 non-null datetime64[ns]
text                  2076 non-null object
rating_numerator      2076 non-null float64
rating_denominator    2076 non-null int64
name                  1384 non-null object
breed                 336 non-null object
retweet_count         2076 non-null int64
favorite_count        2076 non-null int64
image_url             1950 non-null object
prediction            1950 non-null object
confidence_level      1950 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(4), object(5)
memory usage: 210.8+ KB


In [318]:
twitter_archive_master.query("tweet_id in ('890729181411237888','890240255349198849')")

Unnamed: 0,tweet_id,timestamp,text,rating_numerator,rating_denominator,name,breed,retweet_count,favorite_count,image_url,prediction,confidence_level
7,890729181411237888,2017-07-28 00:22:40,When you watch your owner call another dog a g...,13.0,10,,,18341,63867,https://pbs.twimg.com/media/DFyBahAVwAAhUTd.jpg,Pomeranian,0.57
9,890240255349198849,2017-07-26 15:59:51,This is Cassie. She is a college pup. Studying...,14.0,10,Cassie,doggo,7172,31193,https://pbs.twimg.com/media/DFrEyVuW0AAO3t9.jpg,Pembroke,0.51


In [319]:
tweet_image_data_clean.query("tweet_id in ('890729181411237888','890240255349198849')")

Unnamed: 0,tweet_id,image_url,prediction,confidence_level
2065,890240255349198849,https://pbs.twimg.com/media/DFrEyVuW0AAO3t9.jpg,Pembroke,0.51
2067,890729181411237888,https://pbs.twimg.com/media/DFyBahAVwAAhUTd.jpg,Pomeranian,0.57


In [320]:
tweet_count_data.query("tweet_id in ('890729181411237888','890240255349198849')")

Unnamed: 0,tweet_id,retweet_count,favorite_count
7,890729181411237888,18341,63867
9,890240255349198849,7172,31193


# Store 

In [321]:
# Store tables to files
tweet_count_data.to_csv('tweet_count_data_clean.csv', encoding='utf-8', index=False)
tweet_image_data_clean.to_csv('tweet_image_data_clean.csv', encoding='utf-8', index=False)
tweet_archive_data_clean.to_csv('tweet_archive_data_clean.csv', encoding='utf-8', index=False)
twitter_archive_master.to_csv('twitter_archive_master.csv', encoding='utf-8', index=False)

### Code for data Analysis and Visualization is in act_report.ipynb