## Project: Wrangle and Analyze Data

In [1]:
# import standard libs
from pathlib import Path
import requests
import os
from dotenv import load_dotenv, find_dotenv
import tweepy
import json
# import python scientific stack
import pandas as pd
import numpy as np

# import visual tools
import matplotlib.pyplot as plt
%matplotlib inline

### Directories and file names where project files and data are stored

In [39]:
# Root data directory for Project_Wrangle_and_Analyze_Data.
# Set the WRANGLE_DATA_DIR environment variable (before starting the kernel)
# to run the notebook on another machine; when it is unset or empty the
# original local path below is used, so existing behavior is unchanged.
dirname = Path(
    os.environ.get('WRANGLE_DATA_DIR')
    or r'C:\Users\rj71b\RMIT Course\Project_Wrangle_and_Analyze_Data\data'
)

# folder (inside dirname) where raw data will be stored
raw_data_folder = 'raw'

# folder (inside dirname) where processed (i.e. cleaned) data will be stored
processed_data_folder = 'processed'

# file name of the WeRateDogs Twitter archive
twitter_archive = 'twitter-archive-enhanced.csv'

## Gathering Data for Project

#### WeRateDogs Twitter Archive

In [40]:
# Load the WeRateDogs Twitter archive CSV from the raw-data folder into a DataFrame
df_archive = pd.read_csv(dirname.joinpath(raw_data_folder, twitter_archive))

### Assess Data

In [41]:
# Preview the first rows of the archive to inspect the columns and sample values
df_archive.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [5]:
# Summarise column dtypes and non-null counts to spot missing values and wrong types
df_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [6]:
# Frequency of each value in the `name` column (bracket access makes it
# explicit that a column, not a DataFrame attribute, is being read)
df_archive['name'].value_counts()

None       745
a           55
Charlie     12
Oliver      11
Cooper      11
          ... 
Huxley       1
Rodman       1
Carbon       1
Oddie        1
Cheryl       1
Name: name, Length: 957, dtype: int64

#### Tweet image predictions

In [42]:
# Make sure the raw-data folder exists before files are downloaded into it.
# exist_ok=True keeps the cell idempotent and avoids the race between a
# separate "exists" check and the directory creation.
os.makedirs(dirname / raw_data_folder, exist_ok=True)

In [43]:
# URL of the tweet image predictions file (tab-separated values)
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'

# A timeout stops the cell from hanging forever on a stalled connection
response = requests.get(url, timeout=30)

# Fail here on an HTTP error (4xx/5xx) instead of saving an error page as the TSV
response.raise_for_status()

In [44]:
# Write the downloaded bytes to the raw-data folder, using the last segment
# of the URL as the file name
with Path(dirname, raw_data_folder, url.rsplit('/', 1)[-1]).open('wb') as file:
    file.write(response.content)

In [45]:
# Load the tab-separated image predictions saved in the raw-data folder
df_tweet_image = pd.read_csv(
    dirname.joinpath(raw_data_folder, 'image-predictions.tsv'),
    delimiter='\t',
)




In [46]:
# Display the image-predictions frame for visual assessment
# (the rendered output is abbreviated to the leading and trailing rows)
df_tweet_image

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.072010,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2070,891327558926688256,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2,basset,0.555712,True,English_springer,0.225770,True,German_short-haired_pointer,0.175219,True
2071,891689557279858688,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1,paper_towel,0.170278,False,Labrador_retriever,0.168086,True,spatula,0.040836,False
2072,891815181378084864,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1,Chihuahua,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True
2073,892177421306343426,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1,Chihuahua,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True


In [12]:
# Summarise column dtypes and non-null counts of the image predictions
df_tweet_image.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


## Query Twitter Data

In [13]:
# Locate the .env file by walking up parent directories until it is found
dotenv_path = find_dotenv()

# Load its entries as environment variables
load_dotenv(dotenv_path)

# Twitter API credentials come from the environment so that they are never
# written into the notebook itself
consumer_key = os.environ.get("CONSUMER_KEY")
consumer_secret = os.environ.get("CONSUMER_SECRET")
access_token = os.environ.get("ACCESS_TOKEN")
access_secret = os.environ.get("ACCESS_SECRET")

# Fail fast with a clear message: a missing variable would otherwise be passed
# on as None and only surface later as an error on every API request
missing_credentials = [
    env_name
    for env_name, env_value in (
        ("CONSUMER_KEY", consumer_key),
        ("CONSUMER_SECRET", consumer_secret),
        ("ACCESS_TOKEN", access_token),
        ("ACCESS_SECRET", access_secret),
    )
    if not env_value
]
if missing_credentials:
    raise RuntimeError(
        "Missing Twitter API credentials; set these variables in the "
        "environment or in a .env file: " + ", ".join(missing_credentials)
    )

In [14]:
# Build the tweepy OAuth handler from the consumer key/secret and attach the
# user's access token/secret to it
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# API client used for all Twitter queries below.
# NOTE(review): the two wait_on_rate_limit* flags presumably make tweepy sleep
# and print a notice when the rate limit is reached — confirm against the
# installed tweepy version, since these keyword arguments are version-specific.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

Resources:

- [Reading and Writing JSON to a File in Python](https://stackabuse.com/reading-and-writing-json-to-a-file-in-python/)

In [19]:
def _tweep_error_message(error):
    """Return the Twitter API error message carried by ``error``.

    Falls back to ``str(error)`` when the exception has no usable response
    body (``error.response`` is None, is not JSON, or lacks the expected
    ``errors[0].message`` entry), so the error handler itself cannot crash.
    """
    try:
        return error.response.json()['errors'][0]['message']
    except (AttributeError, ValueError, KeyError, IndexError, TypeError):
        return str(error)


# errors: "<position>:<tweet_id>" -> error message for tweets that could not be fetched
# data:   raw JSON payload of every tweet that was fetched successfully
errors = {}
data = []
tweet_id_list = df_archive.tweet_id.to_list()

# Make sure the output folder exists before the long-running query starts
os.makedirs(os.path.join(dirname, raw_data_folder), exist_ok=True)

try:
    for tweet_count, tweet_id in enumerate(tweet_id_list, 1):
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
        except tweepy.TweepError as e:
            # Log the failure and carry on with the next tweet
            print(str(tweet_count) + ":" + str(tweet_id) + ":" + str(e))
            errors[str(tweet_count) + ":" + str(tweet_id)] = _tweep_error_message(e)
        else:
            data.append(tweet._json)
finally:
    # Write each file once, instead of rewriting the whole file after every
    # tweet. The finally clause still saves partial results if the loop is
    # interrupted. As before, a file is only written when there is content.
    if data:
        with open(os.path.join(dirname, raw_data_folder, 'tweet_json.txt'), 'w') as outfile:
            json.dump(data, outfile)
    if errors:
        with open(os.path.join(dirname, raw_data_folder, 'status_error.txt'), 'w') as errorfile:
            json.dump(errors, errorfile)
        

20:888202515573088257:[{'code': 144, 'message': 'No status found with that ID.'}]
96:873697596434513921:[{'code': 144, 'message': 'No status found with that ID.'}]
102:872668790621863937:[{'code': 144, 'message': 'No status found with that ID.'}]
105:872261713294495745:[{'code': 144, 'message': 'No status found with that ID.'}]
119:869988702071779329:[{'code': 144, 'message': 'No status found with that ID.'}]
133:866816280283807744:[{'code': 144, 'message': 'No status found with that ID.'}]
156:861769973181624320:[{'code': 144, 'message': 'No status found with that ID.'}]
183:856602993587888130:[{'code': 144, 'message': 'No status found with that ID.'}]
212:851953902622658560:[{'code': 144, 'message': 'No status found with that ID.'}]
248:845459076796616705:[{'code': 144, 'message': 'No status found with that ID.'}]
254:844704788403113984:[{'code': 144, 'message': 'No status found with that ID.'}]
261:842892208864923648:[{'code': 144, 'message': 'No status found with that ID.'}]
297:83

### Read in the JSON file

In [47]:
# Records (one dict per tweet) that will be turned into a pandas DataFrame
df_list = []

# Load every tweet payload saved by the API query step
with Path(dirname, raw_data_folder, 'tweet_json.txt').open() as json_file:
    data = json.load(json_file)

# Keep only the id and the two engagement counts of each tweet
for tweet in data:
    df_list.append(dict(tweet_id=tweet['id'],
                        retweet_count=tweet['retweet_count'],
                        favorite_count=tweet['favorite_count']))

In [48]:
# Build the engagement table; the explicit column list fixes the column order
df_tweet_json = pd.DataFrame(
    data=df_list,
    columns=['tweet_id', 'retweet_count', 'favorite_count'],
)

In [49]:
# Summarise dtypes and non-null counts of the retweet/favorite table
df_tweet_json.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2333 entries, 0 to 2332
Data columns (total 3 columns):
tweet_id          2333 non-null int64
retweet_count     2333 non-null int64
favorite_count    2333 non-null int64
dtypes: int64(3)
memory usage: 54.8 KB


In [50]:
# Inspect the raw JSON payload of the second loaded tweet to see which fields are available
data[1]

{'created_at': 'Tue Aug 01 00:17:27 +0000 2017',
 'id': 892177421306343426,
 'id_str': '892177421306343426',
 'full_text': "This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV",
 'truncated': False,
 'display_text_range': [0, 138],
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [],
  'urls': [],
  'media': [{'id': 892177413194625024,
    'id_str': '892177413194625024',
    'indices': [139, 162],
    'media_url': 'http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg',
    'url': 'https://t.co/0Xxu71qeIV',
    'display_url': 'pic.twitter.com/0Xxu71qeIV',
    'expanded_url': 'https://twitter.com/dog_rates/status/892177421306343426/photo/1',
    'type': 'photo',
    'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
     'medium': {'w': 1055, 'h': 1200, 'resize': 'fit'},
     'small': {'