# Main header #

## Introduction ##

## 1. Gathering data ##

The assignment calls for data to be gathered from three different data sources and initially stored in their own Pandas dataframes. To start, the necessary modules are imported.

In [6]:
# First step is to import the modules to be used
import pandas as pd
import numpy as np
import requests
import json
import os

### 1.1. Access WeRateDogs Twitter archive data ###

The first data to access is the WeRateDogs Twitter archive data stored in twitter-archive-enhanced.csv file.

In [7]:
#Use the .read_csv command to read in the file
df_csv=pd.read_csv('twitter-archive-enhanced.csv')

In [8]:
#Use df.head to have a quick look at the structure of the dataframe. Looks like its going to be a fun exercise!
df_csv.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


### 1.2. Access tweet image predictions ###

Thes second set of data to be accessed is the tweet image predictions which is contained in a .tsv file hosted on Udacity's servers and needs to be downloadeded programmatically using the Requests library and the following URL: https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv

In [9]:
#Use requests to access the tsv file programmatically
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response=requests.get(url)
with open (os.path.join("image_predictions"),mode='wb') as file:
    file.write(response.content)

In [10]:
#Use df.head to have a quick look at the structure of the dataframe.
df_image_predict=pd.read_csv('image_predictions', sep='\t')
df_image_predict.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


### 1.3. Access & query data from the Twitter API using Tweepy ###

With a Twitter development application still pending, I used the provided code to access the project data without the necessary access

In [6]:
# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file. 
# These are hidden to comply with Twitter's API terms and conditions

import tweepy 
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer

consumer_key = 'HIDDEN'
consumer_secret = 'HIDDEN'
access_token = 'HIDDEN'
access_secret = 'HIDDEN'

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True)

In [11]:
# NOTE TO REVIEWER: this student had mobile verification issues so the following
# Twitter API code was sent to this student from a Udacity instructor
# Tweet IDs for which to gather additional data via Twitter's API

tweet_ids = df_csv.tweet_id.values
len(tweet_ids)

2356

In [8]:
# Query Twitter's API for JSON data for each tweet ID in the Twitter archive

# count = 0
# fails_dict = {}
# start = timer()

# Save each tweet's returned JSON as a new line in a .txt file

# with open('tweet_json.txt', 'w') as outfile:
    # for tweet_id in tweet_ids:
        # count += 1
        # print(str(count) + ": " + str(tweet_id))
        # try:
            # tweet = api.get_status(tweet_id, tweet_mode='extended',wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
            # print("Success")
            # json.dump(tweet._json, outfile)
            # outfile.write('\n')
        # except tweepy.TweepError as e:
            # print("Fail")
            # fails_dict[tweet_id] = e
            # pass
# end = timer()
# print(end - start)
# print(fails_dict)

In [12]:
df_list=[]
with open('tweet_json.txt', 'r') as jfile:
    for item in jfile:
        json_data = json.loads(item)
        tweet_id=json_data['id']
        retweet= json_data['retweet_count']
        favorite_count=json_data['favorite_count']
        text=json_data['full_text']
        df_list.append({'tweet_id':tweet_id,'retweet':retweet,'favorite_count':favorite_count,'text':text})

        df_twit=pd.DataFrame(df_list,columns=['tweet_id','retweet','favorite_count','text'])
df_twit.head()

Unnamed: 0,tweet_id,retweet,favorite_count,text
0,892420643555336193,8853,39467,This is Phineas. He's a mystical boy. Only eve...
1,892177421306343426,6514,33819,This is Tilly. She's just checking pup on you....
2,891815181378084864,4328,25461,This is Archie. He is a rare Norwegian Pouncin...
3,891689557279858688,8964,42908,This is Darla. She commenced a snooze mid meal...
4,891327558926688256,9774,41048,This is Franklin. He would like you to stop ca...


## 2. Assess the data ##

Find at least 8 quality issues and 2 tidiness issues in the Wrange_act.ipynb. Display each piece of data visually, and use programmatic assessment.

### 2.1. For WeRateDogs Twitter archive data ###

I started by importing the twitter_enhanced_csv file into Excel in order to get a better visual sense of the data which is included in the zip file as twitter_archived_enhanced.xlsx A couple of things stood out for me from this preliminary assessment:
1. Some records in expanded urls have two urls - these need to be separated.
2. in_reply_to_status, in_reply_to_user_id are floats, and look like they should be integers or possibly strings
3. numerator_data has at least one error - line 51 of excel spreadsheet records a score of 5 when the 'text' column shows that the score was 13.5
4. There are some large numerators in the numerator_data column but I'm not going to be concerned with these as the supporting project information indicates that this is acceptable. However, these could be removed as outliers during cleaning.

In [53]:
# Visually having a look at the data
df_csv.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [14]:
#Having a preliminary scan of the data types and number of records
df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [15]:
# We can see that although there are 2356 data elements for doggo, 97 are 'doggo' objects whilst 2259 are classified as 'none'
df_csv['doggo'].value_counts()

None     2259
doggo      97
Name: doggo, dtype: int64

In [16]:
# We can see that although there are 2356 data elements for floofer, 10 are 'doggo' objects whilst 2346 are classified as 'none'
df_csv['floofer'].value_counts()

None       2346
floofer      10
Name: floofer, dtype: int64

In [17]:
# We can see that although there are 2356 data elements for pupper, 257 are 'doggo' objects whilst 2099 are classified as 'none'
df_csv['pupper'].value_counts()

None      2099
pupper     257
Name: pupper, dtype: int64

In [18]:
# We can see that although there are 2356 data elements for puppo, 30 are 'doggo' objects whilst 2326 are classified as 'none'
df_csv['puppo'].value_counts()

None     2326
puppo      30
Name: puppo, dtype: int64

In [19]:
# Dog classifications (doggo, floofer etc) are not comprehensive; there are only 394 classifications out of a possible 2356
types=97+10+257+30
types

394

In [20]:
# Check for duplications - there are zero dupes of tweet_ids in this dataframe
sum(df_csv['tweet_id'].duplicated())

0

In [21]:
# check veracity of names - there are 1399 names that are duplicated. This may not be a problem, but there could be quality issues in the name
sum(df_csv['name'].duplicated())

1399

In [22]:
# Check the different value counts in name - although we can't see all value counts, we can see there is missing data (745 records have no names, and 55 have 'a')
df_csv['name'].value_counts()

None           745
a               55
Charlie         12
Oliver          11
Cooper          11
              ... 
infuriating      1
Dudley           1
Boots            1
Freddery         1
Akumi            1
Name: name, Length: 957, dtype: int64

In [23]:
#Similarly can use sort_values to see another quality issue - some records have 'very'
df_csv['name'].sort_values()

1035     Abby
1021     Abby
938       Ace
1933     Acro
1327    Adele
        ...  
1031     very
773      very
1097     very
819      very
1385     very
Name: name, Length: 2356, dtype: object

In [24]:
# not the most elegant way of checking validity of all names, but we can see lots of 'None', and then right at the end,
# alot of words not capitalised that look as if they may have been part of the 'text' field or mistyped
name_check=df_csv['name'].sort_values().tolist()
print(name_check)

['Abby', 'Abby', 'Ace', 'Acro', 'Adele', 'Aiden', 'Aja', 'Akumi', 'Al', 'Albert', 'Albert', 'Albus', 'Albus', 'Aldrick', 'Alejandro', 'Alexander', 'Alexanderson', 'Alf', 'Alfie', 'Alfie', 'Alfie', 'Alfie', 'Alfie', 'Alfy', 'Alice', 'Alice', 'Amber', 'Ambrose', 'Amy', 'Amélie', 'Anakin', 'Anakin', 'Andru', 'Andy', 'Angel', 'Anna', 'Anthony', 'Antony', 'Apollo', 'Aqua', 'Archie', 'Archie', 'Archie', 'Archie', 'Arlen', 'Arlo', 'Arnie', 'Arnie', 'Arnie', 'Arnold', 'Arya', 'Ash', 'Ash', 'Asher', 'Ashleigh', 'Aspen', 'Aspen', 'Astrid', 'Astrid', 'Atlas', 'Atlas', 'Atticus', 'Atticus', 'Aubie', 'Augie', 'Autumn', 'Ava', 'Ava', 'Axel', 'Axel', 'Bailey', 'Bailey', 'Bailey', 'Bailey', 'Bailey', 'Bailey', 'Bailey', 'Baloo', 'Baloo', 'Balto', 'Balto', 'Banditt', 'Banjo', 'Barclay', 'Barney', 'Baron', 'Barry', 'Batdog', 'Bauer', 'Baxter', 'Baxter', 'Bayley', 'BeBe', 'Bear', 'Bear', 'Bear', 'Bear', 'Beau', 'Beau', 'Beau', 'Beau', 'Beckham', 'Beebop', 'Beemo', 'Bell', 'Bell', 'Bella', 'Bella', 'Bella

In [25]:
# there are no missing text descriptions, but that is not to say that what's in those text descriptions is correct!
sum(df_csv['text'].isnull())

0

In [60]:
#Checking for consistency of denominator
df_csv['rating_denominator'].value_counts()

10     2333
11        3
50        3
80        2
20        2
2         1
16        1
40        1
70        1
15        1
90        1
110       1
120       1
130       1
150       1
170       1
7         1
0         1
Name: rating_denominator, dtype: int64

<b>Identified quality and tidiness issues</b>
1. 'Source' has the url beginning <ahref= '
2. Timestamp has date and timestamp in one column (tidiness issue)
3. Timestamp's data type is an object not date/time
4. There is missing data for in_repy_to_status_id, in_reply_to_user_d, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp
5. 'None' is incorrectly classified for the columns doggo, floofer, pupper, and puppo
6. 'Doggo', 'floofer', 'pupper', and 'puppo' all have a separate column, but they are all one variable corresponding to a dogtionary defintion (tidiness issue)
7. There is incomplete information for the dogtionary defintions - only 394 records are recorded out of a possible 2356
8. There is missing and incorrect data in names
9. tweet_id is an integer but might better be served as a string
10. Some records in expanded urls have two urls - these need to be separated.
11. in_reply_to_status, in_reply_to_user_id are floats, and look like they should be integers or possibly strings
12. numerator_data has at least one error - line 51 of excel spreadsheet records a score of 5 when the 'text' column shows that the score was 13.5
13. rating_denominator has 23 records that don't conform to the standard of 10.
14. There are retweets that need to be accounted for

### 2.2. Image predictions ###

In [42]:
# Visually having a look at the data
df_image_predict.head(10)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True
5,666050758794694657,https://pbs.twimg.com/media/CT5Jof1WUAEuVxN.jpg,1,Bernese_mountain_dog,0.651137,True,English_springer,0.263788,True,Greater_Swiss_Mountain_dog,0.016199,True
6,666051853826850816,https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg,1,box_turtle,0.933012,False,mud_turtle,0.045885,False,terrapin,0.017885,False
7,666055525042405380,https://pbs.twimg.com/media/CT5N9tpXIAAifs1.jpg,1,chow,0.692517,True,Tibetan_mastiff,0.058279,True,fur_coat,0.054449,False
8,666057090499244032,https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg,1,shopping_cart,0.962465,False,shopping_basket,0.014594,False,golden_retriever,0.007959,True
9,666058600524156928,https://pbs.twimg.com/media/CT5Qw94XAAA_2dP.jpg,1,miniature_poodle,0.201493,True,komondor,0.192305,True,soft-coated_wheaten_terrier,0.082086,True


In [47]:
df_image_predict.tail(10)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
2065,890240255349198849,https://pbs.twimg.com/media/DFrEyVuW0AAO3t9.jpg,1,Pembroke,0.511319,True,Cardigan,0.451038,True,Chihuahua,0.029248,True
2066,890609185150312448,https://pbs.twimg.com/media/DFwUU__XcAEpyXI.jpg,1,Irish_terrier,0.487574,True,Irish_setter,0.193054,True,Chesapeake_Bay_retriever,0.118184,True
2067,890729181411237888,https://pbs.twimg.com/media/DFyBahAVwAAhUTd.jpg,2,Pomeranian,0.566142,True,Eskimo_dog,0.178406,True,Pembroke,0.076507,True
2068,890971913173991426,https://pbs.twimg.com/media/DF1eOmZXUAALUcq.jpg,1,Appenzeller,0.341703,True,Border_collie,0.199287,True,ice_lolly,0.193548,False
2069,891087950875897856,https://pbs.twimg.com/media/DF3HwyEWsAABqE6.jpg,1,Chesapeake_Bay_retriever,0.425595,True,Irish_terrier,0.116317,True,Indian_elephant,0.076902,False
2070,891327558926688256,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2,basset,0.555712,True,English_springer,0.22577,True,German_short-haired_pointer,0.175219,True
2071,891689557279858688,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1,paper_towel,0.170278,False,Labrador_retriever,0.168086,True,spatula,0.040836,False
2072,891815181378084864,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1,Chihuahua,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True
2073,892177421306343426,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1,Chihuahua,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True
2074,892420643555336193,https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,1,orange,0.097049,False,bagel,0.085851,False,banana,0.07611,False


We can already see that there are some discrepancies. For example, 'https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg' is indeed a box turtle, and is already highly predicted to not be a dog, but 'https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg' is classified as being a shopping cart, when actually its very much a dog (looks like a golden labrador) in a shopping cart (a false negative.)

In [33]:
# Use .info() to get a sense of the data types and non-null records
df_image_predict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [44]:
# Are there duplicate tweet_ids?
sum(df_image_predict['tweet_id'].duplicated())

0

There are no duplicate tweet_ids

In [46]:
# Are there duplicate jpg_urls?
sum(df_image_predict['jpg_url'].duplicated())

66

In [41]:
# To get a sense of image projections
df_image_predict['p1'].value_counts()

golden_retriever      150
Labrador_retriever    100
Pembroke               89
Chihuahua              83
pug                    57
                     ... 
slug                    1
coil                    1
dhole                   1
bee_eater               1
canoe                   1
Name: p1, Length: 378, dtype: int64

<b>Identified quality and tidiness issues</b>
1. Accuracy issues in predictions based on images
2. This dataframe should be combined with df_csv and redundant columns dropped (tidiness issue)

### 2.3. Dataframe for the 'tweet_json.txt' file ###

In [26]:
# Visually having a look at the data
df_twit.head()

Unnamed: 0,tweet_id,retweet,favorite_count,text
0,892420643555336193,8853,39467,This is Phineas. He's a mystical boy. Only eve...
1,892177421306343426,6514,33819,This is Tilly. She's just checking pup on you....
2,891815181378084864,4328,25461,This is Archie. He is a rare Norwegian Pouncin...
3,891689557279858688,8964,42908,This is Darla. She commenced a snooze mid meal...
4,891327558926688256,9774,41048,This is Franklin. He would like you to stop ca...


In [27]:
#Having a preliminary scan of the data types and number of records - we can see that there are two missing data elements (2354 vs 2356)
df_twit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tweet_id        2354 non-null   int64 
 1   retweet         2354 non-null   int64 
 2   favorite_count  2354 non-null   int64 
 3   text            2354 non-null   object
dtypes: int64(3), object(1)
memory usage: 73.7+ KB


In [30]:
# 'favorite'
df_twit['favorite_count'].value_counts()

0        179
610        3
345        3
2918       3
1691       3
        ... 
33345      1
814        1
23108      1
2630       1
8143       1
Name: favorite_count, Length: 2007, dtype: int64

<b>Identified quality and tidiness issues</b>
1. Two missing records
2. This dataframe should be combined with df_csv and redundant columns dropped (tidiness issue)

### References ###
1. Tweet objects from Twitter Development: https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object
2. Getting the json data into a dataframe: https://knowledge.udacity.com/questions/286527
3. A beginners guide to Tweepy by T. Boyle in Medium: https://towardsdatascience.com/my-first-twitter-app-1115a327349e