# Data Wrangling

## Dataset - WeRateDogs&trade; Twitter Archive

***By Kartik Nanduri***<br>
**Dated: 21st Nov, 2018.**

In [1]:
# importing all the necessary libraries
import os
import pandas as pd
import requests as req

1. [x] **The file given at hand `twitter-archive-enhanced.csv`**

In [2]:
# all the required files for this project are in the list files_list
files_list = ['twitter-archive-enhanced.csv', 'image-predictions.tsv', 'tweet_json_raw.txt']

# dataset folder
folder = 'dataset'

# make sure the dataset folder exists before moving or listing anything;
# without it os.rename and os.listdir below raise FileNotFoundError
os.makedirs(folder, exist_ok=True)

# if any of the data files are still in the parent directory,
# move them under the dataset folder
for file in files_list:
    if os.path.exists(file):
        os.rename(file, os.path.join(folder, file))

# printing the folder content
print("All data files have been moved to folder : " + folder + "\nListing the contents of folder : " + 
      folder +"\n", os.listdir(folder))

All data files have been moved to folder : dataset
Listing the contents of folder : dataset
 ['twitter-archive-enhanced.csv']


In [3]:
# load the twitter archive CSV (first entry of files_list) from the dataset folder
archive = pd.read_csv("{}/{}".format(folder, files_list[0]))

# display five randomly chosen rows of the archive
archive.sample(n=5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
2091,670783437142401025,,,2015-11-29 01:56:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Flamboyant pup here. Probably poisonous. Won't...,,,,https://twitter.com/dog_rates/status/670783437...,1,10,,,,,
892,759447681597108224,,,2016-07-30 17:56:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Oakley. He has no idea what happened h...,,,,https://twitter.com/dog_rates/status/759447681...,11,10,Oakley,,,,
2100,670704688707301377,,,2015-11-28 20:43:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Danny. He's too good to look at the road ...,,,,https://twitter.com/dog_rates/status/670704688...,6,10,Danny,,,,
2112,670435821946826752,,,2015-11-28 02:55:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Two unbelievably athletic dogs here. Great for...,,,,https://twitter.com/dog_rates/status/670435821...,10,10,,,,,
2326,666411507551481857,,,2015-11-17 00:24:19 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is quite the dog. Gets really excited whe...,,,,https://twitter.com/dog_rates/status/666411507...,2,10,quite,,,,


2. [x] **Fetching the data from url and saving it to local drive - `image-predictions.tsv`**

In [4]:
# remote location of the image-predictions file
url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"

# only hit the network when the file is not already in the dataset folder
if os.path.exists(os.path.join(folder, files_list[1])):
    print("File Exists")
else:
    # fetching the file from the internet using the requests library
    res = req.get(url)
    # fail loudly on an HTTP error instead of saving an error page as the .tsv
    res.raise_for_status()

    # saving the contents of the response to local drive, named after the
    # last path segment of the url
    with open(os.path.join(folder, url.split('/')[-1]), mode = "wb") as op_file:
        op_file.write(res.content)

In [5]:
# read the downloaded predictions file back in as tab-separated data
# to confirm the download produced a parseable file
img_pre_test = pd.read_csv("{}/{}".format(folder, files_list[1]), sep="\t")
img_pre_test.sample(n=5)

# the file parsed and the sampled rows display, so the download worked

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1514,786664955043049472,https://pbs.twimg.com/media/CurLmoqXgAEPoJ-.jpg,1,Leonberg,0.512034,True,keeshond,0.464816,True,Pomeranian,0.007812,True
582,678798276842360832,https://pbs.twimg.com/media/CWuTbAKUsAAvZHh.jpg,1,Airedale,0.583122,True,silky_terrier,0.129567,True,Lakeland_terrier,0.094727,True
1089,719332531645071360,https://pbs.twimg.com/media/CfuVGl3WEAEKb16.jpg,1,Dandie_Dinmont,0.224415,True,miniature_poodle,0.204882,True,Norfolk_terrier,0.090633,True
98,667766675769573376,https://pbs.twimg.com/media/CURiQMnUAAAPT2M.jpg,1,fire_engine,0.883493,False,tow_truck,0.074734,False,jeep,0.012773,False
1505,785264754247995392,https://pbs.twimg.com/media/CuXSHNnWcAIWEwn.jpg,1,teddy,0.674893,False,cradle,0.05674,False,chow,0.056137,True


3. [x] **Getting data from Twitter&trade;**

In [6]:
# importing all the necessary libraries for accessing Twitter via API
import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer

In [7]:
# API credentials are read from environment variables so that real secrets
# never have to be typed into (and saved with) the notebook; the 'x'
# placeholders are kept as the fallback when a variable is not set
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'x')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'x')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'x')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', 'x')

# OAuth handler built from the consumer pair, then given the access pair
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# API client: JSONParser is passed as the parser, and both rate-limit
# options (wait and notify) are switched on
api = tweepy.API(auth_handler = auth,
                 parser = tweepy.parsers.JSONParser(),
                 wait_on_rate_limit = True, 
                 wait_on_rate_limit_notify = True)

In [None]:
# ids of every tweet in the archive; each one is requested from the API below
tweet_ids = archive['tweet_id']

# empty list to save all the erroneous ids
failed_ids = []

# final location of the raw tweet JSON, plus a scratch file used while fetching
raw_json_path = os.path.join(folder, files_list[2])
partial_json_path = raw_json_path + '.part'

# starting the timer
start = timer()

# checking if the dataset folder already has the raw tweet JSON file
if os.path.exists(raw_json_path):
    print("File exists")
else:
    # write to the scratch file first: if the run is interrupted part-way,
    # no truncated file is left at raw_json_path to be mistaken for a
    # complete fetch by the existence check above
    with open(partial_json_path, mode = "w") as op_file:
        for tweet_id in tweet_ids:
            try:
                # one JSON document per line
                page = api.get_status(tweet_id, tweet_mode = 'extended')
                json.dump(page, op_file)
                op_file.write('\n')
            except Exception as e:
                # best-effort fetch: record the id and carry on with the rest
                print("Error for: " + str(tweet_id) + " - " + str(e))
                failed_ids.append(tweet_id)

    # the loop ran to completion, so promote the scratch file to its final name
    os.replace(partial_json_path, raw_json_path)

# ending the timer
end = timer()

# calculating the runtime for the fetch
print("It took about {} mins to fetch data from API".format(round((end - start)/60, 0)))

In [None]:
# number of tweet ids that were appended to failed_ids because their fetch raised an error
len(failed_ids)

# these are the tweets the API call failed on; the count alone cannot tell
# whether the cause lies with our request or with Twitter ;P

In [None]:
# number of tweet ids that did not end up in failed_ids
len(tweet_ids) - len(failed_ids)

# i.e. the count of tweets that were retrieved successfully when the fetch loop ran

In [None]:
# let's try once more to fetch the ids that failed the first time;
# ids that fail again are collected in failed_ids_2
failed_ids_2 = []

# starting the timer
start = timer()

with open(os.path.join(folder, 'twitter_failed.txt'), mode = "w") as op_file:
    for failed_id in failed_ids:
        try:
            # one JSON document per line, same layout as the raw tweet file
            page = api.get_status(failed_id, tweet_mode = 'extended')
            json.dump(page, op_file)
            op_file.write('\n')
        except Exception as e:
            # report and record the id being retried in this iteration
            # (not the leftover tweet_id variable from the earlier fetch loop)
            print("Error for: " + str(failed_id) + " - " + str(e))
            failed_ids_2.append(failed_id)

# ending the timer
end = timer()

# calculating the runtime for the fetch
print("It took about {} mins to fetch data from API".format(round((end - start)/60, 0)))

**Tried multiple times to fetch data for the above `tweet_id` - `666020888022790149`, without success.**<br>
**Someone else running this notebook may not hit this error.**

4. [x] ***Lastly, let's combine `twitter_failed.txt` and `tweet_json_raw.txt` into one file, `tweet_json.txt`***

In [None]:
# concatenating twitter_failed.txt and tweet_json_raw.txt, in that order,
# into tweet_json.txt
filenames = ['twitter_failed.txt', 'tweet_json_raw.txt']
with open(os.path.join(folder, 'tweet_json.txt'), 'w') as outfile:
    for fname in filenames:
        # both inputs were written under the dataset folder, so they must be
        # opened from there rather than from the current working directory
        with open(os.path.join(folder, fname)) as infile:
            for line in infile:
                outfile.write(line)

In [None]:
# list the contents of the dataset folder
os.listdir(folder)

# As expected, we have all of our required files :)