# Data Wrangling

## Dataset - WeRateDogs&trade; Twitter Archive

***By Kartik Nanduri***<br>
**Dated: 21st Nov, 2018.**

In [1]:
# importing all the necessary libraries
import os
import pandas as pd
import requests as req

1. [x] **The file given at hand `twitter-archive-enhanced.csv`**

In [2]:
# all the required files for this project are in the list files_list
files_list = ['twitter-archive-enhanced.csv', 'image-predictions.tsv', 'tweet_json_raw.txt']

# directory that holds the data files; later cells build their paths as
# folder + '/' + <file name>. The downloads in this notebook are written
# relative to the working directory, so that is the directory used here.
folder = '.'

In [3]:
# load the twitter archive (first entry of files_list) into a DataFrame
archive = pd.read_csv(filepath_or_buffer = files_list[0])

# show five randomly chosen rows of the archive
archive.sample(n = 5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
1558,688789766343622656,,,2016-01-17 18:27:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Pubert. He's a Kerplunk Rumplestilt. Cann...,,,,https://twitter.com/dog_rates/status/688789766...,8,10,Pubert,,,,
585,800018252395122689,,,2016-11-19 16:49:49 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here's a doggo doin a struggle. 11/10 much det...,,,,https://twitter.com/dog_rates/status/800018252...,11,10,,doggo,,,
2338,666104133288665088,,,2015-11-16 04:02:55 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Not familiar with this breed. No tail (weird)....,,,,https://twitter.com/dog_rates/status/666104133...,1,10,,,,,
1627,684594889858887680,,,2016-01-06 04:38:35 +0000,"<a href=""http://twitter.com/download/iphone"" r...","""FOR THE LAST TIME I DON'T WANNA PLAY TWISTER ...",,,,https://twitter.com/dog_rates/status/684594889...,10,10,,,,,
465,817181837579653120,,,2017-01-06 01:31:47 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: Here's a pupper with squeaky hi...,8.159661e+17,4196984000.0,2017-01-02 17:00:46 +0000,https://twitter.com/dog_rates/status/815966073...,13,10,,,,pupper,


2. [x] **Fetching the data from url and saving it to local drive - `image-predictions.tsv`**

In [4]:
# downloading the image-predictions file with the requests library
url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"

# the timeout keeps the cell from hanging forever on a stalled connection
res = req.get(url, timeout = 30)

# stop on an HTTP error (4xx/5xx) instead of saving the error page as the dataset
res.raise_for_status()

# the file is saved in the working directory under the last path segment of the url
with open(url.split('/')[-1], mode = "wb") as op_file:
    op_file.write(res.content)

In [5]:
# sanity check: load the downloaded tab-separated file back and look at five random rows
img_pre_test = pd.read_csv(files_list[1], sep = "\t", encoding = 'utf-8')
img_pre_test.sample(n = 5)

# the rows parse into the expected columns, so the download worked

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1485,781661882474196992,https://pbs.twimg.com/media/CtkFS72WcAAiUrs.jpg,1,Pembroke,0.438087,True,golden_retriever,0.226954,True,collie,0.070652,True
143,668620235289837568,https://pbs.twimg.com/media/CUdqjvAWUAANfoU.jpg,1,crash_helmet,0.757942,False,toaster,0.037497,False,mouse,0.027271,False
100,667782464991965184,https://pbs.twimg.com/media/CURwm3cUkAARcO6.jpg,1,lorikeet,0.466149,False,hummingbird,0.083011,False,African_grey,0.054247,False
1463,778396591732486144,https://pbs.twimg.com/media/CcG07BYW0AErrC9.jpg,1,hippopotamus,0.581403,False,doormat,0.152445,False,sea_lion,0.026364,False
221,670093938074779648,https://pbs.twimg.com/media/CUym4Y5WsAEiI9_.jpg,1,toy_poodle,0.383346,True,miniature_poodle,0.153678,True,chow,0.138543,True


3. [x] **Getting data from Twitter&trade;**

In [6]:
# importing all the necessary libraries for accessing Twitter via API
import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer

In [None]:
# credentials for the Twitter API are read from environment variables so that real
# keys never have to be typed into (and saved with) the notebook; each one falls
# back to the placeholder 'x' when its variable is not set
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'x')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'x')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'x')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', 'x')

# OAuth handler carrying both the consumer and the access credentials
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# API client; the two rate-limit flags are passed through to tweepy unchanged
api = tweepy.API(auth_handler = auth,
                 wait_on_rate_limit = True, 
                 wait_on_rate_limit_notify = True)

In [None]:
# ids for which fetching (or writing) the tweet raised an exception
failed_ids = []

# note the time before the fetch begins
start = timer()

# every fetched tweet is written as one JSON document per line
with open(files_list[2], "w") as op_file:
    for tweet_id in archive['tweet_id']:
        try:
            page = api.get_status(tweet_id, tweet_mode = 'extended')
            json.dump(page._json, op_file)
            op_file.write('\n')
        except Exception as e:
            # report the failure and remember the id so it can be retried later
            print("Error for: {} - {}".format(tweet_id, e))
            failed_ids.append(tweet_id)

# note the time after the fetch has finished
end = timer()

# report the elapsed time in minutes, rounded to one decimal place
elapsed_mins = round((end - start)/60, 1)
print("It took about {} mins to fetch data from API".format(elapsed_mins))

In [None]:
# number of tweet ids whose fetch raised an error in the loop above
len(failed_ids)

# these are the tweets that could not be fetched; whether the failure lies with the
# API call or with Twitter itself cannot be told for sure. ;P

In [None]:
# number of tweets fetched successfully: the fetch loop attempted every id in
# archive['tweet_id'], so this is the archive size minus the failures
len(archive['tweet_id']) - len(failed_ids)

# the value shown above is how many tweets were successfully retrieved

In [None]:
# let's try once more to fetch the ids that failed in the first pass
failed_ids_2 = []

# starting the timer
start = timer()

# the retried tweets go to their own file, tweets_failed.txt, which the combine
# step reads together with tweet_json_raw.txt; a fresh handle is opened here
# because the handle used by the first fetch loop is already closed
with open('tweets_failed.txt', mode = "w") as op_file:
    for error in failed_ids:
        try:
            page = api.get_status(error, tweet_mode = 'extended')
            # page._json is the plain dict of the tweet; the Status object itself
            # cannot be serialised by json.dump
            json.dump(page._json, op_file)
            op_file.write('\n')
        except Exception as e:
            # report and record the id that was actually retried
            print("Error for: " + str(error) + " - " + str(e))
            failed_ids_2.append(error)

# ending the timer
end = timer()

# calculating the runtime for the fetch
print("It took about {} secs to fetch data from API".format(round((end - start), 2)))

**Tried multiple times to fetch data for the above `tweet_id` - `666020888022790149`, without success.**<br>
**It is possible that someone else running this notebook will not get this error.**

4. [x] ***Lastly, let's combine `tweets_failed.txt` and `tweet_json_raw.txt` into one file, `tweet_json.txt`***

In [None]:
# concatenating tweets_failed.txt and tweet_json_raw.txt into tweet_json.txt
filenames = ['tweets_failed.txt', 'tweet_json_raw.txt']

# note the time before the copy begins
start = timer()

combined_path = folder+'/'+'tweet_json.txt'
with open(combined_path, 'w') as outfile:
    for fname in filenames:
        # each input file is copied through line by line, in list order
        with open(folder+'/'+fname) as infile:
            outfile.writelines(infile)

# ending the timer
end = timer()

# report the elapsed time in seconds, rounded to two decimal places
elapsed_secs = round((end - start), 2)
print("It took about {} secs for appending".format(elapsed_secs))

In [None]:
# list the contents of the dataset folder to confirm which files are present
os.listdir(folder)

# As expected, we have all of our required files :)

In [None]:
# load the combined file (one JSON document per line) and summarise its columns
tweet_json_path = folder+'/'+'tweet_json.txt'
test_json = pd.read_json(tweet_json_path, lines = True, encoding = 'utf-8')
test_json.info()

In [None]:
# load a second line-delimited JSON file into the same variable and summarise it
# NOTE(review): tweet-json-1.txt is not written by any cell shown in this notebook —
# confirm where this file comes from before relying on this cell
test_json = pd.read_json(folder+'/'+'tweet-json-1.txt', lines = True, encoding = 'utf-8')
test_json.info()