# Data Wrangling

## Dataset - WeRateDogs&trade; Twitter Archive

***By Kartik Nanduri***<br>
**Dated: 21st Nov, 2018.**

In [1]:
# importing all the necessary libraries
import os
import pandas as pd
import requests as req
import re

1. [x] **The file already on hand: `twitter-archive-enhanced.csv`**

In [2]:
# every file required for this project, in the order the notebook obtains them
files_list = [
    'twitter-archive-enhanced.csv',
    'image-predictions.tsv',
    'tweet_json.txt',
]

In [3]:
# loading the twitter archive (first entry of files_list) into a dataframe
archive = pd.read_csv(filepath_or_buffer = files_list[0])

# previewing five randomly picked rows of the archive
archive.sample(n = 5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
393,825876512159186944,,,2017-01-30 01:21:19 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Mo. No one will push him around in the...,,,,https://twitter.com/dog_rates/status/825876512...,11,10,Mo,,,,
1323,706265994973601792,,,2016-03-05 23:51:49 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Kara. She's been trying to solve that ...,,,,https://twitter.com/dog_rates/status/706265994...,11,10,Kara,,,,
556,803638050916102144,,,2016-11-29 16:33:36 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Pupper hath acquire enemy. 13/10 https://t.co/...,,,,https://twitter.com/dog_rates/status/803638050...,13,10,,,,pupper,
863,762471784394268675,,,2016-08-08 02:13:34 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Glenn. Being in public scares him. Fright...,,,,https://twitter.com/dog_rates/status/762471784...,12,10,Glenn,,,,
192,855818117272018944,,,2017-04-22 16:18:34 +0000,"<a href=""http://twitter.com/download/iphone"" r...",I HEARD HE TIED HIS OWN BOWTIE MARK AND HE JUS...,,,,https://twitter.com/markhalperin/status/855656...,13,10,,,,,


2. [x] **Fetching the data from a URL and saving it to the local drive - `image-predictions.tsv`**

In [4]:
# downloading the image-predictions file from the internet using the requests library
url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"

# a timeout keeps the cell from hanging forever on a stalled connection
res = req.get(url, timeout = 60)

# stop on a 4xx/5xx answer, otherwise the error page would be saved as if it were the dataset
res.raise_for_status()

# the local file is named after the last path segment of the url
with open(url.split('/')[-1], mode = "wb") as op_file:
    op_file.write(res.content)

In [5]:
# sanity check: load the file that was just downloaded and preview five random rows
img_pre_test = pd.read_csv(files_list[1], sep = "\t", encoding = 'utf-8')
img_pre_test.sample(n = 5)

# the tab-separated file parses into a dataframe, so the download worked

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1404,769695466921623552,https://pbs.twimg.com/media/Cq6B8V6XYAA1T1R.jpg,1,pug,0.407117,True,muzzle,0.165638,False,kuvasz,0.045837,True
542,677228873407442944,https://pbs.twimg.com/media/CWYAEINW4AIuw8P.jpg,1,common_iguana,0.566338,False,tennis_ball,0.154646,False,green_lizard,0.044976,False
508,676098748976615425,https://pbs.twimg.com/media/CWH8L72UkAAvjql.jpg,1,walking_stick,0.162179,False,sandal,0.129086,False,purse,0.081412,False
138,668537837512433665,https://pbs.twimg.com/media/CUcfnWlWsAAzlwE.jpg,1,Lakeland_terrier,0.372988,True,toy_poodle,0.250445,True,Chihuahua,0.189737,True
537,676948236477857792,https://pbs.twimg.com/media/CWUA1GFW4AAowiq.jpg,1,guenon,0.611603,False,macaque,0.135176,False,squirrel_monkey,0.083247,False


3. [x] **Getting data from Twitter&trade;**

In [6]:
# importing all the necessary libraries for accessing Twitter via API
import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer

In [7]:
# setting up the credentials for the Twitter API
# they are read from environment variables so that real keys never have to be typed
# into (and saved with) the notebook; the 'xxx.xxx.xxx' placeholders are used only
# when a variable is not set
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'xxx.xxx.xxx')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'xxx.xxx.xxx')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'xxx.xxx.xxx')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', 'xxx.xxx.xxx')

# authenticating with the consumer key pair, then attaching the access token pair
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# JSONParser makes the api hand back plain parsed JSON, which fetch_and_save dumps with json
# the two wait_on_rate_limit options make the client sleep (and say so) when the rate limit is hit
api = tweepy.API(auth_handler = auth,
                 parser = tweepy.parsers.JSONParser(),
                 wait_on_rate_limit = True, 
                 wait_on_rate_limit_notify = True)

In [9]:
def fetch_and_save(ids, api_ins, one_id = None):
    '''
    Fetch tweets from Twitter and save them to a text file, one JSON document per line.

    The output file is files_list[2] when that file does not exist yet. Otherwise a
    numbered sibling (for example tweet_json_1.txt) is created, and a name that is
    already taken is skipped, so an earlier download is never overwritten.

    Parameters:
    ids (List Object): tweet ids to query; not used when one_id is given
    api_ins (Tweepy Object): api object instance, will be used to query twitter for data
    one_id (int): use when you want to query only for one tweet

    Returns:
    failed_ids (List Object): the ids whose request raised an error, in the order they
    were tried, so that this function can be called once again on those ids
    '''
    failed_ids = []

    # choosing the output file
    if os.path.exists(files_list[2]):
        stem = files_list[2].split('.')[0]

        # numbering starts at the count of tweet_json* files already in the working directory
        index = len([s for s in os.listdir() if "tweet_json" in s])
        new_file_name = stem + "_" + str(index) + ".txt"

        # that count can point at an existing file (e.g. after an earlier one was removed),
        # so move on until the name is free instead of overwriting saved tweets
        while os.path.exists(new_file_name):
            index += 1
            new_file_name = stem + "_" + str(index) + ".txt"
    else:
        new_file_name = files_list[2]

    # a single id is handled as a one-element list, so both cases share one loop
    targets = ids if one_id is None else [one_id]

    with open(new_file_name, mode = 'w') as outfile:
        for tweet_id in targets:
            try:
                page = api_ins.get_status(tweet_id, tweet_mode='extended')

                # serialising before writing keeps a half-written line out of the file
                # when the response cannot be encoded
                outfile.write(json.dumps(page) + '\n')

            # best effort on purpose: whatever goes wrong for one id is reported and
            # the id is collected, so the remaining ids are still fetched
            except Exception as e:
                print("Error for: " + str(tweet_id) + " - " + str(e))
                failed_ids.append(tweet_id)

    return failed_ids

In [10]:
# starting the timer
start = timer()

# collecting every tweet id of the archive into a list for the function fetch_and_save()
tweet_ids = archive['tweet_id'].tolist()

# fetching data, 1st iteration - test_one receives the ids that could not be fetched
test_one = fetch_and_save(tweet_ids, api)

# ending the timer
end = timer()

# reporting the elapsed time (building the id list included) in minutes, rounded to one decimal
print("That took about {} mins.".format(round((end - start)/60, 1)))

Error for: 888202515573088257 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 873697596434513921 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 872668790621863937 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 869988702071779329 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 866816280283807744 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 861769973181624320 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 845459076796616705 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 842892208864923648 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 837012587749474308 - [{'code': 144, 'message': 'No status found with that ID.'}]


Rate limit reached. Sleeping for: 399


Error for: 831926988323639298 - Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
Error for: 827228250799742977 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 802247111496568832 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 775096608509886464 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 770743923962707968 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 754011816964026368 - [{'code': 144, 'message': 'No status found with that ID.'}]


Rate limit reached. Sleeping for: 501


Error for: 711652651650457602 - Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))


Rate limit reached. Sleeping for: 508


Error for: 669749430875258880 - Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))
That took about 42.0 mins.


In [11]:
# number of tweet ids that could not be fetched in the 1st iteration
print(
    "we have about {} failed requests."
    .format(len(test_one))
)

we have about 17 failed requests.


In [12]:
# a set drops repeated ids, so equal sizes mean test_one holds every id only once
assert len(set(test_one)) == len(test_one)

# the assertion passing means there are no duplicates

In [13]:
# passing the list of failed ids back to the fetch_and_save function

# starting the timer
start = timer()

# fetching data, 2nd iteration - only the ids that failed in the 1st iteration are retried
test_two = fetch_and_save(test_one, api)

# ending the timer
end = timer()

# reporting the elapsed time of fetch_and_save in seconds, rounded to one decimal
print("That took about {} secs.".format(round(end - start, 1)))

Error for: 888202515573088257 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 873697596434513921 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 872668790621863937 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 869988702071779329 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 866816280283807744 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 861769973181624320 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 845459076796616705 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 842892208864923648 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 837012587749474308 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 827228250799742977 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 802247111496568832 - [{'code': 144, 'message': 'No status found with 

In [14]:
# number of tweet ids that still could not be fetched in the 2nd iteration
print(
    "we have about {} failed requests."
    .format(len(test_two))
)

we have about 14 failed requests.


In [15]:
# checking if test_two (the ids that failed again in the 2nd iteration) has duplicate ids
# test_one was already checked after the 1st iteration, so this check is for the new list
assert len(test_two) == len(set(test_two))

# the assertion passing means there are no duplicates

In [16]:
# passing the list of ids that failed twice back to the fetch_and_save function

# starting the timer
start = timer()

# fetching data, 3rd iteration - only the ids that failed in the 2nd iteration are retried
test_three = fetch_and_save(test_two, api)

# ending the timer
end = timer()

# reporting the elapsed time of fetch_and_save in seconds, rounded to one decimal
print("That took about {} secs.".format(round(end - start, 1)))

Error for: 888202515573088257 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 873697596434513921 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 872668790621863937 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 869988702071779329 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 866816280283807744 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 861769973181624320 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 845459076796616705 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 842892208864923648 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 837012587749474308 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 827228250799742977 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 802247111496568832 - [{'code': 144, 'message': 'No status found with 

In [17]:
# checking if the length of test_three equals the length of test_two; if they are the same,
# retrying did not recover anything - as per Twitter's words those tweets don't exist any more

assert len(test_three) == len(test_two)

<img src="error.png">

In [18]:
# but to make sure, let's try it for the 4th time

# starting the timer
start = timer()

# fetching data, 4th iteration - only the ids that failed in the 3rd iteration are retried
test_four = fetch_and_save(test_three, api)

# ending the timer
end = timer()

# reporting the elapsed time of fetch_and_save in seconds, rounded to one decimal
print("That took about {} secs.".format(round(end - start, 1)))

Error for: 888202515573088257 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 873697596434513921 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 872668790621863937 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 869988702071779329 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 866816280283807744 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 861769973181624320 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 845459076796616705 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 842892208864923648 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 837012587749474308 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 827228250799742977 - [{'code': 144, 'message': 'No status found with that ID.'}]
Error for: 802247111496568832 - [{'code': 144, 'message': 'No status found with 

In [19]:
# the 2nd, 3rd and 4th iterations must all have failed for the same number of ids
assert len(test_two) == len(test_three) == len(test_four)

***<span style="color:red">Important: uncomment the following lines so that there is no error.</span>***

In [None]:
# removing the tweet_json files that were created by the 3rd and 4th fetch iterations
# (test_three and test_four); the lines stay commented out until they are needed
#files = ['tweet_json_2.txt', 'tweet_json_3.txt']
#for file in files:
#    os.remove(file)

4. [x] **Okay, let's combine the successfully fetched JSON files into one file called `tweet_json_master.txt`**

In [25]:
# loading the two files that hold the successfully fetched tweets, one JSON document per line
json_1 = pd.read_json('tweet_json.txt', encoding = 'utf-8', lines = True)
json_2 = pd.read_json('tweet_json_1.txt', encoding = 'utf-8', lines = True)

# shapes of both frames - their row counts add up to the rows the combined dataframe must have
(json_1.shape, json_2.shape)

((2339, 32), (3, 27))

In [40]:
# stacking both frames under a fresh 0..n-1 index
# join = 'outer' keeps every column found in either frame
json_master = pd.concat(
    [json_1, json_2],
    join = 'outer',
    sort = True,
    ignore_index = True,
)

# writing the combined frame to the master file, one JSON record per line
json_master.to_json('tweet_json_master.txt', lines = True, orient = 'records')

5. [x] **Last thing to do is to tidy up our folder, let's get going.**

In [47]:
# moving all data files under one folder - dataset
# removing the temporary files, that acted as placeholders

# creating the folder; nothing happens when it is already there
folder = 'dataset'
os.makedirs(folder, exist_ok = True)

# we know that our master datasets for this project are
# 1. twitter-archive-enhanced.csv
# 2. image-predictions.tsv
# 3. tweet_json_master.txt
# let us move these files

# updating our files_list so that its last entry names the master json file
files_list[-1] = 'tweet_json_master.txt'

# moving only the required files that are present in the working directory
# os.replace overwrites a copy left in the folder by an earlier run on every platform,
# whereas os.rename raises on Windows when the destination already exists
for file in files_list:
    if os.path.exists(file):
        os.replace(file, os.path.join(folder, file))

# removing the tweet_json and tweet_json_1 files as they are not required anymore
# files that are already gone are skipped, so running this cell again does not fail
for file in ['tweet_json.txt', 'tweet_json_1.txt']:
    if os.path.exists(file):
        os.remove(file)

# listing the current directory
os.listdir()

['.git',
 '.ipynb_checkpoints',
 'dataset',
 'error.png',
 'New Text Document.txt',
 'README.md',
 'wrangle_act.ipynb']