# Data Wrangling

## Dataset - WeRateDogs&trade; Twitter Archive

***By: Kartik Nanduri***<br>
**Date: 21st Nov, 2018.**

## Gathering

In [1]:
# importing all the necessary libraries
import os
import pandas as pd
import requests as req
import re

***<span style="color: red">Important! Uncomment the following lines to run the notebook without errors.</span>***

In [2]:
# resetting the folder structure.
#os.rename('dataset/twitter-archive-enhanced.csv', 'twitter-archive-enhanced.csv')
#import shutil
#shutil.rmtree('dataset')

***<span style="color: green">Important! Once done, please comment those lines out again.</span>***

1. [x] **The file given at hand `twitter-archive-enhanced.csv`**

In [3]:
# names of all the data files required for this project, in the order:
# twitter archive, image predictions, tweet json dump
files_list = [
    'twitter-archive-enhanced.csv',
    'image-predictions.tsv',
    'tweet_json.txt',
]

In [4]:
# loading the given twitter archive, which is the first entry of files_list
archive = pd.read_csv(filepath_or_buffer = files_list[0])

# peeking at one randomly picked row of the archive
archive.sample(n = 1)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
622,796080075804475393,,,2016-11-08 20:00:55 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Yogi. He's 98% floof. Snuggable af. 12...,,,,https://twitter.com/dog_rates/status/796080075...,12,10,Yogi,,,,


2. [x] **Fetching the data from url and saving it to local drive - `image-predictions.tsv`**

In [5]:
# reading the file from internet using the requests library
url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"

# the timeout keeps this cell from waiting forever on a stalled connection
res = req.get(url, timeout = 30)

# stop right here on a 4xx/5xx answer, otherwise the body of the error
# response would get saved to disk as if it were the dataset
res.raise_for_status()

# the local file name is the last part of the url
with open(url.split('/')[-1], mode = "wb") as op_file:
    op_file.write(res.content)

In [6]:
# sanity check - the downloaded file should load as a tab separated table
img_pre_test = pd.read_csv(files_list[1], sep = "\t", encoding = 'utf-8')
img_pre_test.sample(n = 2)

# two random rows show up as expected, so the download worked.

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
95,667550904950915073,https://pbs.twimg.com/media/CUOb_gUUkAACXdS.jpg,1,web_site,0.999335,False,vizsla,8.1e-05,True,collie,6.9e-05,True
1337,758405701903519748,https://pbs.twimg.com/media/CoZl9fXWgAMox0n.jpg,4,Chesapeake_Bay_retriever,0.702954,True,laptop,0.092277,False,notebook,0.032727,False


3. [x] **Getting data from Twitter&trade;**

In [7]:
# importing all the necessary libraries for accessing Twitter via API
import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer

In [None]:
# setting up all the necessary placeholders for API
# the credentials are looked up in environment variables, so that real keys
# never have to be typed into (and saved with) the notebook;
# the 'xxx.xxx.xxx' placeholders are used whenever a variable is not set
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'xxx.xxx.xxx')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'xxx.xxx.xxx')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'xxx.xxx.xxx')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', 'xxx.xxx.xxx')

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# JSONParser is passed as the parser for the API responses;
# the wait_on_rate_limit flags are passed through to tweepy unchanged
api = tweepy.API(auth_handler = auth,
                 parser = tweepy.parsers.JSONParser(),
                 wait_on_rate_limit = True, 
                 wait_on_rate_limit_notify = True)

In [None]:
def fetch_and_save(ids, api_ins, one_id = None):
    '''
    Fetch tweets and save them to a text file, one JSON document per line.

    The output file is files_list[2] when that file does not exist yet. Otherwise a
    numbered file "<base>_<n>.txt" is used, where <base> is files_list[2] without its
    extension and <n> is the number of entries in the current directory whose name
    contains "tweet_json".

    Parameters:
    ids (List Object): a list of tweet ids to query, not used when one_id is given
    api_ins (Tweepy Object): api object instance, will be used to query twitter for data
    one_id (int): use when you want to query only for one tweet

    Returns:
    failed_ids (List Object): the ids whose request raised an error, so that this
    function can be called once again on those ids
    '''
    failed_ids = []

    # checking if file exists, a numbered name is picked when the default one is taken
    if os.path.exists(files_list[2]):
        existing = [s for s in os.listdir() if "tweet_json" in s]
        new_file_name = files_list[2].split('.')[0] + "_" + str(len(existing)) + ".txt"
    else:
        new_file_name = files_list[2]

    # a single id is treated as a list of one, so both modes share the same loop
    ids_to_fetch = ids if one_id is None else [one_id]

    with open(new_file_name, mode = 'w') as outfile:
        for tweet_id in ids_to_fetch:
            try:
                page = api_ins.get_status(tweet_id, tweet_mode='extended')
                json.dump(page, outfile)
                outfile.write('\n')

            except Exception as e:
                # best effort: report the failing id, remember it and carry on with the rest
                print("Error for: " + str(tweet_id) + " - " + str(e))
                failed_ids.append(tweet_id)

    return failed_ids

In [None]:
# stopwatch on
start = timer()

# every tweet id of the archive as a plain python list, the input for fetch_and_save()
tweet_ids = archive.loc[:, 'tweet_id'].tolist()

# 1st pass over twitter, the ids that errored out are kept in test_one
test_one = fetch_and_save(ids = tweet_ids, api_ins = api)

# stopwatch off
end = timer()

# reporting the runtime of fetch_and_save in minutes
print(
    "That took about {} mins."
    .format(round((end - start) / 60, 1))
)

In [None]:
# how many ids could not be fetched in the 1st pass
print(
    "we have about {} failed requests."
    .format(len(test_one))
)

In [None]:
# a set drops repeated ids, so equal lengths mean test_one has no duplicates
assert len(set(test_one)) == len(test_one)

# the assertion passing tells us that there are no duplicates

In [None]:
# retrying the ids that failed in the 1st pass with fetch_and_save

# stopwatch on
start = timer()

# 2nd pass over twitter, the ids that errored out again are kept in test_two
test_two = fetch_and_save(ids = test_one, api_ins = api)

# stopwatch off
end = timer()

# reporting the runtime of fetch_and_save in seconds
print(
    "That took about {} secs."
    .format(round(end - start, 1))
)

In [None]:
# how many ids could not be fetched in the 2nd pass
print(
    "we have about {} failed requests."
    .format(len(test_two))
)

In [None]:
# checking if test_two, the result of the 2nd iteration, has duplicate ids
# (test_one was already checked right after the 1st iteration)
assert len(test_two) == len(set(test_two))

# we can see that there are no duplicates

4. [x] **Okay, let's combine the successful jsons into one file, called the `tweet_json_master.txt`**

In [None]:
# loading the json dumps of both iterations, every line of a file is one record
json_1 = pd.read_json(path_or_buf = 'tweet_json.txt', encoding = 'utf-8', lines = True)
json_2 = pd.read_json(path_or_buf = 'tweet_json_1.txt', encoding = 'utf-8', lines = True)

# shapes of both frames, their row counts should add up in the combined dataframe
(json_1.shape, json_2.shape)

In [None]:
# stacking both frames on top of each other with a fresh index
json_master = pd.concat(objs = [json_1, json_2], join = 'outer', sort = True, ignore_index = True)

# saving the combined frame as line-delimited json, one record per line
json_master.to_json(path_or_buf = 'tweet_json_master.txt', lines = True, orient = 'records')

5. [x] **Last thing to do is to tidy up our folder, let's get going.**

In [None]:
# moving all data files under one folder - dataset
# removing the temporary files, that acted as placeholders

# creating the folder
folder = 'dataset'
if not os.path.exists(folder):
    os.mkdir(folder)

# we know that our master datasets for this project are
# 1. twitter-archive-enhanced.csv
# 2. image-predictions.tsv
# 3. tweet_json_master.txt
# let us move these files

# updating our files_list
files_list[-1] = 'tweet_json_master.txt'

# moving only required files
# files that are not in the current directory (e.g. already moved) are skipped
for file in files_list:
    if os.path.exists(file):
        os.rename(file, os.path.join(folder, file))

# removing the tweet_json and tweet_json_1 files as they are not required anymore
# the existence check keeps this cell from failing when a file is already gone,
# for example when the cell is run a second time
for file in ['tweet_json.txt', 'tweet_json_1.txt']:
    if os.path.exists(file):
        os.remove(file)

# listing the current directory
os.listdir()

# clean and neat, lets get with assessing and cleaning

## Assessing