# Data Wrangling

## Dataset - WeRateDogs&trade; Twitter Archive

***By Kartik Nanduri***<br>
**Dated: 21st Nov, 2018.**

In [1]:
# importing all the necessary libraries
import os
import pandas as pd
import requests as req

1. [x] **The file already at hand - `twitter-archive-enhanced.csv`**

In [2]:
# all the required files for this project are in the list files_list
files_list = ['twitter-archive-enhanced.csv', 'image-predictions.tsv', 'tweet_json.txt']

# dataset folder
folder = 'dataset'

# make sure the dataset folder exists, so that a fresh checkout
# (Restart & Run All) does not fail on the move or on the listing below
os.makedirs(folder, exist_ok=True)

# checking if any of the files are in the parent directory;
# if one exists we move it under the dataset folder
for file in files_list:
    if os.path.exists(file):
        # os.replace overwrites an existing destination on every platform,
        # whereas os.rename raises on Windows when the target already exists
        os.replace(file, os.path.join(folder, file))

# printing the folder content
print("All data files have been moved to folder : " + folder + "\nListing the contents of folder : " + 
      folder +"\n", os.listdir(folder))

All data files have been moved to folder : dataset
Listing the contents of folder : dataset
 ['image-predictions.tsv', 'twitter-archive-enhanced.csv']


In [3]:
# load the WeRateDogs twitter archive (first entry of files_list) from the dataset folder
archive_path = os.path.join(folder, files_list[0])
archive = pd.read_csv(archive_path)

# peek at five randomly chosen rows of the archive
archive.sample(n=5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
1895,674742531037511680,6.7474e+17,4196984000.0,2015-12-10 00:08:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Some clarification is required. The dog is sin...,,,,,11,10,,,,,
1718,680176173301628928,,,2015-12-25 00:00:11 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This pupper is patiently waiting to scare the ...,,,,https://twitter.com/dog_rates/status/680176173...,10,10,,,,pupper,
2030,671789708968640512,,,2015-12-01 20:35:22 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is space pup. He's very confused. Tries t...,,,,https://twitter.com/dog_rates/status/671789708...,13,10,space,,,,
994,748568946752774144,,,2016-06-30 17:28:39 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Cora. She rings a bell for treats. 12/...,,,,https://twitter.com/dog_rates/status/748568946...,12,10,Cora,,,,
2234,668190681446379520,,,2015-11-21 22:14:07 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Skittles. I would kidnap Skittles. Pin...,,,,https://twitter.com/dog_rates/status/668190681...,12,10,Skittles,,,,


2. [x] **Fetching the data from url and saving it to local drive - `image-predictions.tsv`**

In [4]:
# reading the file from the internet using the requests library
url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"

# the timeout keeps the cell from hanging forever on a stalled connection
res = req.get(url, timeout=30)

# stop here on an HTTP error (404, 500, ...) instead of silently saving
# the error page to disk as if it were the data file
res.raise_for_status()

# saving the contents of the response variable to the local drive;
# the dataset folder is created first in case it does not exist yet
os.makedirs(folder, exist_ok=True)
with open(os.path.join(folder, url.split('/')[-1]), mode="wb") as op_file:
    op_file.write(res.content)

In [5]:
# sanity check on the download: the saved file (second entry of files_list)
# should parse as tab-separated values
predictions_path = os.path.join(folder, files_list[1])
img_pre_test = pd.read_csv(predictions_path, sep="\t")

# if five random rows display as a table below, the fetch worked
img_pre_test.sample(n=5)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1949,863062471531167744,https://pbs.twimg.com/media/C_o2vKCUwAAgtOp.jpg,2,French_bulldog,0.935804,True,pug,0.059576,True,boxer,0.001412,True
89,667534815156183040,https://pbs.twimg.com/media/CUOPYI5UcAAj_nO.jpg,1,Pembroke,0.435254,True,Cardigan,0.307407,True,cocker_spaniel,0.033158,True
975,707014260413456384,https://pbs.twimg.com/media/Cc_RsVlXEAIzzlX.jpg,1,Chihuahua,0.58378,True,Italian_greyhound,0.129683,True,toy_terrier,0.089153,True
412,673919437611909120,https://pbs.twimg.com/media/CVo-JuMWwAAet6F.jpg,1,jack-o'-lantern,0.172079,False,schipperke,0.115984,True,miniature_pinscher,0.052175,True
1676,813172488309972993,https://pbs.twimg.com/media/C0j4EESUsAABtMq.jpg,1,doormat,0.954844,False,golden_retriever,0.026193,True,cocker_spaniel,0.004386,True


4. [x] ***Final check that all our data files are in one place***

In [6]:
# show what the dataset folder currently holds;
# the required data files are expected to appear in this listing
dataset_contents = os.listdir(folder)
dataset_contents

['image-predictions.tsv', 'twitter-archive-enhanced.csv']