In [None]:
import io
import json
import os
from random import randrange

import numpy as np
import pandas as pd
import requests as rq

In [None]:
# load the twitter archive from the local CSV file into a dataframe
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')

In [None]:
twitter_archive.head(15)

In [None]:
# show the archive rows that have a retweeted_status_user_id value
twitter_archive[twitter_archive.retweeted_status_user_id.notnull()]

In [None]:
twitter_archive.info()

In [None]:
# Download the image predictions file.
# requests has no default timeout, so without one this cell can hang forever
# when the server does not respond.
r = rq.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv', timeout=30)
r.status_code

In [None]:
r.headers['content-type']

In [None]:
r.encoding

#### Load the received .tsv file into a dataframe

In [None]:
# load image predictions dataset from a local file
# pd.DataFrame.from_csv was removed in pandas 1.0; pd.read_csv is its replacement.
# tweet_id is kept as a regular column so it can be used as the merge key.
image_predictions = pd.read_csv('image-predictions.tsv', sep='\t')

In [None]:
# load image predictions dataset from the URL
# pd.DataFrame.from_csv was removed in pandas 1.0; pd.read_csv is its replacement.
# r.text decodes the body with r.encoding and still works when the server
# declares no charset (r.encoding is None), where r.content.decode(None) would raise.
image_predictions = pd.read_csv(io.StringIO(r.text), sep='\t')

In [None]:
image_predictions.head()

In [None]:
image_predictions.info()

### Merge twitter archive and image predictions

In [None]:
# inner join: keep only the tweets present in both the archive and the image predictions
merged_df = twitter_archive.merge(image_predictions, on='tweet_id', how='inner')

In [None]:
merged_df.head()

## Dataset 3: Twitter via API

### Set up Twitter API via tweepy

In [1]:
import tweepy

# Read the credentials from environment variables: they are not defined anywhere
# in this notebook, and secrets should not be stored in it.
# A missing variable raises KeyError naming the variable that must be set.
consumer_key = os.environ['TWITTER_CONSUMER_KEY']
consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']
access_token = os.environ['TWITTER_ACCESS_TOKEN']
access_secret = os.environ['TWITTER_ACCESS_SECRET']

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# JSONParser makes API calls return parsed JSON instead of tweepy model objects;
# wait_on_rate_limit is requested at construction time rather than patched on afterwards.
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser(), wait_on_rate_limit=True)

ModuleNotFoundError: No module named 'tweepy'

### Download WeRateDogs Twitter archive. Takes around 30 mins.

In [None]:
# tweets retrieved via the API, one entry per successfully fetched tweet
tweets = []
# ids of the tweets the API call failed for
missing_tweets = []

# use tweet_id's from our dataframe to retrieve original tweets
for tweet_id in merged_df.tweet_id:
    try:
        tweets.append(api.get_status(tweet_id, tweet_mode='extended'))
    except Exception as err:
        # Catch Exception instead of using a bare `except:` so that
        # KeyboardInterrupt / SystemExit can still stop this long-running loop,
        # and report the reason so real failures are not mistaken for deleted tweets.
        missing_tweets.append(tweet_id)
        print ('tweet #', tweet_id, ' could not be located:', err)

# write downloaded tweets to a json file and store it locally
with open('tweets.json', 'w') as outfile:
    json.dump(tweets, outfile)

In [None]:
missing_tweets

### Read the downloaded and saved archive from a local .json file

In [None]:
# load the tweets stored in the local JSON file into a dataframe
tweets = pd.read_json ('tweets.json')

### Merge tweets with two previous datasets

In [None]:
# give the API data the same key column name as the other two datasets
tweets.rename(columns=dict(id='tweet_id'), inplace=True)

# inner join on 'tweet_id': keep only the tweets present in both dataframes
we_rate_dogs = merged_df.merge(tweets, on='tweet_id', how='inner')

# Step 2. Assess data

In [None]:
we_rate_dogs.head()

In [None]:
we_rate_dogs.info()

In [None]:
print (len(we_rate_dogs.columns))

## 1. Duplicate, zero and redundant data
First let's assess whether there are any columns that duplicate data, contain no data whatsoever or are irrelevant for the purposes of our analysis. Let's make empty lists which we will populate over the course of our assessment and then use to drop those columns from the dataframe.

In [None]:
# three independent lists of column names, filled in during the assessment below
duplicate_columns, zero_columns, redundant_columns = [], [], []

Summing data in columns highlights some of the empty variables.

In [None]:
we_rate_dogs.sum(axis=0)

In [None]:
# record the columns that hold only zero / empty values
zero_columns += [
    'contributors',
    'coordinates',
    'favorited',
    'geo',
    'is_quote_status',
    'possibly_sensitive',
    'possibly_sensitive_appealable',
    'retweeted',
    'truncated',
]

It seems that there is only one record in the 'place' column. What is it?

In [None]:
we_rate_dogs[we_rate_dogs.place.notnull()].place

Let's store this place in a separate variable and remove the column.

In [None]:
# Keep the row holding the populated `place` value before the column is dropped.
# The row is found by its content rather than by a hard-coded position (686),
# because positions shift whenever the set of downloaded tweets changes.
wrd_place = we_rate_dogs[we_rate_dogs.place.notnull()].iloc[0]
zero_columns.append('place')

In this analysis we will not engage in understanding social context of this account, therefore all information pertaining to retweets and replies can be considered redundant.

In [None]:
# reply- and retweet-related columns, not needed for this analysis
redundant_columns += [
    'in_reply_to_status_id_x',
    'in_reply_to_user_id_x',
    'retweeted_status_id',
    'retweeted_status_user_id',
    'retweeted_status_timestamp',
    'in_reply_to_screen_name',
    'in_reply_to_status_id_y',
    'in_reply_to_status_id_str',
    'in_reply_to_user_id_y',
    'in_reply_to_user_id_str',
    'retweeted_status',
]

In [None]:
# are source_x and source_y columns identical?
# count the rows where the two columns agree and subtract the total number of rows;
# '0' means every row matches
matching_rows = sum(we_rate_dogs.source_x == we_rate_dogs.source_y)
print (matching_rows - len(we_rate_dogs))

In [None]:
duplicate_columns.append('source_y')

In [None]:
duplicate_columns

In [None]:
# drop the collected columns, one group at a time
for column_group in (duplicate_columns, zero_columns, redundant_columns):
    we_rate_dogs.drop(columns=column_group, inplace=True)

In [None]:
# report how many columns remain after the drops (message typo "if" -> "of" fixed)
print ('number of columns in the dataset: ', len(we_rate_dogs.columns))

In [None]:
we_rate_dogs.info()

### 2. Incorrect data types.

Some of the columns have data types that do not suit the values they hold.

### Define 
`display_text_range` column can be effectively reduced to a single `int` rather than a list.

In [None]:
# count the rows whose display range starts after position 0
starts_after_zero = we_rate_dogs.display_text_range.apply(lambda text_range: int(text_range[0] > 0))
sum(starts_after_zero)

#### Code

extract the relevant value, put it into a new column and drop the old column

In [None]:
# keep only the second element of each range as an integer column, then remove the list column
range_end = we_rate_dogs.display_text_range.apply(lambda text_range: text_range[1])
we_rate_dogs['display_text_end'] = range_end.astype(int)
del we_rate_dogs['display_text_range']

#### Test
Check if our new column is of `int64` type.

In [None]:
print (type(we_rate_dogs.display_text_end[0]))
we_rate_dogs.head()

The `user` field contains a number of interesting fields. Let's unpack some of those into our dataframe.

### Define
Unpack `followers_count` field from `user` into separate column

#### Code
Using `apply` function - unpack the values.

In [None]:
# pull 'followers_count' out of each user record into its own column
we_rate_dogs['followers'] = we_rate_dogs['user'].map(lambda profile: profile['followers_count'])

#### Test
Check the new column is in place and that it's `int64` type.

In [None]:
we_rate_dogs.info()

### Define
Unpack `favourites_count` from within `user` field into a separate column.

In [None]:
# pull 'favourites_count' out of each user record into its own column
we_rate_dogs['total_favourites'] = we_rate_dogs['user'].map(lambda profile: profile['favourites_count'])

In [None]:
# print the minimum, mean and maximum of the column, in that order
favourites = we_rate_dogs['total_favourites']
for summary_value in (favourites.min(), favourites.mean(), favourites.max()):
    print (summary_value)

### Define

There should be a limited set of applications used to post tweets.

In [None]:
# how many various sources were used to post tweets?
we_rate_dogs.source_x.unique()

#### Code
This kind of data is best represented as a categorical variable.

In [None]:
# Keep only the application name by removing the HTML tags around it, then store
# the result as a categorical variable.
# Stripping tags with a regex gives the same label as split('>')[1].split('<')[0]
# for a value wrapped in a single tag pair, but leaves an already-cleaned value
# unchanged, so re-running this cell no longer raises IndexError.
we_rate_dogs.source_x = (
    we_rate_dogs.source_x
    .str.replace(r'<[^>]+>', '', regex=True)
    .astype('category')
)

####  Test
Check unique values in the column.

In [None]:
we_rate_dogs.source_x.unique()

### Define
Hashtags are nested deep inside dictionaries. To be useful for analysis, they need to be extracted into a separate column.

#### Code

extract hashtags using the `apply` method, put them in a separate column and cast them into `category` type variable.

In [None]:
# copy the 'hashtags' entry of each entities record into its own column
we_rate_dogs['hashtags'] = we_rate_dogs['entities'].map(lambda entity: entity['hashtags'])

In [None]:
def unpack_hashtags(x):
    """Return the text of the first hashtag in `x`, or None when there is none.

    `x` is expected to be a list of dicts that each carry a 'text' key.
    Only the first entry is used. A string is returned unchanged, so applying
    the function to an already-unpacked column does not erase its values.
    """
    if isinstance(x, str):
        return x
    try:
        return x[0]['text']
    except (IndexError, KeyError, TypeError):
        # empty list, entry without 'text', or a missing / non-indexable value
        return None
        
# replace each hashtag list with the value returned by unpack_hashtags
we_rate_dogs.hashtags = we_rate_dogs.hashtags.apply(unpack_hashtags)

In [None]:
we_rate_dogs.hashtags.unique()

With so few distinct hashtags, they are better represented as a 'category' type variable.

In [None]:
# store the hashtags as a categorical variable
we_rate_dogs['hashtags'] = we_rate_dogs['hashtags'].astype('category')

#### Test
Check that the `hashtags` column is a `category` type.

In [None]:
we_rate_dogs.info()

### Define

Same as hashtags - `url`s are nested deep inside dictionaries. They need to be extracted into a separate column. 

#### Code
Using `apply` function with a short lambda expression - extract the `url` into a new column.

In [None]:
# take the 'url' of the first media entry of each entities record
we_rate_dogs['url'] = we_rate_dogs['entities'].map(lambda entity: entity['media'][0]['url'])

#### Test
Check the new `url` column has the right information.

In [None]:
we_rate_dogs.head()

In [None]:
we_rate_dogs.info()

### Define

`timestamp` and `created_at` columns are probably identical. If that's the case, one of them needs to be removed.

#### Code

In [None]:
# parse the timestamp column into datetime values
we_rate_dogs['timestamp'] = pd.to_datetime(we_rate_dogs['timestamp'])

#### Test

In [None]:
# remove the time zone information from both fields
for time_column in ('timestamp', 'created_at'):
    we_rate_dogs[time_column] = we_rate_dogs[time_column].apply(lambda moment: moment.replace(tzinfo=None))

In [None]:
# subtract one column from the other and list the distinct differences;
# a single zero difference means the two columns hold the same values
(we_rate_dogs['timestamp'] - we_rate_dogs['created_at']).unique()

In [None]:
# remove the created_at column from the dataframe
del we_rate_dogs['created_at']

In [None]:
we_rate_dogs.info()

### Define

`doggo`, `floofer`, `pupper`, `puppo` columns are better represented as `bool`s.

#### Code

In [None]:
def to_bool(x):
    """Convert a dog-stage cell to a bool.

    The string 'None' and missing values (None / NaN) map to False; any other
    value maps to True. Values that are already booleans are returned as-is, so
    applying the function to an already-converted column does not flip False
    to True.
    """
    if isinstance(x, (bool, np.bool_)):
        return bool(x)
    # `x != x` is only true for NaN
    if x is None or x != x:
        return False
    return x != 'None'
    
# convert each of the four dog-stage columns with to_bool
for stage_column in ('doggo', 'floofer', 'pupper', 'puppo'):
    we_rate_dogs[stage_column] = we_rate_dogs[stage_column].apply(to_bool)

#### Test
Check if the columns have correct data types in them.

In [None]:
we_rate_dogs.info()

### Define
Convert `lang` column into category.

#### Code

In [None]:
# store the language codes as a categorical variable
we_rate_dogs['lang'] = we_rate_dogs['lang'].astype('category')

### Test

In [None]:
we_rate_dogs.info()

## Analysis part

In [None]:
# time span covered by the dataset and the average number of tweets per whole day
first_tweet = we_rate_dogs.timestamp.min()
last_tweet = we_rate_dogs.timestamp.max()
print ('start date: ', first_tweet)
print ('end date: ', last_tweet)
duration = last_tweet - first_tweet
print ('duration: ', duration)
tweet_count = len(we_rate_dogs)
print ('average tweets per day: ', tweet_count/duration.days)

In [None]:
type(duration.days)

In [None]:
# archive rows that have a retweeted_status_timestamp value
has_retweet_timestamp = twitter_archive.retweeted_status_timestamp.notna()
retweets = twitter_archive[has_retweet_timestamp]
retweets.iloc[7].expanded_urls

In [None]:
urls = twitter_archive.iloc[295].expanded_urls.split(',')

In [None]:
set(urls)

In [None]:
def _unique_urls(value):
    """Return the distinct URLs held in `value` as a list, in first-seen order.

    `value` may be a comma-separated string, a list (returned de-duplicated,
    which makes re-running the cell safe), or a missing value (None / NaN),
    which yields an empty list instead of ['nan'].
    """
    if isinstance(value, list):
        parts = value
    elif value is None or value != value:  # `value != value` is only true for NaN
        return []
    else:
        parts = str(value).split(',')
    # dict.fromkeys de-duplicates while keeping a reproducible order, unlike set()
    return list(dict.fromkeys(parts))

twitter_archive.expanded_urls = twitter_archive.expanded_urls.apply(_unique_urls)

In [None]:
# print every URL list that holds more than one entry
for url_list in twitter_archive.expanded_urls:
    if len(url_list) > 1:
        print (url_list)

In [None]:
twitter_archive.expanded_urls[0]

In [None]:
twitter_archive.iloc[295]