In [None]:
# Import required libraries:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sb
import json
import requests

# **Gathering Data**

In [None]:
# Load the main twitter archive CSV into a dataframe and display its (rows, columns):
archive_csv_path = '../input/weratedogs-twitter-archive/twitter-archive-enhanced.csv'
twitter_archive = pd.read_csv(archive_csv_path)
twitter_archive.shape

In [None]:
# tweet_json.txt holds one JSON document per line; decode every line, then keep
# only the tweet ID, retweet count and favorite count of each tweet.
with open('../input/weratedogs-twitter-archive/tweet_json.txt' , 'r') as myfile:
    data_list = [json.loads(raw_line) for raw_line in myfile]

df_list = [
    {
        'id': tweet['id'],
        'retweet_count': tweet['retweet_count'],
        'favorite_count': tweet['favorite_count'],
    }
    for tweet in data_list
]

# Build a dataframe from the extracted records (explicit column order):
json_columns = ['id', 'retweet_count', 'favorite_count']
json_df = pd.DataFrame(df_list, columns=json_columns)

json_df.shape
        



In [None]:
# Download the image-predictions TSV, save it locally, then load it into a dataframe.
w_page = requests.get(
    'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv',
    timeout=30,  # do not hang the notebook forever if the server stalls
)
# Fail loudly on an HTTP error instead of writing an error page to disk as if it were data.
w_page.raise_for_status()

with open('image_predictions.tsv' , 'wb') as myfile:
    myfile.write(w_page.content)

# `sep` must be passed by keyword: positional use was deprecated and is rejected by pandas 2.
img_pred = pd.read_csv('image_predictions.tsv', sep='\t')

img_pred.shape
    

# **Data Assessment & Cleaning**

In [None]:
# Preview the first two rows of the raw twitter_archive dataframe:
twitter_archive.head(2)

In [None]:
# Preview the first two rows of the raw img_pred dataframe:
img_pred.head(2)

In [None]:
# Preview the first two rows of the raw json_df dataframe:
json_df.head(2)

In [None]:
# Take an independent copy of each dataframe; the copies are the ones meant to be
# cleaned, so the originals stay available for reference.
twitter_archive_clean, img_pred_clean, json_df_clean = (
    frame.copy() for frame in (twitter_archive, img_pred, json_df)
)


In [None]:
# Remove every record whose top prediction is not a dog (p1_dog == False).
#
# The filter has to reach img_pred_clean: that copy was taken before this cell and
# is the frame merged into the master dataset, so filtering only img_pred would
# leave the non-dog rows in the final data.
#
# NOTE(review): img_pred is still filtered as well because later cells read
# img_pred directly; ideally those cells would use img_pred_clean and the raw
# frame would stay untouched.
for frame in (img_pred_clean, img_pred):
    # Index labels of the non-dog rows in this frame:
    non_dogs = frame[frame['p1_dog'] == False].index
    frame.drop(non_dogs, inplace=True)


In [None]:
# Verify the non-dog records are gone: selecting rows with p1_dog == False
# should display an empty frame.
# (p1_dog tells whether the top prediction is a dog, per the data description.)
img_pred.loc[img_pred['p1_dog'] == False]

In [None]:
# Check whether every tweet in twitter_archive_clean also has an entry in img_pred.
# Tweets without an image prediction are not useful for the later analysis,
# because a dog's rating mainly depends on its image.
#
# The membership test is done on the tweet_id Series: DataFrame.isin(Series) matches
# on index labels, so it would compare row i with row i instead of testing whether
# each tweet_id occurs anywhere in img_pred.

twitter_archive_clean['tweet_id'].size == twitter_archive_clean['tweet_id'].isin(img_pred['tweet_id']).sum()

In [None]:
# Keep only the archive tweets whose tweet_id appears in the img_pred dataframe:
ids_with_images = img_pred['tweet_id'].unique()
has_image = twitter_archive_clean['tweet_id'].isin(ids_with_images)
twitter_archive_clean = twitter_archive_clean[has_image]
twitter_archive_clean.shape

In [None]:
# Double-check that every remaining tweet has an entry in img_pred.
# Count the True values of the membership mask; the mask's own .size always equals
# the number of tweets, which would make the comparison True no matter what.

twitter_archive_clean['tweet_id'].size == twitter_archive_clean['tweet_id'].isin(img_pred['tweet_id']).sum()

In [None]:
# Keep only original tweets: a retweet carries a retweeted_status_id, an original does not.
# .copy() makes the result an independent frame, so the in-place column drops and
# .loc assignments in later cells do not act on a slice of the previous frame
# (which can trigger SettingWithCopyWarning).
twitter_archive_clean = twitter_archive_clean[twitter_archive_clean['retweeted_status_id'].isnull()].copy()

In [None]:
# Test that no retweets remain in twitter_archive_clean:
# every kept row has a missing retweeted_status_id, so the only unique value
# displayed here should be NaN.
twitter_archive_clean['retweeted_status_id'].unique()

In [None]:
# List the column names of twitter_archive_clean:
twitter_archive_clean.columns

In [None]:
# Discard the columns that play no part in the analysis: reply/retweet metadata,
# the tweet source, the expanded URLs and the dog name.

drop_list = [
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'source',
    'retweeted_status_id',
    'retweeted_status_user_id',
    'retweeted_status_timestamp',
    'expanded_urls',
    'name',
]

twitter_archive_clean = twitter_archive_clean.drop(columns=drop_list)

In [None]:
# Test that all unnecessary columns have been removed:
# drop_list names 8 columns, so the column count shown here should be 8 lower than
# before the drop (e.g. 17 -> 9; verify against the displayed shape).

twitter_archive_clean.shape

In [None]:
# Preview twitter_archive_clean after dropping the unnecessary columns:
twitter_archive_clean.head()

In [None]:
# Check the unique values of a dog-stage column:
# "None" marks a tweet that does not belong to the stage, "doggo" marks one that does.
# NOTE(review): this inspects the raw twitter_archive, not twitter_archive_clean.
twitter_archive['doggo'].unique()

In [None]:
# Replace the "no stage" marker in the dog-stage columns with blank text '' so the
# four columns can later be concatenated into a single stage column.
# Older pandas reads the marker as the string 'None'; pandas 2 treats "None" as a
# missing value when reading CSV, so NaN is blanked as well. Without that, a single
# NaN would turn the concatenated stage of the whole row into NaN.
stage_cols = ['doggo', 'floofer', 'pupper', 'puppo']
twitter_archive_clean[stage_cols] = (
    twitter_archive_clean[stage_cols].replace('None', '').fillna('')
)

In [None]:
# Double-check the unique values of a dog-stage column after the replacement:
# Note that "None" has been replaced by blank text '' (not NaN); the conversion to
# np.nan is applied later, on the combined stage column.
twitter_archive_clean['doggo'].unique()

In [None]:
# Tidy data: one variable belongs in one column, so fold the four dog-stage columns
# into a single 'stage' column by concatenating their text in a fixed order.
# NOTE(review): a tweet tagged with two stages yields the names run together
# with no separator.

stage_cols = ['doggo', 'floofer', 'pupper', 'puppo']
stage = twitter_archive_clean[stage_cols[0]]
for col in stage_cols[1:]:
    stage = stage + twitter_archive_clean[col]
twitter_archive_clean['stage'] = stage

In [None]:
# Test that the stage column has been created and inspect its unique values:
twitter_archive_clean['stage'].unique()

In [None]:
# A blank stage means "no stage recorded"; store it as a proper missing value (np.nan):
blank_to_missing = {'': np.nan}
twitter_archive_clean['stage'] = twitter_archive_clean['stage'].replace(blank_to_missing)

In [None]:
# Verify no blank text is left in the stage column (all should now be np.nan):
# the selection below should display an empty frame.

twitter_archive_clean.loc[twitter_archive_clean['stage'] == '']

In [None]:
# The four per-stage columns are redundant now that 'stage' carries the same information:
to_drop_list = ['doggo', 'puppo', 'pupper', 'floofer']
twitter_archive_clean = twitter_archive_clean.drop(columns=to_drop_list)

In [None]:
# Check that the ['doggo', 'puppo', 'pupper', 'floofer'] columns have been removed:
# the preview should show the single "stage" column in their place.

twitter_archive_clean.head(2)

In [None]:
# Show a summary (columns, non-null counts, dtypes) of twitter_archive_clean:
twitter_archive_clean.info()

In [None]:
# Statistical summary of rating_denominator:
twitter_archive_clean['rating_denominator'].describe()

In [None]:
# Display the text and rating of every record whose rating_denominator is not 10.
# Most of them rate a group of dogs rather than a single dog, except record (2335);
# tweet 516 has no rating in its original text.
# These records are dropped later because the analysis targets single-dog tweets.
rating_cols = ['text', 'rating_numerator', 'rating_denominator']
twitter_archive_clean.loc[twitter_archive_clean['rating_denominator'] != 10, rating_cols]

In [None]:
# Start with record (2335): read its tweet text. The tweet appears to be about a
# single dog whose rating was extracted inaccurately, so its values get corrected
# instead of the row being dropped.
odd_denominator = twitter_archive_clean.loc[twitter_archive_clean['rating_denominator'] != 10]
odd_denominator.loc[2335, 'text']

In [None]:
# Manually correct the rating of record (2335) to 9/10:
corrected_rating = {'rating_denominator': 10, 'rating_numerator': 9}
for column, value in corrected_rating.items():
    twitter_archive_clean.loc[2335, column] = value

In [None]:
# Double-check that record (2335) has been fixed by displaying the whole row:
twitter_archive_clean.loc[2335,:]

In [None]:
# Index labels of the tweets whose rating_denominator is still not 10:
denominator_is_off = twitter_archive_clean['rating_denominator'] != 10
wrong_denominator = twitter_archive_clean.index[denominator_is_off]

# Remove those tweets from the working frame:
twitter_archive_clean = twitter_archive_clean.drop(index=wrong_denominator)

In [None]:
# Verify the removal: selecting rating_denominator != 10 should display an empty frame.
twitter_archive_clean.loc[twitter_archive_clean['rating_denominator'] != 10]

In [None]:
# Statistical summary of rating_numerator,
# since it strongly influences the analysis that follows:

twitter_archive_clean['rating_numerator'].describe()

In [None]:
# Display the tweets whose rating_numerator is greater than 15:
twitter_archive_clean.loc[twitter_archive_clean['rating_numerator'] > 15]

In [None]:
# Read the text of the tweet at index 695 (rating_numerator 75):
high_numerator = twitter_archive_clean.loc[twitter_archive_clean['rating_numerator'] > 15]
high_numerator.loc[695, 'text']

In [None]:
# Fix the rating numerator of the previous tweet (index 695) to the decimal value 9.75.
# Cast the column to float first: writing a fractional value into an integer column
# relies on an implicit upcast that newer pandas versions warn about or reject.
twitter_archive_clean['rating_numerator'] = twitter_archive_clean['rating_numerator'].astype(float)
twitter_archive_clean.loc[695, 'rating_numerator'] = 9.75

In [None]:
# Read the text of the tweet at index 763 (rating_numerator 27):
high_numerator = twitter_archive_clean.loc[twitter_archive_clean['rating_numerator'] > 15]
high_numerator.loc[763, 'text']

In [None]:
# Fix the rating numerator of the previous tweet (index 763) to the decimal value 11.27:
twitter_archive_clean.loc[763 , 'rating_numerator'] = 11.27

In [None]:
# Read the text of the tweet at index 1712, another record whose rating_numerator exceeds 15:
high_numerator = twitter_archive_clean.loc[twitter_archive_clean['rating_numerator'] > 15]
high_numerator.loc[1712, 'text']

In [None]:
# Fix the rating numerator of the previous tweet (index 1712) to the decimal value 11.26:
twitter_archive_clean.loc[1712 , 'rating_numerator'] = 11.26

In [None]:
# Verify the fixes: list any tweet that still has a rating_numerator above 15.
# The expectation is that no such tweets remain.
twitter_archive_clean.loc[twitter_archive_clean['rating_numerator'] > 15]

In [None]:
# Re-check the statistical summary of rating_numerator after the manual fixes:
twitter_archive_clean['rating_numerator'].describe()

In [None]:
# rating_denominator is not needed for the analysis that follows, so drop the column:
twitter_archive_clean = twitter_archive_clean.drop(columns='rating_denominator')

In [None]:
# Check that the rating_denominator column no longer appears in the summary:
twitter_archive_clean.info()

In [None]:
# List the column names of the img_pred_clean dataframe:
img_pred_clean.columns

In [None]:
# Inspect the image-prediction records whose img_num equals 2:
img_pred.loc[img_pred['img_num'] == 2]

In [None]:
# Drop the image URL, the image number and the second/third prediction columns:
to_drop_cols = [
    'jpg_url', 'img_num',
    'p2', 'p2_conf', 'p2_dog',
    'p3', 'p3_conf', 'p3_dog',
]
img_pred_clean = img_pred_clean.drop(columns=to_drop_cols)

In [None]:
# Test that all unnecessary columns have been removed from the img_pred_clean dataframe:
img_pred_clean.columns

In [None]:
# Give the remaining prediction columns more descriptive names:
new_names = {
    'p1': 'breed',
    'p1_conf': 'pred_confidence',
    'p1_dog': 'is_dog',
}
img_pred_clean = img_pred_clean.rename(columns=new_names)

In [None]:
# Check that the column names of img_pred_clean have been changed:
img_pred_clean.columns

In [None]:
# Show a summary (columns, non-null counts, dtypes) of the json_df_clean dataframe:
json_df_clean.info()

In [None]:
# Rename "id" to "tweet_id" so json_df_clean shares its key column name with the
# other two dataframes when they are merged:
id_rename = {'id': 'tweet_id'}
json_df_clean = json_df_clean.rename(columns=id_rename)

In [None]:
# Check that the "id" column of json_df_clean is now named "tweet_id":
json_df_clean.columns

In [None]:
# Step 1: inner-join the cleaned archive with the cleaned image predictions on tweet_id.
df = pd.merge(twitter_archive_clean, img_pred_clean, how='inner', on='tweet_id')

# Step 2: inner-join the result with the retweet/favorite counts to obtain the
# master dataframe used throughout the analysis.
df_master = pd.merge(df, json_df_clean, how='inner', on='tweet_id')

In [None]:
# Spot-check df_master by displaying five randomly sampled rows
# (no random seed is set, so the rows differ between runs):
df_master.sample(5)

# **Data Storing**

In [None]:
# Persist the cleaned master dataframe to a CSV file (the row index is not written):
master_csv_path = 'master_df_weratedogs.csv'
df_master.to_csv(master_csv_path, index=False)

# Preview the first rows of the cleaned master dataset:
df_master.head()


In [None]:
# Show a summary (columns, non-null counts, dtypes) of the master dataframe:
df_master.info()

In [None]:
# Ten most frequent breeds (value_counts sorts by count, largest first):
breed_counts = df_master['breed'].value_counts()
breed_counts.head(10)

# **Which breed is most common in this twitter archive !?**
**Golden_Retriever** is the most common breed in this twitter archive (there are 135 tweets for golden_retriever).

In [None]:
# Horizontal bar chart of the 10 most common breeds, smallest count at the bottom.
top_breed_counts = df_master['breed'].value_counts().head(10).sort_values(ascending=True)

plt.figure(figsize=(10,6))
top_breed_counts.plot(kind='barh' , color ='purple')

# Axis labels and title:
plt.title('Dog\'s Breed VS Count' , size=12)
plt.xlabel('count',size=12)
plt.ylabel('breed', size=12)

# Bar lengths (x) and bar positions (y) used to place the value labels:
x = top_breed_counts.values
y = np.arange(0,10,1)

# Write each bar's count just past its end, vertically centred on the bar:
for bar_length, bar_position in zip(x, y):
    plt.text(bar_length+1, bar_position, bar_length, va='center')



In [None]:
# Histogram of rating_numerator to show how the ratings are distributed.
# Most dogs are rated between 8 and 14; only a few are rated below 8.
rating_axes = df_master['rating_numerator'].hist()
rating_axes.set_xlabel('Rating_Numerator')
rating_axes.set_ylabel('Distribution');

# **Are low ratings associated with a particular breed !?**
No, low ratings are not associated with a particular breed; several different breeds (with only a few dogs each) received ratings below 8.

In [None]:
# Count, per breed, the dogs whose rating is below 8:
rated_below_8 = df_master['rating_numerator'] < 8
less_8 = df_master.loc[rated_below_8, 'breed'].value_counts()
less_8


In [None]:
# Average rating per breed, highest first; keep the ten best-rated breeds.
mean_rating_by_breed = df_master.groupby('breed')['rating_numerator'].mean()
top_10_rating = mean_rating_by_breed.sort_values(ascending=False).head(10)
top_10_rating

# **Which breed got the highest rating average !?**
- **saluki** got the highest rating average (12.5) 

In [None]:
# Vertical bar chart of the ten breeds with the highest average rating.
plt.figure(figsize=(10,6))
top_10_rating.plot(kind='bar' , color='red')
plt.xticks(rotation= 55 , size= 12)
# On a vertical bar chart the breeds lie along x and the averages along y
# (the two labels used to be swapped).
plt.xlabel('breed' , size= 12)
plt.ylabel('Average Rating' , size=10)
plt.title('Dog\'s Breed VS. Average Rating');

# Annotate each bar with its rounded average. Reuse top_10_rating, the series that
# was plotted, so labels always match the bars and the groupby is not recomputed.
y = top_10_rating.round(2)
x = np.arange(len(y))

for xx, yy in zip(x, y):
    plt.text(xx, yy+.1 , yy , ha='center')

In [None]:
# Top 10 breeds by average number of retweets and by average number of favorites.
# Sort first, then take the first ten. groupby returns breeds in key order, so
# slicing before sorting only ranked the first ten breeds alphabetically.
# NOTE(review): the written conclusion about the top breed was derived from the old
# ordering; re-check it after re-running.
top_retweets = df_master.groupby('breed')['retweet_count'].mean().sort_values(ascending=False)[0:10]
top_favorites = df_master.groupby('breed')['favorite_count'].mean().sort_values(ascending=False)[0:10]


# **Which Breed Got the highest average number of retweets & favorites !?**
**Afghan_hound**  got the highest average number of retweets & favorites (5976 average retweets)

In [None]:
# Two stacked bar charts: average retweets per breed (top) and average favorites
# per breed (bottom), for the breeds held in top_retweets / top_favorites.
figure = plt.figure(figsize=(13, 9))

# Upper panel: average retweet count.
plt.subplot(2, 1, 1)
plt.title('Dog\'s Breed VS, Average Retweets count')
plt.ylabel('Average Retweets count', size=12)
plt.bar(top_retweets.index, top_retweets.values, color='purple')
plt.xticks(rotation=45)

# Lower panel: average favorite count (default bar colour).
plt.subplot(2, 1, 2)
plt.title('Dog\'s Breed VS, Average favorites count')
plt.xlabel('breeds', size=12)
plt.ylabel('Average Favorites count', size=12)
plt.bar(top_favorites.index, top_favorites.values)
plt.xticks(rotation=45)

# Extra padding so the rotated tick labels of the upper panel do not collide with
# the title of the lower panel.
figure.tight_layout(pad=5)