In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input directory tree and print the full path of every file found.
# Note: the loop variables (dirname, filenames, filename) are ordinary notebook
# globals and stay bound after the loop finishes, holding the values from the last
# iteration; keep the names unchanged if any later cell reads them.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the Etsy shops CSV into a DataFrame and preview its first rows.
etsy_csv_path = '../input/etsy-shops/etsy_shops_data.csv'
etsy_data = pd.read_csv(etsy_csv_path)
etsy_data.head()

**Visualize the missing data**

Missing values in shop_location are denoted by the string 'None'; first, replace them with np.nan. 
The sales_count and review_count columns also have missing data, represented by -99. We'll replace those with NaN as well so that we can see the holes in the data. 

In [None]:
# Turn each column's missing-value placeholder into NaN so that missingno can
# draw the gaps: 'None' for shop_location, -99 for the two count columns.
missing_markers = {'shop_location': 'None', 'sales_count': -99, 'review_count': -99}
for column_name, marker in missing_markers.items():
    etsy_data[column_name] = etsy_data[column_name].replace(marker, np.nan)
msno.matrix(etsy_data)

Plot the correlation matrix of the whole dataset.

In [None]:
# Correlation matrix of the numeric and boolean columns, drawn as a heat map.
# Non-numeric columns (e.g. the string-valued shop_location) are dropped explicitly:
# recent pandas releases no longer skip them silently inside DataFrame.corr() and
# raise instead, while select_dtypes works the same way across pandas versions.
corr = etsy_data.select_dtypes(include=['number', 'bool']).corr()
corr_fig = plt.figure(num=None, figsize=(8, 8), dpi=80, facecolor='w', edgecolor='k')
# Draw into the figure created just above instead of assuming it is figure number 1.
corrMat = plt.matshow(corr, fignum=corr_fig.number)
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.gca().xaxis.tick_bottom()
plt.colorbar(corrMat)
# Name the dataset directly rather than reusing the `filename` loop variable left
# over from the directory listing: that variable holds whichever file os.walk
# visited last and is undefined when the input directory is empty.
plt.title('Correlation Matrix for etsy_shops_data.csv', fontsize=15)
plt.show()

In [None]:
# Pairwise relationships between the four count columns for the first 10,000 rows,
# coloured by is_shop_us_based, with KDE plots on the diagonal and regression fits
# in the off-diagonal panels.
pairplot_columns = ['listing_active_count', 'num_favorers', 'sales_count', 'review_count']
sns.pairplot(
    etsy_data[:10000],
    vars=pairplot_columns,
    hue="is_shop_us_based",
    diag_kind='kde',
    kind="reg",
)

Check out the distribution of number of active listings ('listing_active_count')

In [None]:
# Summary statistics of the active-listing counts
listing_summary = etsy_data['listing_active_count'].describe()
print(listing_summary)


Surprisingly, there are shops that have been open for only about a month and have up to 1749 active listings.

**Plot the histogram of the "listing_active_count" column**

In [None]:
# Histogram of active listings per shop over the full range of values
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(etsy_data['listing_active_count'])
ax.set_xlabel('number of active listings')

Zoom into the high-density range:

In [None]:
# Same histogram, restricted to shops with between 0 and 50 active listings
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(etsy_data['listing_active_count'], range=(0, 50))
ax.set_xlabel('number of active listings')

Now, let's check out the correlation of 'listing_active_count', 'num_favorers', 'sales_count', 'review_count'.

In [None]:
# Pairwise correlations between the four count columns
engagement_columns = ['listing_active_count', 'num_favorers', 'sales_count', 'review_count']
etsy_data[engagement_columns].corr()

The strongest correlations are between (review_count and sales_count) as well as (review_count and num_favorers).

**Do stores with a higher number of listings have higher sales?**

There is no strong correlation, but, let's plot it.

In [None]:
# Scatter plot with a fitted linear regression: sales count against active listings.
plt.figure(figsize=(10, 6))

# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally,
# so the positional form regplot(x_series, y_series) raises TypeError there.
sns.regplot(data=etsy_data, x='listing_active_count', y='sales_count')

In [None]:
# Show the shops with more than 500 sales
over_500_sales = etsy_data['sales_count'] > 500
etsy_data[over_500_sales]

Now, let's remove the outliers in terms of number of listings and sales_count and look at their correlation again. For this example, let's keep only the shops with listing_active_count<100 and sales_count<100. 

In [None]:
# Keep only shops with fewer than 100 active listings and fewer than 100 sales
# (rows whose sales_count is NaN fail the comparison and are dropped as well),
# then refit the regression on the trimmed data.
small_shop_mask = (etsy_data.listing_active_count < 100) & (etsy_data.sales_count < 100)
filtered_data = etsy_data.loc[small_shop_mask]
plt.figure(figsize=(10, 6))
# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally,
# so the positional form regplot(x_series, y_series) raises TypeError there.
sns.regplot(data=filtered_data, x='listing_active_count', y='sales_count')

* **How does the sales count change with the number of followers of the shop, and does having a sale message (a message sent to the buyer upon a purchase) have an effect on the success of the shop?**

In [None]:
# Regression of sales count on number of favorers, with one fit per sale_message value.
# lmplot is a figure-level function that creates its own figure (sized through
# height/aspect), so no plt.figure() call precedes it -- that would only leave an
# empty extra figure behind. x and y are passed by keyword because seaborn 0.12+
# rejects them as positional arguments.
sns.lmplot(x='num_favorers', y='sales_count', hue='sale_message', data=etsy_data, height=5, aspect=1.5)

Sending a sale message does not seem to have a positive impact on the sales. Let's look at it in a different way with swarmplots.

In [None]:
# Swarm plot of sales count grouped by sale_message, limited to shops with more
# than 5 sales. The filter is evaluated once and reused for both axes.
shops_over_5_sales = etsy_data.loc[etsy_data.sales_count > 5]
plt.figure(figsize=(10, 6))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.swarmplot(data=shops_over_5_sales, x='sale_message', y='sales_count')

Surprisingly, it looks like the shops who don't have a sale message had higher sales. 

Now, let's see the difference between the sales count of US-based shops vs. non-US-based shops using the swarmplot.

In [None]:
# Swarm plot of sales count grouped by is_shop_us_based, limited to shops with
# more than 30 sales. The filter is evaluated once and reused for both axes.
shops_over_30_sales = etsy_data.loc[etsy_data.sales_count > 30]
plt.figure(figsize=(10, 6))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.swarmplot(data=shops_over_30_sales, x='is_shop_us_based', y='sales_count')