In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Exploratory Data Analysis (EDA)

## Questions:
1. What was the best month for sales? How much was earned that month?
2. At what time should we display advertisements to maximize the likelihood of purchases?
3. Which category sold the most in that six-month period?
4. Which 10 products sold the most in that six-month period?

In [None]:
# Load the orders CSV and preview its first rows
DATA_PATH = '/kaggle/input/filpkart-onlineorders/OnlineOrders_of_a_ecommerce_website.csv'
df = pd.read_csv(DATA_PATH)
df.head()

# Data (Organization | cleaning | alteration ...)

In [None]:
# Convert the crawl timestamps to pandas datetimes
parsed_timestamps = pd.to_datetime(df['crawl_timestamp'])
df['crawl_timestamp'] = parsed_timestamps

In [None]:
# Put the rows in chronological order (row labels are kept as they are here)
chronological_labels = df['crawl_timestamp'].sort_values().index
df = df.loc[chronological_labels]

In [None]:
# Renumber the rows from 0, discarding the previous labels
df = df.reset_index(drop=True)

In [None]:
# Time span covered by the data: latest minus earliest crawl timestamp.
# Derived from the column itself rather than a hard-coded row label (8082),
# so it does not break or silently change meaning when the row count differs.
# NOTE(review): presumably 8082 was the label of the last row — confirm.
df['crawl_timestamp'].max() - df['crawl_timestamp'].min()

In [None]:
# (number of rows, number of columns) of the frame
df.shape

Goal: turn each `product_category_tree` string into a list that keeps the categories (the "clicks" through the category tree) in their original order.

In [None]:
# Category tree value of the row labelled 0
df.loc[0, 'product_category_tree']

In [None]:
def clear(x):
    """Split a raw category-tree string into a list of category names.

    The '["' and '"]' markers are replaced by spaces, the text is split on
    '>>', and every piece is stripped of surrounding whitespace so the names
    carry no padding (e.g. 'Furniture' rather than ' Furniture ').

    Parameters
    ----------
    x : str
        Raw value, e.g. '["Furniture >> Living Room Furniture"]'.

    Returns
    -------
    list of str
        Category names in their original order; always at least one element,
        because str.split with a separator never returns an empty list.
    """
    without_markers = x.replace('["', ' ').replace('"]', ' ')
    # Strip each level: the text around '>>' and the replaced markers leave
    # blanks that would otherwise end up in every category label.
    return [level.strip() for level in without_markers.split('>>')]

In [None]:
# Replace every raw category string with its list form (element-wise clear)
df['product_category_tree'] = df['product_category_tree'].map(clear)

In [None]:
# Done! Same row as before, after applying clear
df.loc[0, 'product_category_tree']

In [None]:
# Number of categories accessed = length of each category list
df['categories_clicked'] = df['product_category_tree'].map(len)

In [None]:
# Preview the first 8 rows of the frame
df.head(8)

In [None]:
# Average value of the categories_clicked column
df['categories_clicked'].mean()

In [None]:
# First and last entry of every category list
category_paths = df['product_category_tree']
df['first_click'] = [path[0] for path in category_paths]
df['last_click'] = [path[-1] for path in category_paths]

In [None]:
# Preview the first rows of the frame
df.head()

In [None]:
# Discount percentage: the share of the retail price that was taken off.
# (discounted_price / retail_price * 100 is the share still *paid*, not the
# discount, so it must be subtracted from the full price first.)
# NOTE(review): rows with retail_price == 0 would give inf/NaN — confirm none exist.
df['discount_pct'] = (df['retail_price'] - df['discounted_price']) / df['retail_price'] * 100

# Data Visualization 

# #1. What was the best month for sales? How much was earned that month?

In [None]:
# Mean retail price per calendar month, ordered from lowest to highest.
# NOTE(review): this is a per-row average, not a monthly total; answering
# "how much was earned that month" would need .sum() — confirm which is intended.
bym = df.groupby(by=df['crawl_timestamp'].dt.month_name())['retail_price'].mean().sort_values()
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=bym.index, y=bym.values, palette=sns.cubehelix_palette(len(bym.values)))
plt.xticks(rotation=60)
plt.title("Average gain in retail prices - per month")
plt.show()

In [None]:
# Mean retail price per month as a line, with the months in chronological order.
# Sorting the month *names* orders them alphabetically (April, August, ...),
# which scrambles the time axis; instead each month is positioned by the
# earliest timestamp observed in it.
month_names = df['crawl_timestamp'].dt.month_name()
monthly_mean = df.groupby(by=month_names)['retail_price'].mean()
month_order = df.groupby(by=month_names)['crawl_timestamp'].min().sort_values().index
monthly_mean.reindex(month_order).plot(style='-o', color='green')

# #2. What time should we display advertisements to maximize the likelihood of purchases?

In [None]:
# Hourly averages: group once, then layer the three plots on the same axes
by_hour = df.groupby(by=df['crawl_timestamp'].dt.hour)
avg_retail = by_hour['retail_price'].mean()
avg_discounted = by_hour['discounted_price'].mean()

ax = avg_retail.plot.bar(color='orange', label='retail_price')
avg_discounted.plot.bar(ax=ax, color='blue', label='discounted_price')
avg_retail.plot(ax=ax, color='red', label='')  # line over the retail bars

ax.legend()
ax.set_title("Average gain in retail prices - per hour")
plt.show()
# Decide for yourself

In [None]:
# Hourly totals: group once, then layer the three plots on the same axes
by_hour = df.groupby(by=df['crawl_timestamp'].dt.hour)
total_retail = by_hour['retail_price'].sum()
total_discounted = by_hour['discounted_price'].sum()

ax = total_retail.plot.bar(color='orange', label='retail_price')
total_discounted.plot.bar(ax=ax, color='blue')
total_retail.plot(ax=ax, color='red', label='')  # line over the retail bars

ax.legend()
ax.set_title("Total retail price gain - per hour")
plt.show()

# #3. Which category sold most in that six month period?

In [None]:
# Ten most frequent last_click values, sorted ascending by count.
category = df['last_click'].value_counts().nlargest(10).sort_values()
# Counts on x, category names on y (horizontal bars); keyword arguments are
# required because seaborn >= 0.12 no longer accepts x/y positionally.
sns.barplot(x=category.values, y=category.index, palette=sns.cubehelix_palette(len(category.values)))
plt.show()

# #4. Top 10 products sold most in that six month period?

In [None]:
# Ten most frequent product names, sorted ascending by count.
product = df['product_name'].value_counts().nlargest(10).sort_values()
# Keyword x/y: seaborn >= 0.12 no longer accepts them positionally.
# The palette is sized from `product` itself, so this cell no longer depends
# on the `category` variable created for a different chart.
sns.barplot(x=product.values, y=product.index, palette=sns.cubehelix_palette(len(product.values)))
plt.xticks(rotation=45, ha='right')
plt.show()

# MORE

# Which day of the week has the most earnings?

In [None]:
# Mean retail price per day of the week, ordered from lowest to highest.
byday = df.groupby(by=df['crawl_timestamp'].dt.day_name())['retail_price'].mean().sort_values()
# Keyword x/y: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=byday.index, y=byday.values, palette=sns.cubehelix_palette(len(byday.values)))
plt.xticks(rotation=60)
plt.title("Average gain in retail prices - per day weekly")
plt.show()

# What does the customer look for?

In [None]:
# Ten most frequent first_click values, sorted ascending by count.
# (The original chained assignment `fc = data=...` also bound an unused
# `data` variable; only `fc` is needed.)
fc = df['first_click'].value_counts().nlargest(10).sort_values()

# Keyword x/y: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=fc.values, y=fc.index,
            palette=sns.cubehelix_palette(len(fc.values)))

plt.title('First category look')
plt.show()