In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime
import plotly.graph_objs as go
from plotly.offline import iplot

# Use cufflinks in offline mode
import cufflinks
cufflinks.go_offline()

# Configure the global cufflinks theme
cufflinks.set_config_file(world_readable=True, theme='pearl', offline=True)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### What the data looks like

In [None]:
# Read the transactions CSV (decoded as ISO-8859-1) and preview the first rows.
raw_csv_path = '../input/onlineretail/OnlineRetail.csv'
data = pd.read_csv(raw_csv_path, encoding='ISO-8859-1')
data.head()

### Check how many missing values we have

In [None]:
# Number of missing values in each column.
data.isna().sum()

### Getting description about data

In [None]:
# Print the frame's summary (DataFrame.info) to inspect columns before cleaning.
data.info()

### Looking at the Quantity and UnitPrice columns. They contain negative numbers, which can't be used for analysis.

In [None]:
# Print summary statistics for the two numeric columns, each under its own header.
for column in ('Quantity', 'UnitPrice'):
    print(f'{column} column')
    print(data[column].describe())

### Getting rid of negative numbers in Quantity and UnitPrice columns

In [None]:
# Keep only rows where both Quantity and UnitPrice are strictly positive.
# .copy() turns the boolean-mask selection into an independent frame, so the
# column assignment that follows in a later cell (data['InvoiceDate'] = ...)
# does not raise pandas' SettingWithCopyWarning.
data = data[(data.Quantity > 0) & (data.UnitPrice > 0)].copy()

### We transform InvoiceDate to datetime format
### and get rid of rows with missing Customer IDs

In [None]:
# Drop rows without a CustomerID first, and take an explicit copy so that this
# and later column assignments (InvoiceDate, TotalPrice, Year, ...) write to an
# independent frame instead of triggering SettingWithCopyWarning.
data = data.dropna(subset=['CustomerID']).copy()
# Parse InvoiceDate into datetime64; done after the drop so fewer rows are parsed.
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])

### Checking missing dates in our sales

In [None]:
from datetime import datetime, date  # retained so these names stay bound for any later cell
# Every calendar day in the hard-coded analysis window.
datelist = pd.date_range(start="2010-12-01", end="2011-12-09")
# Distinct days on which at least one invoice row exists (time of day stripped).
days_with_sales = pd.DatetimeIndex(data['InvoiceDate']).normalize().unique()
# "Missing dates" are the calendar days that have no invoice at all. The check
# must go from the calendar to the data: testing each invoice row against the
# calendar (the previous approach) can never reveal a day without sales, and it
# looped in Python over every row.
missing_days = datelist.difference(days_with_sales)
for day in missing_days:
    print(day.date())

### Creating column with total revenue of every order.
### Creating columns for year and month when order is made

In [None]:
# Build the datetime index once and read both calendar fields from it.
invoice_index = pd.DatetimeIndex(data['InvoiceDate'])
# Revenue of each row: unit price times quantity.
data['TotalPrice'] = data['UnitPrice'] * data['Quantity']
data['Year'] = invoice_index.year
data['Month'] = invoice_index.month

### Getting knowledge about the number of invoices per month. The most active month is November.

In [None]:
# Count DISTINCT invoice numbers per (Year, Month). .count() would count rows,
# and an InvoiceNo can occur on more than one row, so it overstates the number
# of invoices. nunique() matches what the 2011 scatter cell uses for invoices.
invoices_per_month = data.groupby(['Year', 'Month']).InvoiceNo.nunique()
invoices_per_month.plot(kind='bar', title='Amount of invoices per month')

### Getting knowledge about amount of customers per month. It looks like previous graph.

In [None]:
# Count DISTINCT customers per (Year, Month). .count() counts non-null rows, so
# it reproduced the row-count chart above instead of measuring how many
# different customers bought in each month.
customers_per_month = data.groupby(['Year', 'Month']).CustomerID.nunique()
customers_per_month.plot(kind='bar', title='Amount of customers per month')

### Getting knowledge about total revenue per month. The best one is November (as expected, because November was the most active month for sales)

In [None]:
# Sum TotalPrice into monthly buckets keyed by InvoiceDate, then draw one bar per month.
monthly_revenue = (
    data[['InvoiceDate', 'TotalPrice']]
    .set_index('InvoiceDate')
    .resample('M')
    .sum()
    .reset_index()
)
px.bar(monthly_revenue, x='InvoiceDate', y='TotalPrice', title='Total Revenue per month')

In [None]:
# For each month of 2011: number of distinct invoices and total revenue,
# computed in a single grouped aggregation (columns: Month, InvoiceNo, TotalPrice).
monthly_2011 = (
    data[data.Year == 2011]
    .groupby('Month')
    .agg(InvoiceNo=('InvoiceNo', 'nunique'), TotalPrice=('TotalPrice', 'sum'))
    .reset_index()
)
px.scatter(
    monthly_2011,
    x='InvoiceNo',
    y='TotalPrice',
    hover_data=['Month'],
    title='Amount of invoices per month and total revenue distribution',
)

### TOP best 20 countries by revenue 

In [None]:
# Total revenue per country. groupby() returns countries in alphabetical order,
# so slicing [:20] picked the first 20 names, not the 20 highest earners;
# nlargest(20) selects by revenue as the title promises.
revenue_by_country = data.groupby('Country').TotalPrice.sum()
top_countries = revenue_by_country.nlargest(20).reset_index()
px.pie(top_countries, values='TotalPrice', names='Country',
       title='TOP BEST 20 COUNTRIES BY SALES')

### TOP worst 20 countries by revenue

In [None]:
# Total revenue per country. Slicing [20:] of the alphabetically ordered result
# returned "every country after the 20th name", which is unrelated to revenue;
# nsmallest(20) selects the 20 lowest-revenue countries as the title promises.
revenue_by_country = data.groupby('Country').TotalPrice.sum()
worst_countries = revenue_by_country.nsmallest(20).reset_index()
px.pie(worst_countries, values='TotalPrice', names='Country',
       title='THE WORST 20 COUNTRIES BY SALES')

### Revenue by week

In [None]:
# Sum TotalPrice into weekly buckets keyed by InvoiceDate, then draw one bar per week.
weekly_revenue = (
    data[['InvoiceDate', 'TotalPrice']]
    .set_index('InvoiceDate')
    .resample('W')
    .sum()
    .reset_index()
)
px.bar(weekly_revenue, x='InvoiceDate', y='TotalPrice')

In [None]:
# Hour of day of each invoice.
data['Hour'] = data['InvoiceDate'].dt.hour
# Series.dt.weekday numbers days Monday=0 ... Sunday=6, so 1 is Tuesday and must
# be labelled 'Tue'. It was labelled 'Thu', which made Tuesday look like a
# Thursday in the weekday chart.
weekday_labels = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thur', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
data['WeekDay'] = data['InvoiceDate'].dt.weekday.map(weekday_labels)

### Find the best and the worst weekdays by sales. The best is Thursday and the worst is Sunday.

In [None]:
# Total revenue per weekday label, one bar per label.
revenue_by_weekday = data.groupby('WeekDay')['TotalPrice'].sum().reset_index()
px.bar(revenue_by_weekday, x='WeekDay', y='TotalPrice')

### Find the best time for sales. It is 12 o'clock.

In [None]:
# Total revenue per hour of day, one bar per hour.
revenue_by_hour = data.groupby('Hour')['TotalPrice'].sum().reset_index()
px.bar(revenue_by_hour, x='Hour', y='TotalPrice')

# Summary
## * We should continue cooperation with EIRE, Germany, France, Belgium and change relationships with Saudi Arabia and RSA
## * The best sales month in 2011 was November
## * The worst sales months in 2011 were March and May
## * The best weekday for sales is Thursday and the worst is Sunday
## * The best time for sales is 12 o'clock and near it