In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

In [None]:
from geopy.geocoders import Nominatim
import folium
import folium.plugins as plugins

# Take a Quick Look at the Data Structure

In [None]:
# Load the raw sales CSV and preview its first rows.
df = pd.read_csv('../input/sales-analysis/data.csv')
df.head()

In [None]:
def missing_zero_values_table(df):
    """Summarise the missing values of every column of ``df``.

    Prints the frame's shape and the number of columns that contain at
    least one missing value, then returns one row per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Columns ``index`` (the column name of ``df``), ``Missing Values``
        (count), ``% of Total Values`` (rounded to 2 decimals) and
        ``Data_type``, sorted by percentage, highest first.
    """
    mis_val = df.isnull().sum()
    mis_val_percent = round(df.isnull().mean().mul(100), 2)
    mz_table = pd.concat([mis_val, mis_val_percent], axis=1,
                         keys=['Missing Values', '% of Total Values'])
    mz_table['Data_type'] = df.dtypes
    mz_table = mz_table.sort_values('% of Total Values', ascending=False)

    # Count from the raw counts, not the rounded percentage: a column with
    # very few missing cells rounds to 0.00 % but is still incomplete.
    n_incomplete = int((mis_val > 0).sum())
    print("Your selected dataframe has ** "+str(df.shape[1])+" ** columns and ** "+str(df.shape[0])+" ** Rows.\n"
          "There are ** "+str(n_incomplete)+
          " ** columns that have missing values. \n")
    return mz_table.reset_index()

In [None]:
# Missing-value summary of the raw frame.
missing_zero_values_table(df)

The table shows that the data contains missing values.

In [None]:
# Print the distinct values of the columns that should hold numbers or dates,
# to spot entries that do not belong there.
lisCol = ['Quantity Ordered','Price Each','Order Date']
for name in lisCol:
    distinct_values = df[name].unique()
    print(name, ': ', distinct_values, '\n')

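**Quantity Ordered**: the text "Quantity Ordered" appears among the values of the Quantity Ordered column.  
**Price Each**: the text "Price Each" appears among the values of the Price Each column.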

In [None]:
# Work on a copy so the raw frame `df` stays untouched.
df2 = df.copy()

# Exploratory data analysis and data cleaning

In [None]:
# Discard rows in which every column is missing.
df2 = df2.dropna(how='all')

Remove missing values

In [None]:
# Rows whose value equals the column name itself are not real orders.
filterQuantity = df2['Quantity Ordered'] == 'Quantity Ordered'
filterPrice = df2['Price Each'] == 'Price Each'

# Apply both masks in a single step. Filtering twice in a row would index the
# already-filtered frame with a mask built on the unfiltered one, which makes
# pandas re-align the mask and emit a warning.
df2 = df2[~(filterQuantity | filterPrice)]

Remove string values in numerical columns

In [None]:
# Re-check missing values after the cleaning steps above.
missing_zero_values_table(df2)

Change the data types of  
Quantity Ordered, Price Each, and Order Date

In [None]:
# Cast quantity and price to numeric dtypes and parse the order timestamp.
df2 = (
    df2
    .astype({'Quantity Ordered': 'int64', 'Price Each': 'float64'})
    .assign(**{'Order Date': lambda frame: pd.to_datetime(frame['Order Date'])})
)

In [None]:
# Confirm the column dtypes after the casts.
df2.dtypes

Total price of each order line.  
For example, if one person buys 2 packs of butter and butter costs 2 dollars each,  
we must calculate the total sale of $4 and add it to the data frame.  

In [None]:
# Revenue of each order line = unit price x units ordered.
# The column is spelled 'salces'; the later cells refer to it by that name.
df2['salces'] = df2['Price Each'].mul(df2['Quantity Ordered'])
df2.head()

Example: "917 1st St, Dallas, TX 75001"  
I want to extract the city name  
and then look up its latitude and longitude.

In [None]:
def city(x):
    """Return the city part of a comma-separated purchase address.

    The city is the second comma-separated field, e.g.
    ``"917 1st St, Dallas, TX 75001"`` -> ``"Dallas"``.  Surrounding
    whitespace is removed so the value is a clean label and lookup key.

    Parameters
    ----------
    x : str
        Address with at least two comma-separated fields.

    Returns
    -------
    str
        The city name without leading or trailing whitespace.

    Raises
    ------
    IndexError
        If the address contains no comma.
    """
    return x.split(',')[1].strip()

In [None]:
# Derive the city of every order from its purchase address.
df2['city'] = df2['Purchase Address'].map(city)

In [None]:
# One row per distinct city; coordinates are added in the following cells.
locations = pd.DataFrame(df2['city'].unique(), columns=['city'])

In [None]:
# Geocoder client used to look up city coordinates.
# NOTE(review): 'app' is a very generic user agent; confirm it meets the
# geocoding service's usage policy.
geolocation = Nominatim(user_agent='app')

In [None]:
# Look up a (latitude, longitude) pair for every distinct city.
lat_lon = []
for city_name in locations.city:
    location = geolocation.geocode(city_name)
    if location is None:
        # Store a pair even for a failed lookup, so that every entry can be
        # unpacked into two coordinates; a bare NaN cannot be unpacked.
        lat_lon.append((np.nan, np.nan))
    else:
        lat_lon.append((location.latitude, location.longitude))

In [None]:
# Remove the geocoding loop's leftover variable from the notebook namespace.
del location

In [None]:
# Store the geocoding results next to their city and display the table.
locations['geo_loc'] = lat_lon
locations

In [None]:
# Show the geocoding results as a NumPy array.
np.array(locations.geo_loc)

In [None]:
# Split the (lat, lon) pairs into one tuple of latitudes and one of longitudes.
lat, lon = zip(*locations['geo_loc'].tolist())

In [None]:
# Add the coordinates as two separate columns.
locations = locations.assign(lat=lat, lon=lon)

In [None]:
# The combined pair column is no longer needed once lat/lon exist.
locations = locations.drop(columns=['geo_loc'])
locations

Merge the `locations` table into `df2`

In [None]:
# Attach the coordinates to every order. Drop only the rows whose city has no
# coordinates; a bare dropna() would also discard rows that merely have a
# missing value in some unrelated column.
df2 = df2.merge(locations, on='city', how='left').dropna(subset=['lat', 'lon'])

In [None]:
# Preview the frame with the new coordinate columns.
df2.head()

Sales Analysis per month

In [None]:
# Total of the 'salces' column per calendar month.
monthly_sales = df2.resample('M', on='Order Date')['salces'].sum()
px.bar(
    x=monthly_sales.index,
    y=monthly_sales,
    title="Sales Analysis by month",
    labels={'y':'Total Sales','x':'Month'},
)

As expected, sales were highest at the end of the year.    
The 2020 sales are so low that you must zoom in to see them.

Sales analysis per day

In [None]:
# Total of the 'salces' column per calendar day.
daily_sales = df2.resample('D', on='Order Date')['salces'].sum()
px.line(
    x=daily_sales.index,
    y=daily_sales,
    title="Analysis sales by days",
    labels={'y':'Total Sales','x':'Day'},
)

December 2019 has the highest sales.  
2020 does not get off to a great start.

Extract hour from Order Date column

In [None]:
# Hour of the order as a zero-padded two-digit string ("%H" format).
df2['hour'] = df2['Order Date'].dt.strftime("%H")

In [None]:
# Number of order lines placed in each hour of the day (computed once and
# reused for the bar heights and the bar labels).
orders_per_hour = df2.groupby('hour')['salces'].count()
fig = px.bar(x=orders_per_hour.index,
             y=orders_per_hour,
             title="Sales Analysis by hours",
             # The bars show a row count, not revenue, so label them as such.
             labels={'y':'Number of Orders','x':'Hours'},
             text=orders_per_hour)
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()

In [None]:
# Preview the frame with the new 'hour' column.
df2.head()

In [None]:
# Weekday name (e.g. 'Monday') of every order.
df2['day'] = df2['Order Date'].dt.day_name()

Sales analysis by hour for each day of the week

In [None]:
# Sum of 'salces' per hour (rows) and weekday (columns); reset_index turns
# 'hour' back into a regular column.
hoursWeek = df2.groupby(['hour','day'])['salces'].sum().unstack().reset_index()

In [None]:
# Grouped horizontal bars: one trace per weekday, one bar per hour.
# NOTE(review): only six widths are listed here; confirm that this matches
# the number of bars in each trace.
anchos = [0.2] * 6
fig = go.Figure()

# The traces are added in the same order as before so the legend order and
# the colour assignment stay unchanged.
for day_name in ('Friday', 'Sunday', 'Tuesday', 'Monday',
                 'Wednesday', 'Thursday', 'Saturday'):
    fig.add_trace(go.Bar(y = hoursWeek['hour'],
                         x = hoursWeek[day_name],
                         width = anchos, name = day_name,
                         text = hoursWeek[day_name], orientation='h'))

fig.update_layout(title =  "Analysis sale by hours in per days",
                  barmode = 'group',title_font_size = 40,
                  width = 1600, height = 1400)

fig.update_layout(legend=go.layout.Legend(
            x=1,y=1,
            traceorder= "normal",
            font=dict(family="Verdana",size= 22, color = "black")))
fig.update_traces(texttemplate='%{text:.2f}',textposition='outside',
            textfont=dict(size=60, family='Verdana', color='black'))
# The bars are horizontal: the x axis carries the sales totals and the y axis
# carries the hour of the day.
fig.update_xaxes(title_text = 'Sales',
           title_font=dict(size=30,family='Verdana',color='black'), 
           tickfont=dict(family='Calibri', color='darkred',size=25))
fig.update_yaxes(title_text = "Hour", 
           title_font=dict(size=30,family='Verdana',color='black'), 
           tickfont=dict(family='Calibri', color='darkred',size=25))

fig.show()

In [None]:
# Wide-form horizontal bar chart: one series per weekday column.
# 'hour' is the y axis, so it must not also be passed as one of the x series.
px.bar(hoursWeek, y='hour',
       x=hoursWeek.columns.drop('hour'),
       orientation='h')

Sales analysis by location

In [None]:
def generatebasemap(default_location=None,default_zoom=5,control_scale=True):
    """Create the folium base map used for the location analysis.

    Parameters
    ----------
    default_location : list of float, optional
        ``[latitude, longitude]`` of the map centre.  When omitted, the mean
        latitude and longitude of ``df2`` at call time are used.
    default_zoom : int
        Initial zoom level.
    control_scale : bool
        Forwarded to ``folium.Map`` as ``control_scale``.

    Returns
    -------
    folium.Map
    """
    if default_location is None:
        # Resolve the centre when the map is built, not when the function is
        # defined, so it reflects the current contents of df2.
        default_location = [df2['lat'].mean(), df2['lon'].mean()]
    basemap = folium.Map(location=default_location,
                         zoom_start=default_zoom,
                         control_scale=control_scale)
    return basemap

In [None]:
# One circle per distinct coordinate pair, with the number of order rows at
# that location in its popup. The counts come from a single groupby instead
# of re-filtering the whole frame for every location.
basemap = generatebasemap()
orders_per_location = df2.groupby(['lat','lon']).size()
for (circle_lat, circle_lon), n_orders in orders_per_location.items():
    folium.Circle([float(circle_lat), float(circle_lon)], 150000, fill=True).add_child(
        folium.Popup(f'Total {n_orders}')
    ).add_to(basemap)


In [None]:
# Render the map with the circles added above.
basemap

In [None]:
# Order count (bars, left axis) and total sales (line, right axis) per city.
sales_by_city = df2.groupby('city')
orders_per_city = sales_by_city.size()
revenue_per_city = sales_by_city['salces'].sum()

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
    go.Bar(
        x = orders_per_city.index,
        y = orders_per_city,
        name = 'Count'
    ),secondary_y=False,
)
fig.add_trace(
    go.Scatter(
        x = revenue_per_city.index,
        y = revenue_per_city,
        name = 'Total sales'
    ),secondary_y=True,
)
fig.update_layout(
    title_text="Analysis sales by locations"
)

fig.update_xaxes(title_text="City Names")

# Axis titles name the plotted quantity only (no leftover placeholder text).
fig.update_yaxes(title_text="<b>Count</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>Total sales</b>", secondary_y=True)

fig.show()

It shows that as the number of orders in a city goes up, its total sales go up as well.

Sales analysis by product

In [None]:
# Units ordered per product, shown as labelled bars.
units_per_product = df2.groupby('Product')['Quantity Ordered'].sum()
fig = px.bar(
    x=units_per_product.index,
    y=units_per_product,
    title="Sale analysis by Product",
    labels={'y':'Total Sales','x':'Product'},
    text=units_per_product,
)
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()

Batteries have the highest sales.  
Let's find out why.

In [None]:
# Per-product totals and mean unit price, taken from a single grouping.
by_product = df2.groupby('Product')
quantity = by_product['Quantity Ordered'].sum()
price = by_product['Price Each'].mean()
product = quantity.index

In [None]:
# Units ordered (bars, left axis) against mean unit price (line, right axis).
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Name both traces so the legend is readable, as in the per-city figure.
fig.add_trace(
    go.Bar(
        x=product,
        y=quantity,
        name='Count'),secondary_y=False,
)

fig.add_trace(
    go.Scatter(
        x=product,
        y=price,
        name='Price'
    ),secondary_y=True,
)

fig.update_layout(
    title_text="Analysis price product"
)

fig.update_xaxes(title_text="Product")

# Axis titles name the plotted quantity only (no leftover placeholder text).
fig.update_yaxes(title_text="<b>Count</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>Price</b>", secondary_y=True)

fig.show()

As I guessed, it has the highest sales because its price is low.

Analysis of the most popular product combinations

In [None]:
# Keep only the rows of orders whose Order ID occurs more than once.
# .copy() makes df3 an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning.
df3 = df2[df2['Order ID'].duplicated(keep=False)].copy()
df3.head()

In [None]:
# For every row, the comma-joined list of all products in the same order.
products_by_order = df3.groupby('Order ID')['Product']
df3['Grouped'] = products_by_order.transform(','.join)

In [None]:
# Preview the frame with the new 'Grouped' column.
df3.head()

In [None]:
# Keep the first row of every order so each combination is counted once.
is_repeat_row = df3['Order ID'].duplicated()
df3 = df3[~is_repeat_row]
df3.head()

In [None]:
# Five most frequent product combinations. The counts are passed directly;
# the raw frame `df` is not involved, so it is no longer handed to px.pie.
top_combinations = df3['Grouped'].value_counts().head(5)
px.pie(values=top_combinations.to_numpy(), names=top_combinations.index)

These are the most popular product combinations in this dataset.

Work in Progress