In [None]:
#import the necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw dataset from the CSV file (the path is relative to the notebook's working directory).
df = pd.read_csv('../input/sales-forecasting/train.csv')

In [None]:
# Peek at the first rows to check that the file was parsed into sensible columns.
df.head()

In [None]:
# (number of rows, number of columns) of the raw frame.
df.shape

In [None]:
# Summary statistics of the raw frame.
df.describe()

In [None]:
# List the column names before deciding which ones to drop.
df.columns

# Dropping useless columns

In [None]:
# Work on a new frame without the identifier columns; the raw `df` stays untouched.
identifier_columns = ['Row ID', 'Order ID', 'Customer ID', 'Product ID']
df1 = df.drop(columns=identifier_columns)

df1.head()

# Checking for Null values


In [None]:
# Number of missing values in each column.
df1.isna().sum()

The `Postal Code` column contains some null values.

###### Let's find those rows

In [None]:
# Show the rows whose "Postal Code" is missing.
df1.loc[df1["Postal Code"].isna()]

###### According to a quick Google search, the postal code for these Vermont rows is 05401 (5401 once stored as a number), so let's fill the missing values with it


In [None]:
# Fill every missing postal code with the Vermont value quoted in the note above.
VERMONT_POSTAL_CODE = 5401
df1['Postal Code'] = df1['Postal Code'].fillna(VERMONT_POSTAL_CODE)

In [None]:
# Re-count the missing values to confirm the fill worked.
df1.isna().sum()

###### For the analysis, let's create a `Month` column

In [None]:
# Month as text: characters 3-4 of "Order Date" (assumes dd/mm/yyyy strings - TODO confirm against the CSV).
df1["Month"] = df1["Order Date"].str.slice(3, 5)

In [None]:
# The sliced month is still text; cast it to a 32-bit integer so it can key the month-name lookup.
df1["Month"] = df1["Month"].astype(np.int32)

df1

In [None]:
# Replace each month number (1-12) with its English month name.
d = dict(enumerate(
    ["January", "February", "March", "April", "May", "June", "July",
     "August", "September", "October", "November", "December"],
    start=1,
))

# A plain lookup (rather than .get) so an unexpected month number fails loudly.
df1["Month"] = [d[month_number] for month_number in df1["Month"]]

df1

In [None]:
# Convert the date columns to pandas datetimes.
# The "Month" column was built by slicing characters 3-4 of "Order Date", i.e. the text is
# day-first (dd/mm/yyyy). Without dayfirst=True pandas reads ambiguous dates such as
# 08/11/2017 month-first, which disagrees with that "Month" column.
df1["Order Date"] = pd.to_datetime(df1["Order Date"], dayfirst=True)

# NOTE(review): "Ship Date" presumably uses the same day-first layout - confirm against the CSV.
df1["Ship Date"] = pd.to_datetime(df1["Ship Date"], dayfirst=True)

df1

In [None]:
# Calendar year of each order, read from the parsed "Order Date".
df1 = df1.assign(Year=df1["Order Date"].dt.year)

df1

In [None]:
# Number of rows per year, most frequent year first.
df1["Year"].value_counts()

# Analyzing and Visualizing our Data

#### To get valuable insights

### Sales based on Month

In [None]:
# Total sales per month, largest first.
# Only "Sales" is aggregated: summing every column fails on the datetime columns under
# pandas >= 2.0, where groupby().sum() no longer drops non-numeric columns silently.
months_df = (
    df1.groupby("Month")[["Sales"]]
    .sum()
    .sort_values("Sales", ascending=False)
    .reset_index()
)
months_df

In [None]:
# Bar chart of total sales per month; bars follow the row order of months_df.
plt.bar(
    months_df["Month"],
    months_df["Sales"],
    color="#ffb861",
    edgecolor="green",
    linewidth=.5,
)

plt.title("Sales based on Month", size=25, pad=25)
plt.xlabel("Months", size=20)
plt.ylabel("Sales", size=20)

# Month names are long, so the x tick labels are rotated to stay readable.
plt.xticks(fontsize=10, rotation=90)
plt.yticks(fontsize=10)

plt.show()

### Sales based on Year

In [None]:
# Total sales per year, largest first (still indexed by "Year").
# Only "Sales" is aggregated: summing every column fails on the datetime columns under
# pandas >= 2.0, where groupby().sum() no longer drops non-numeric columns silently.
year_df = (
    df1.groupby("Year")[["Sales"]]
    .sum()
    .sort_values("Sales", ascending=False)
)

year_df

In [None]:
# Turn the "Year" index back into an ordinary column so it can be used as a plot axis.
year_df = year_df.reset_index()

year_df

In [None]:
# Bar chart of total sales per year.
sns.barplot(data=year_df, x="Year", y="Sales", palette="summer_r")

plt.title("Sales based on Year", size=25, pad=25)
plt.xlabel("Year", size=20)
plt.ylabel("Sales", size=20)

plt.show()

### Most Valuable Customers

In [None]:
# Count the distinct customer names.
df1["Customer Name"].nunique()

In [None]:
# The twenty customers with the highest total sales.
# numeric_only=True restricts the aggregation to the numeric columns; without it
# pandas >= 2.0 also tries to sum the datetime columns and raises.
customers = (
    df1.groupby("Customer Name")
    .sum(numeric_only=True)
    .sort_values("Sales", ascending=False)
    .head(20)
)

customers

In [None]:
# Summed postal codes and years are meaningless, so drop them, then expose
# "Customer Name" as a regular column for plotting.
customers = (
    customers
    .drop(columns=["Postal Code", "Year"])
    .reset_index()
)

customers

In [None]:
# Bar chart of the top customers by total sales.
plt.figure(figsize=(10, 5))

sns.barplot(data=customers, x="Customer Name", y="Sales", palette="YlOrRd_r")

plt.title("Our Most Valuable customers - Top 20", size=25, pad=25)
plt.xlabel("Customer Name", size=20)
plt.ylabel("Sales", size=20)

# Customer names are long, so the x tick labels are rotated.
plt.xticks(fontsize=12, rotation=90)
plt.yticks(fontsize=14)

plt.show()

In [None]:
# Builds the label callback used to annotate donut / pie wedges with their dollar value.
def make_autopct(values):
    """Return an ``autopct`` callback that labels a wedge with its absolute value.

    Parameters
    ----------
    values : iterable of numbers
        The same values that are handed to ``plt.pie``.

    Returns
    -------
    callable
        Takes the percentage of one wedge, converts it back to the underlying
        amount, rounds it to a whole number and returns it as ``"$<amount>"``.
    """
    # Sum once up front: the total is the same for every wedge, and a one-shot
    # iterable would otherwise be exhausted after the first wedge was labelled.
    total = sum(values)

    def my_autopct(pct):
        val = int(round(pct * total / 100.0))
        return "${v:d}".format(v=val)

    return my_autopct

### Sales based on Segment

In [None]:
# Distinct customer segments present in the data.
df1["Segment"].unique()

In [None]:
# Total sales per customer segment.
# Only "Sales" is aggregated: summing every column fails on the datetime columns under
# pandas >= 2.0, where groupby().sum() no longer drops non-numeric columns silently.
segment = df1.groupby("Segment")[["Sales"]].sum().reset_index()

segment

In [None]:
# Total revenue of the store: the segment totals added up, truncated to whole
# dollars and formatted with a leading "$" for use as a chart label.
total_revenue = "$" + str(int(segment["Sales"].sum()))

In [None]:
# Display the formatted total revenue string.
total_revenue

In [None]:
# Donut chart of sales per segment: a pie with a smaller white pie drawn on top of it.
center = [5]

plt.pie(
    segment['Sales'],
    labels=segment['Segment'],
    colors=["#7CFC00", "#7FFF00", "#ADFF2F"],
    radius=1.5,
    explode=[0.05, 0.05, 0.05],
    autopct=make_autopct(segment["Sales"]),
)

# The single-wedge white pie covers the middle and produces the "hole".
plt.pie(center, radius=1.25, colors="w")

# Total revenue is written in the middle of the hole.
label = plt.annotate(
    'Total Revenue \n' + total_revenue,
    color='red',
    xy=(0, 0),
    fontsize=12,
    ha="center",
)
plt.tight_layout()

plt.title("The sales based on the Segment", pad=60, size=25, color="green")
plt.show()

### Sales based on City

In [None]:
# The fifteen cities with the highest total sales.
# numeric_only=True restricts the aggregation to the numeric columns; without it
# pandas >= 2.0 also tries to sum the datetime columns and raises.
city = df1.groupby("City").sum(numeric_only=True).sort_values("Sales", ascending = False).head(15)

In [None]:
# Keep only the revenue (rounded to cents) and expose "City" as a regular column.
city = city[["Sales"]].round(2).reset_index()

In [None]:
# Display the per-city revenue table.
city

In [None]:
# Bar chart of revenue for the top cities.
plt.figure(figsize=(15, 5))

plt.bar(
    city["City"],
    city["Sales"],
    color='#89ff61',
    edgecolor='blue',
    linewidth=1,
)

plt.title("Top 15 Cities which generated Highest Revenue", fontsize=18)
plt.xlabel("Cities", fontsize=15)
plt.ylabel("Revenue", fontsize=15)

# City names are rotated so they do not overlap.
plt.xticks(fontsize=12, rotation=90)
plt.yticks(fontsize=12)

plt.show()

### Sales Based on State

In [None]:
# The ten states with the highest total sales, rounded to cents.
# Only "Sales" is aggregated: summing every column fails on the datetime columns under
# pandas >= 2.0, where groupby().sum() no longer drops non-numeric columns silently.
state = (
    df1.groupby("State")[["Sales"]]
    .sum()
    .sort_values("Sales", ascending=False)
    .head(10)
    .round(2)
    .reset_index()
)

state

In [None]:
# Bar chart of revenue for the top states, with each bar labelled with its value.
plt.figure(figsize = (15,5))

plt.title("Top 10 States which generated Highest Revenue", fontsize=18)

plt.bar(state["State"], state["Sales"],color= '#FF6F61',edgecolor='black', linewidth = 1)

plt.xlabel("States",fontsize=15) # x axis shows the States

plt.ylabel("Revenue",fontsize=15) # y axis shows the Revenue

plt.xticks(fontsize=12, rotation=90)

plt.yticks(fontsize=12)

# `state` carries a 0..n-1 index after reset_index, which lines up with the bar positions.
# The loop previously referred to an undefined name `v`; the value is the loop variable.
for position, revenue in state["Sales"].items():
    # Tall bars have room for the label inside the bar; short ones get it just above.
    if revenue > 400000:
        label_height = revenue - 150000
    else:
        label_height = revenue + 15000
    plt.text(position, label_height, '$' + str(revenue), fontsize=12, rotation=90,
             color='k', horizontalalignment='center')

plt.show()

### Sales based on Region

In [None]:
# Total sales per region, rounded to cents.
# Only "Sales" is aggregated: summing every column fails on the datetime columns under
# pandas >= 2.0, where groupby().sum() no longer drops non-numeric columns silently.
region = df1.groupby("Region")[["Sales"]].sum().round(2).reset_index()

region

In [None]:
# Bar chart of revenue per region, with each bar labelled with its value.
plt.figure(figsize = (10,5))

plt.title("Region-wise Revenue Generation", fontsize=18)

plt.bar(region["Region"], region["Sales"],color= '#fff461',edgecolor='Red', linewidth = 1)

plt.xlabel("Region",fontsize=15)

plt.ylabel("Revenue",fontsize=15)

plt.xticks(fontsize=12, rotation=90)

plt.yticks(fontsize=12)

# `region` carries a 0..n-1 index after reset_index, which lines up with the bar positions.
# The loop previously referred to an undefined name `v`; the value is the loop variable.
for position, revenue in region["Sales"].items():
    # The label sits inside the bar, a fixed offset below its top.
    plt.text(position, revenue - 150000, '$' + str(revenue), fontsize=12, color='k',
             horizontalalignment='center')

plt.show()

### Sales based on Ship mode

In [None]:
# Total sales per shipping mode.
# Only "Sales" is aggregated: summing every column fails on the datetime columns under
# pandas >= 2.0, where groupby().sum() no longer drops non-numeric columns silently.
shipmode = df1.groupby("Ship Mode")[["Sales"]].sum().reset_index()

shipmode

In [None]:
# Donut chart of sales per ship mode: a pie with a smaller white pie drawn on top of it.
center = [6]
plt.figure(figsize=(6, 6))

plt.pie(
    shipmode['Sales'],
    labels=shipmode['Ship Mode'],
    colors=['#1E90FF', '#00BFFF', '#87CEEB', "mediumblue"],
    radius=2,
    autopct=make_autopct(shipmode["Sales"]),
)

# The single-wedge white pie covers the middle and produces the "hole".
plt.pie(center, radius=1.7, colors="w")

plt.title("The sales based on the Ship Mode", pad=100, size=25, color="darkblue")

plt.tight_layout()
plt.show()

### Sales based on Categories

In [None]:
# Total sales per product category, largest first.
# Uses df1 like the rest of the analysis ("Sales" is identical in df and df1) and
# aggregates only "Sales" instead of summing every column of the raw frame.
category = (
    df1.groupby("Category")[["Sales"]]
    .sum()
    .sort_values("Sales", ascending=False)
    .reset_index()
)

# Revenue over all categories, truncated to whole dollars and prefixed with "$"
# so it can be used as a chart label.
total_revenue_category = '$' + str(int(category["Sales"].sum()))

category

In [None]:
# Donut chart of sales per category: a pie with a smaller white pie drawn on top of it.
center = [6]

plt.pie(
    category['Sales'],
    labels=category['Category'],
    colors=["indianred", "lightcoral", "darkred"],
    radius=1.5,
    autopct=make_autopct(category["Sales"]),
)

# The single-wedge white pie covers the middle and produces the "hole".
plt.pie(center, radius=1.3, colors="w")

# Revenue over all categories is written in the middle of the hole.
plt.annotate(
    'Total Revenue \n' + str(total_revenue_category),
    color="purple",
    xy=(0, 0),
    fontsize=12,
    ha="center",
)

plt.title("The sales based on the Categories", pad=50, size=25, color="darkred")

plt.tight_layout()
plt.show()

### Sales based on Sub-Categories

In [None]:
# The five sub-categories with the highest total sales, rounded to cents.
# Only "Sales" is aggregated: summing every column fails on the datetime columns under
# pandas >= 2.0, where groupby().sum() no longer drops non-numeric columns silently.
subca = (
    df1.groupby("Sub-Category")[["Sales"]]
    .sum()
    .sort_values("Sales", ascending=False)
    .head()
    .round(2)
    .reset_index()
)

subca

In [None]:
# Bar chart of revenue for the top sub-categories, with each bar labelled with its value.
plt.figure(figsize = (10,5))

plt.title("Top 5 Sub Categories Based on Revenue", fontsize=18)

plt.bar(subca["Sub-Category"], subca["Sales"],color= '#61ffe7',edgecolor='Red', linewidth = 1)

plt.xlabel("Sub - Category",fontsize=15)

plt.ylabel("Revenue",fontsize=15)

plt.xticks(fontsize=12, rotation=90)

plt.yticks(fontsize=12)

# `subca` carries a 0..n-1 index after reset_index, which lines up with the bar positions.
# The loop previously referred to an undefined name `v`; the value is the loop variable.
for position, revenue in subca["Sales"].items():
    # The label sits inside the bar, a fixed offset below its top.
    plt.text(position, revenue - 30000, '$' + str(revenue), fontsize=12, color='k',
             horizontalalignment='center')

plt.show()

### Sales based on Products


In [None]:
# Count the distinct product names.
df1["Product Name"].nunique()

In [None]:
# The five products with the highest total sales, rounded to cents.
# Only "Sales" is aggregated: summing every column fails on the datetime columns under
# pandas >= 2.0, where groupby().sum() no longer drops non-numeric columns silently.
prona = (
    df1.groupby("Product Name")[["Sales"]]
    .sum()
    .sort_values("Sales", ascending=False)
    .head()
    .round(2)
    .reset_index()
)

prona

In [None]:
# Donut chart of the best-selling products: a pie with a smaller white pie drawn on top of it.
center = [6]

# One offset per wedge, so the chart still works if fewer products are available.
explode = [0.05] * len(prona)
plt.pie(prona['Sales'], labels = prona['Product Name'], radius = 1.5,
        colors = ['#ff9999','#66b3ff','#99ff99','#ffcc99','#55B4B0'],
        # Wedge labels must come from the product figures; the category totals were
        # passed here before, which printed the wrong dollar amounts.
        autopct = make_autopct(prona["Sales"]), explode = explode)

# The single-wedge white pie covers the middle and produces the "hole".
plt.pie(center, radius = 1.3, colors = "w")

# Centre label: revenue of the products shown, so it agrees with the wedge labels
# (the all-category total was shown here before).
top_products_revenue = '$' + str(int(prona["Sales"].sum()))
plt.annotate('Total Revenue \n' + top_products_revenue, color = "purple", xy=(0, 0), fontsize=12, ha="center")

# `prona` holds five products (DataFrame.head() default), so the title says 5, not 10.
plt.title("The Top 5 Products, which generated high revenue", pad = 70, size = 25, color = "#9607f5")

plt.tight_layout()
plt.show()

### Choropleth Map

In [None]:
# (state name, two-letter code) pairs, kept side by side so a name and its code
# cannot drift apart; the two parallel lists used later are derived from them.
_state_pairs = [
    ('Alabama', 'AL'), ('Arizona', 'AZ'), ('Arkansas', 'AR'), ('California', 'CA'),
    ('Colorado', 'CO'), ('Connecticut', 'CT'), ('Delaware', 'DE'), ('Florida', 'FL'),
    ('Georgia', 'GA'), ('Hawaii', 'HI'), ('Idaho', 'ID'), ('Illinois', 'IL'),
    ('Indiana', 'IN'), ('Iowa', 'IA'), ('Kansas', 'KS'), ('Kentucky', 'KY'),
    ('Louisiana', 'LA'), ('Maine', 'ME'), ('Maryland', 'MD'), ('Massachusetts', 'MA'),
    ('Michigan', 'MI'), ('Minnesota', 'MN'), ('Mississippi', 'MS'), ('Missouri', 'MO'),
    ('Montana', 'MT'), ('Nebraska', 'NE'), ('Nevada', 'NV'), ('New Hampshire', 'NH'),
    ('New Jersey', 'NJ'), ('New Mexico', 'NM'), ('New York', 'NY'), ('North Carolina', 'NC'),
    ('North Dakota', 'ND'), ('Ohio', 'OH'), ('Oklahoma', 'OK'), ('Oregon', 'OR'),
    ('Pennsylvania', 'PA'), ('Rhode Island', 'RI'), ('South Carolina', 'SC'), ('South Dakota', 'SD'),
    ('Tennessee', 'TN'), ('Texas', 'TX'), ('Utah', 'UT'), ('Vermont', 'VT'),
    ('Virginia', 'VA'), ('Washington', 'WA'), ('West Virginia', 'WV'), ('Wisconsin', 'WI'),
    ('Wyoming', 'WY'),
]
state = [name for name, _ in _state_pairs]
state_code = [code for _, code in _state_pairs]

In [None]:
# Combine the two parallel lists into a lookup table with one row per state.
state = pd.DataFrame({'State Code': state_code, 'State': state})

state.head()

In [None]:
# Revenue per state for the map, largest first.
# Only "Sales" is aggregated (summed row ids and postal codes are meaningless), and df1 is
# used like the rest of the analysis; "Sales" is identical in df and df1.
sales = (
    df1.groupby("State")[["Sales"]]
    .sum()
    .sort_values("Sales", ascending=False)
    .reset_index()
)

sales.head(10)

In [None]:
# Order the states alphabetically; reset_index keeps the old row labels in an "index" column.
sales = sales.sort_values('State').reset_index()

sales.head()

In [None]:
# Remove the leftover "index" column. The keyword form is required because the positional
# axis argument of DataFrame.drop was removed in pandas 2.0. errors='ignore' plus dropping
# any earlier "State Code" column makes the cell safe to re-run.
sales = sales.drop(columns=['index', 'State Code'], errors='ignore')

# Look each code up by state name instead of relying on row position: positional alignment
# silently assigns wrong codes as soon as the data and the lookup table do not list exactly
# the same states in the same order. States missing from the lookup table get NaN.
code_by_state = state.set_index('State')['State Code']
sales.insert(1, 'State Code', sales['State'].map(code_by_state))

sales.head()

In [None]:
import plotly.graph_objects as go

# Copy of the state names, passed to the trace as per-location text.
sales['states'] = sales['State']

# One choropleth trace: states are identified by their two-letter code and coloured by revenue.
state_sales_trace = go.Choropleth(
    locations=sales['State Code'],
    locationmode='USA-states',        # tells plotly how to interpret `locations`
    z=sales['Sales'].astype(float),   # values that drive the colour scale
    text=sales['states'],
    colorscale='darkmint',
    colorbar_title="Sales",
)

usa = go.Figure(data=state_sales_trace)

# Restrict the map to the USA and give it a title.
usa.update_layout(title_text='State wise Sales', geo_scope='usa')

usa.show();