# OBJECTIVES 

Upon initial inspection of the data, a few important questions arise:

* What is the overall trend of the sales?
* What are the top 10 products by sales?
* What are the best-selling products?
* Which is the most preferred Ship Mode?
* Which category and sub-category are the most profitable?



# LIBRARIES

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import os
import time
from itertools import product
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn import base

In [None]:
# Load the raw training data from the Kaggle input directory.
data = pd.read_csv('../input/sales-forecasting/train.csv')

In [None]:
# Preview the first rows of the raw frame.
data.head()

# DATA CLEANING AND MANIPULATION 

In [None]:
# (rows, columns) of the raw frame, before any rows are dropped.
data.shape

Columns


In [None]:
# Column labels of the raw frame.
data.columns

In [None]:
# Summarise dtypes and non-null counts per column.
# `info` is a method: without the call parentheses the cell only shows the
# bound-method repr instead of the summary.
data.info()

In [None]:
# Number of missing values in each column of the raw frame.
data.isna().sum()

In [None]:
# Keep only the rows that have no missing value in any column.
clean = data.dropna(axis=0, how='any')

In [None]:
# (rows, columns) after dropping incomplete rows.
clean.shape

In [None]:
# Missing-value count per column of the cleaned frame.
clean.isna().sum()

In [None]:
# Descriptive statistics for the cleaned frame.
clean.describe()

# Multi-Table analysis of the data to help summarize the data fast

In [None]:
from IPython.core.display import HTML

def multi_table(table_list):
    """Lay out several tables side by side in a single HTML table row.

    Every element of ``table_list`` must provide a ``_repr_html_()`` method.
    Its markup is wrapped in a ``<td>`` cell, and all cells are placed in one
    row with a white background. The result is returned as an ``HTML`` object.
    """
    cells = []
    for table in table_list:
        cells.append('<td>' + table._repr_html_() + '</td>')
    row_markup = ''.join(cells)
    return HTML('<table><tr style="background-color:white;">' + row_markup + '</tr></table>')

In [None]:
# Column labels of the cleaned frame.
clean.columns

In [None]:
# Columns whose value frequencies are tabulated.
# A list (rather than a set literal) keeps the dictionary's key order the same
# on every run.
SUMMARY_COLUMNS = [
    'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
    'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State',
    'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category',
    'Product Name', 'Sales',
]

# Map each column name to a one-column DataFrame of its value counts.
clean_nunique = {var: clean[var].value_counts().to_frame() for var in SUMMARY_COLUMNS}

In [None]:
# Show the per-column frequency tables side by side, in this column order.
multi_table([
    clean_nunique[column]
    for column in [
        'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
        'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State',
        'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category',
        'Product Name', 'Sales',
    ]
])

# INVESTIGATING THE TREND OF THE SALES 

In [None]:
# Column labels of the cleaned frame, listed again for reference before the trend analysis.
clean.columns

In [None]:
# Inspect the order-date column of the cleaned frame.
clean['Order Date']

In [None]:
# Latest order date. The CSV was read without date parsing, so the column is
# presumably still text; parse it first so that max() compares dates rather
# than strings.
# NOTE(review): pandas infers the day/month order here - confirm it matches the CSV.
pd.to_datetime(clean['Order Date']).max()

In [None]:
# Month labels and line styling used when plotting the monthly sales trend.
MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
LINEWIDTH=2
ALPHA=.6

# Work on an independent copy so the derived date columns are not added to `clean`.
dfp = clean[['Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State',
       'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category',
       'Product Name', 'Sales']].copy()

# Extract the year and the month from the date column into independent columns.
# NOTE(review): no format/dayfirst is given, so pandas infers the day/month
# order - confirm that this matches the CSV.
dfp['Order Date']  = pd.to_datetime(dfp['Order Date'])
# Vectorised .dt accessor instead of a per-row Python lambda.
dfp['year_month']  = dfp['Order Date'].dt.strftime('%Y-%m')
dfp['year']  = dfp['Order Date'].dt.year
dfp['month'] = dfp['Order Date'].dt.month
# Preview only the first rows rather than rendering the whole frame.
dfp.head()

# Grouping the data by month to get monthly sales 

In [None]:
# Total sales per (year, month).
# Select 'Sales' before aggregating: summing the whole frame would also try to
# sum the text and datetime columns, which is wasteful and raises on recent
# pandas versions.
dfp_trend = dfp.groupby(['year', 'month'])['Sales'].sum().reset_index()

In [None]:
# Preview the first rows of the monthly sales table.
dfp_trend.head()

In [None]:
# Monthly sales rows for the year 2015 only.
dfp_trend.loc[dfp_trend['year'] == 2015]

# SALES PER MONTH FOR EACH YEAR FROM 2015 TO 2018 

In [None]:
# Line colour for each plotted year; dict order is the plotting and legend order.
YEAR_COLORS = {2015: 'steelblue', 2016: 'seagreen', 2017: 'pink', 2018: 'blue'}

plt.figure(figsize=(16,6))
for year, color in YEAR_COLORS.items():
    # Align each year's sales to calendar months 1..12 so that a year with
    # missing months is drawn with a gap instead of raising a length mismatch
    # against MONTHS.
    monthly_sales = (
        dfp_trend[dfp_trend.year == year]
        .set_index('month')['Sales']
        .reindex(range(1, 13))
    )
    plt.plot(MONTHS, monthly_sales.values, '-o', color=color,
             linewidth=LINEWIDTH, alpha=ALPHA, label=str(year))

ax = plt.gca()
ax.set_title('Sales per month')
ax.set_xlabel('Month')
ax.set_ylabel('Sales in dollars')
ax.grid(axis='y', color='gray', alpha=.2)

# Hide the frame around the plot area.
for spine in ax.spines.values():
    spine.set_visible(False)
plt.legend(loc=2, title='Legend')
plt.show()


# Which are the top 10 products?

In [None]:
# Total sales per product, as a one-column DataFrame indexed by product name.
# Select 'Sales' before summing so that text and datetime columns are never
# aggregated (recent pandas versions raise on them).
Product_Sales = dfp.groupby('Product Name')[['Sales']].sum()

In [None]:
# Preview the first rows of the per-product sales table.
Product_Sales.head()

# TOP TEN PRODUCTS BY SALES 

In [None]:
# Rank products from highest to lowest total sales.
Sorted_Prod = Product_Sales.sort_values(by='Sales', ascending=False)

In [None]:
# Bar chart of the ten products with the highest total sales.
Sorted_Prod.head(10).plot.bar(figsize=(13, 6), color='red')

# WHICH ARE THE BEST-PERFORMING SEGMENTS 


In [None]:
# Number of order lines per customer segment.
# Pass the column by keyword: in current seaborn the first positional
# parameter is `data`, not `x`.
sns.countplot(x='Segment', data=dfp)

# MOST PREFERRED SHIP MODE 

In [None]:
# Number of order lines per ship mode.
# Pass the column by keyword: in current seaborn the first positional
# parameter is `data`, not `x`.
sns.countplot(x='Ship Mode', data=dfp)

# MOST SOLD PRODUCTS PER STATE 

In [None]:
# Total sales for every (state, product) pair, as a one-column DataFrame.
# Select 'Sales' before summing so that text and datetime columns are never
# aggregated (recent pandas versions raise on them).
STATE = dfp.groupby(['State', 'Product Name'])[['Sales']].sum()


In [None]:
# Display the per-(state, product) sales table.
STATE

In [None]:
# Rank (state, product) pairs from highest to lowest total sales.
Sorted_STATE = STATE.sort_values(by='Sales', ascending=False)

In [None]:
# Bar chart of the ten (state, product) pairs with the highest total sales.
Sorted_STATE.head(10).plot.bar(figsize=(13, 6), color='seagreen')

* THE CHART SHOWS THE TEN (STATE, PRODUCT) COMBINATIONS WITH THE HIGHEST TOTAL SALES, I.E. THE TOP-SELLING PRODUCTS AND THE STATE IN WHICH EACH WAS SOLD. 

# THE MOST PROFITABLE CATEGORIES AND SUB-CATEGORIES (MEASURED BY TOTAL SALES) 


In [None]:
# Total sales for every (category, sub-category) pair, as a one-column DataFrame.
# Select 'Sales' before summing so that text and datetime columns are never
# aggregated (recent pandas versions raise on them).
Categ_and_Sub = dfp.groupby(['Category', 'Sub-Category'])[['Sales']].sum()

In [None]:
# Rank (category, sub-category) pairs by total sales and chart all of them.
Sorted_Categs = Categ_and_Sub.sort_values(by='Sales', ascending=False)
Sorted_Categs.plot.bar(figsize=(13, 6), color='purple')

# THE MOST PROFITABLE CATEGORIES AND SUB-CATEGORIES PER STATE (MEASURED BY TOTAL SALES) 

In [None]:
# Total sales for every (category, sub-category, state) combination.
# Select 'Sales' before summing so that text and datetime columns are never
# aggregated (recent pandas versions raise on them).
Categ_and_Sub2 = dfp.groupby(['Category', 'Sub-Category', 'State'])[['Sales']].sum()

In [None]:
# Display the per-(category, sub-category, state) sales table.
Categ_and_Sub2

In [None]:
# Keep the ten (category, sub-category, state) combinations with the highest
# total sales and chart them.
Sorted_Categs2 = Categ_and_Sub2.sort_values(by='Sales', ascending=False).head(10)
Sorted_Categs2.plot.bar(figsize=(13, 6), color='thistle')