# Sales Analysis

## Importing the Necessary Libraries

In [1]:
import pandas as pd
import os

## Task - 1

### Merge the 12 months of sale data into a single `.csv` file

In [2]:
ALL_MONTHS_DATA_FILE_NAME = "all_months_data.csv"

PATH_TO_ALL_SALES_DATA = os.path.join(".", "Datasets", "Sales_Data")

# Get a list of the .csv files in the PATH_TO_ALL_SALES_DATA folder
LIST_OF_MONTLY_SALES_DATA_FILES = os.listdir(PATH_TO_ALL_SALES_DATA)

if ALL_MONTHS_DATA_FILE_NAME in LIST_OF_MONTLY_SALES_DATA_FILES:
    LIST_OF_MONTLY_SALES_DATA_FILES.remove(ALL_MONTHS_DATA_FILE_NAME)

LIST_OF_MONTLY_SALES_DATA_FILES

['Sales_June_2019.csv',
 'Sales_April_2019.csv',
 'Sales_November_2019.csv',
 'Sales_February_2019.csv',
 'Sales_December_2019.csv',
 'Sales_September_2019.csv',
 'Sales_May_2019.csv',
 'Sales_August_2019.csv',
 'Sales_July_2019.csv',
 'Sales_January_2019.csv',
 'Sales_March_2019.csv',
 'Sales_October_2019.csv']

In [3]:
all_months_data = pd.DataFrame()

for i in LIST_OF_MONTLY_SALES_DATA_FILES:
    COMPLETE_PATH = os.path.join(PATH_TO_ALL_SALES_DATA, i)
    all_months_data = pd.concat([all_months_data, pd.read_csv(COMPLETE_PATH)], axis=0)

all_months_data.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,209921,USB-C Charging Cable,1,11.95,06/23/19 19:34,"950 Walnut St, Portland, ME 04101"
1,209922,Macbook Pro Laptop,1,1700.0,06/30/19 10:05,"80 4th St, San Francisco, CA 94016"
2,209923,ThinkPad Laptop,1,999.99,06/24/19 20:18,"402 Jackson St, Los Angeles, CA 90001"
3,209924,27in FHD Monitor,1,149.99,06/05/19 10:21,"560 10th St, Seattle, WA 98101"
4,209925,Bose SoundSport Headphones,1,99.99,06/25/19 18:58,"545 2nd St, San Francisco, CA 94016"


### Saving the new combined DataFrame we just created

In [4]:
all_months_data.to_csv(os.path.join(PATH_TO_ALL_SALES_DATA, ALL_MONTHS_DATA_FILE_NAME), index=False)

### Reading the new combined DataFrame from the `.csv` file we just created

In [5]:
all_months_data = pd.read_csv(os.path.join(PATH_TO_ALL_SALES_DATA, ALL_MONTHS_DATA_FILE_NAME))

all_months_data.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,209921,USB-C Charging Cable,1,11.95,06/23/19 19:34,"950 Walnut St, Portland, ME 04101"
1,209922,Macbook Pro Laptop,1,1700.0,06/30/19 10:05,"80 4th St, San Francisco, CA 94016"
2,209923,ThinkPad Laptop,1,999.99,06/24/19 20:18,"402 Jackson St, Los Angeles, CA 90001"
3,209924,27in FHD Monitor,1,149.99,06/05/19 10:21,"560 10th St, Seattle, WA 98101"
4,209925,Bose SoundSport Headphones,1,99.99,06/25/19 18:58,"545 2nd St, San Francisco, CA 94016"


In [6]:
all_months_data.dtypes

Order ID            object
Product             object
Quantity Ordered    object
Price Each          object
Order Date          object
Purchase Address    object
dtype: object

In [7]:
print(f"{'Total number of rows in the combined dataset:':50} {all_months_data.shape[0]:>10}")
print(f"{'Total number of columns in the combined dataset:':50} {all_months_data.shape[1]:>10}")

Total number of rows in the combined dataset:          186850
Total number of columns in the combined dataset:            6


In [8]:
all_months_data.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,209921,USB-C Charging Cable,1,11.95,06/23/19 19:34,"950 Walnut St, Portland, ME 04101"
1,209922,Macbook Pro Laptop,1,1700.0,06/30/19 10:05,"80 4th St, San Francisco, CA 94016"
2,209923,ThinkPad Laptop,1,999.99,06/24/19 20:18,"402 Jackson St, Los Angeles, CA 90001"
3,209924,27in FHD Monitor,1,149.99,06/05/19 10:21,"560 10th St, Seattle, WA 98101"
4,209925,Bose SoundSport Headphones,1,99.99,06/25/19 18:58,"545 2nd St, San Francisco, CA 94016"


## Cleaning the Data

1. We are going the drop the rows have NaN

In [10]:
help(all_months_data.isna().any)

Help on method any in module pandas.core.frame:

any(axis=0, bool_only=None, skipna=True, level=None, **kwargs) method of pandas.core.frame.DataFrame instance
    Return whether any element is True, potentially over an axis.
    
    Returns False unless there at least one element within a series or
    along a Dataframe axis that is True or equivalent (e.g. non-zero or
    non-empty).
    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns', None}, default 0
        Indicate which axis or axes should be reduced.
    
        * 0 / 'index' : reduce the index, return a Series whose index is the
          original column labels.
        * 1 / 'columns' : reduce the columns, return a Series whose index is the
          original index.
        * None : reduce all axes, return a scalar.
    
    bool_only : bool, default None
        Include only boolean columns. If None, will attempt to use everything,
        then use only boolean data. Not implemented for Series.
    s

In [11]:
all_months_data[all_months_data.isna().any(axis=1)]

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
339,,,,,,
630,,,,,,
735,,,,,,
1136,,,,,,
1349,,,,,,
...,...,...,...,...,...,...
184390,,,,,,
184779,,,,,,
185239,,,,,,
185614,,,,,,


In [12]:
help(all_months_data.dropna)

Help on method dropna in module pandas.core.frame:

dropna(axis=0, how='any', thresh=None, subset=None, inplace=False) method of pandas.core.frame.DataFrame instance
    Remove missing values.
    
    See the :ref:`User Guide <missing_data>` for more on which values are
    considered missing, and how to work with missing data.
    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Determine if rows or columns which contain missing values are
        removed.
    
        * 0, or 'index' : Drop rows which contain missing values.
        * 1, or 'columns' : Drop columns which contain missing value.
    
        .. versionchanged:: 1.0.0
    
           Pass tuple or list to drop on multiple axes.
           Only a single axis is allowed.
    
    how : {'any', 'all'}, default 'any'
        Determine if row or column is removed from DataFrame, when we have
        at least one NA or all NA.
    
        * 'any' : If any NA values are present, dro

In [13]:
all_months_data = all_months_data.dropna(how='any')

all_months_data.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,209921,USB-C Charging Cable,1,11.95,06/23/19 19:34,"950 Walnut St, Portland, ME 04101"
1,209922,Macbook Pro Laptop,1,1700.0,06/30/19 10:05,"80 4th St, San Francisco, CA 94016"
2,209923,ThinkPad Laptop,1,999.99,06/24/19 20:18,"402 Jackson St, Los Angeles, CA 90001"
3,209924,27in FHD Monitor,1,149.99,06/05/19 10:21,"560 10th St, Seattle, WA 98101"
4,209925,Bose SoundSport Headphones,1,99.99,06/25/19 18:58,"545 2nd St, San Francisco, CA 94016"


### Augment the data with additional columns

### Task 2: Add Month Column

#### The easy method

In [14]:
all_months_data["Month (Other Method)"] = all_months_data["Order Date"].str[:2]

all_months_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_months_data["Month (Other Method)"] = all_months_data["Order Date"].str[:2]


Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Month (Other Method)
0,209921,USB-C Charging Cable,1,11.95,06/23/19 19:34,"950 Walnut St, Portland, ME 04101",06
1,209922,Macbook Pro Laptop,1,1700.0,06/30/19 10:05,"80 4th St, San Francisco, CA 94016",06
2,209923,ThinkPad Laptop,1,999.99,06/24/19 20:18,"402 Jackson St, Los Angeles, CA 90001",06
3,209924,27in FHD Monitor,1,149.99,06/05/19 10:21,"560 10th St, Seattle, WA 98101",06
4,209925,Bose SoundSport Headphones,1,99.99,06/25/19 18:58,"545 2nd St, San Francisco, CA 94016",06
...,...,...,...,...,...,...,...
186845,278792,AA Batteries (4-pack),1,3.84,10/12/19 04:32,"920 Adams St, San Francisco, CA 94016",10
186846,278793,Wired Headphones,1,11.99,10/28/19 22:00,"161 Chestnut St, Los Angeles, CA 90001",10
186847,278794,AA Batteries (4-pack),1,3.84,10/09/19 20:58,"346 Spruce St, San Francisco, CA 94016",10
186848,278795,iPhone,1,700,10/31/19 17:21,"291 Hill St, Seattle, WA 98101",10


#### The better method

In [15]:
pd.to_datetime(all_months_data["Order Date"], format="%m/%d/%y %H:%M")

ValueError: time data 'Order Date' does not match format '%m/%d/%y %H:%M' (match)

From the above error message looks like there are some rows in the DataFrame that have the value for the column `Order Date` set to **Order Date** itself.

This is a very interesting observation from the dataset, which tells us that the dataset is not clean.

To fix this problem, let's set the `Order Date` column for all these rows to None, so that the `pd.to_datetime()` method can handle this missing data properly.

In [16]:
all_months_data.loc[all_months_data["Order Date"] == "Order Date", "Order Date"] = None

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_months_data.loc[all_months_data["Order Date"] == "Order Date", "Order Date"] = None


In [17]:
all_months_data["Order Date"] = pd.to_datetime(all_months_data["Order Date"], format="%m/%d/%y %H:%M", errors="coerce")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_months_data["Order Date"] = pd.to_datetime(all_months_data["Order Date"], format="%m/%d/%y %H:%M", errors="coerce")


In [13]:
pd.DatetimeIndex(all_months_data["Order Date"]).month

Float64Index([ 6.0,  6.0,  6.0,  6.0,  6.0,  6.0,  6.0,  6.0,  6.0,  6.0,
              ...
              10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0],
             dtype='float64', name='Order Date', length=186850)

In [14]:
all_months_data["Month"] = pd.DatetimeIndex(all_months_data["Order Date"]).month

In [15]:
all_months_data.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Month (Other Method),Month
0,209921,USB-C Charging Cable,1,11.95,2019-06-23 19:34:00,"950 Walnut St, Portland, ME 04101",6,6.0
1,209922,Macbook Pro Laptop,1,1700.0,2019-06-30 10:05:00,"80 4th St, San Francisco, CA 94016",6,6.0
2,209923,ThinkPad Laptop,1,999.99,2019-06-24 20:18:00,"402 Jackson St, Los Angeles, CA 90001",6,6.0
3,209924,27in FHD Monitor,1,149.99,2019-06-05 10:21:00,"560 10th St, Seattle, WA 98101",6,6.0
4,209925,Bose SoundSport Headphones,1,99.99,2019-06-25 18:58:00,"545 2nd St, San Francisco, CA 94016",6,6.0


In [16]:
all_months_data.dtypes

Order ID                        object
Product                         object
Quantity Ordered                object
Price Each                      object
Order Date              datetime64[ns]
Purchase Address                object
Month (Other Method)            object
Month                          float64
dtype: object