### Import Libraries

In [None]:
import glob
from collections import Counter
from itertools import combinations

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

### Importing all the sales files

In [None]:
path = '../input/monthly-sales-2019'
# Sort the matches: glob.glob returns paths in arbitrary, OS-dependent order.
all_files = sorted(glob.glob(path + "/*.csv"))
# Read every file first and concatenate once; calling pd.concat inside the loop
# re-copies all previously loaded rows on each iteration.
monthly_frames = [pd.read_csv(file) for file in all_files]
# pd.concat raises on an empty list, so fall back to an empty frame when no file matched.
# ignore_index=True gives the combined frame a unique RangeIndex instead of
# repeating each file's 0..n-1 row labels.
data = pd.concat(monthly_frames, ignore_index=True) if monthly_frames else pd.DataFrame()
    
# data.to_csv('All_Months_Data.csv', index=False)

Here I concatenated every CSV file in the directory into a single DataFrame.

### Exploratory Data Analysis

In [None]:
# Preview the first rows of the combined frame (DataFrame.head shows 5 rows by default).
data.head()

In [None]:
# Order the rows by 'Order ID'; rebinding the name replaces the in-place sort.
data = data.sort_values(by='Order ID')

In [None]:
data.tail()  # last rows of the frame; sort_values places missing sort keys at the end by default

The last rows are all NaN, so next I will count how many NaN values each column contains.

In [None]:
# Count the missing values in each column.
data.isna().sum()

Here we can see that 545 rows are NaN. They carry no information, so we are going to remove all of them.

In [None]:
# Rows that contain at least one missing value.
has_missing_value = data.isna().any(axis=1)
nan_df = data.loc[has_missing_value]
nan_df.head()

In [None]:
# Discard rows in which every column is missing, then re-check the per-column NaN counts.
data = data.dropna(how='all')
data.isna().sum()

Now there are no NaN values left in any column.

In [None]:
# Summarize column dtypes and non-null counts.
data.info()

All columns have the data type object (string), and there are 186305 rows in total in the dataset.

In [None]:
# Column labels without spaces, in the same positional order as the loaded columns.
clean_column_names = [
    'Order_ID',
    'Product',
    'Quantity_Ordered',
    'Price_Each',
    'Order_Date',
    'Purchase_Address',
]
data = data.set_axis(clean_column_names, axis='columns')
data.head()

Replaced the spaces in the column names with underscores.

Now we need Date and Month column so I'm going to slice the order date column values

In [None]:
# Order_Date is still text here; take its first 8 characters as the date
# and its first 2 characters as the month.
order_date_text = data['Order_Date']
data['Date'] = order_date_text.str.slice(stop=8)
data['Month'] = order_date_text.str.slice(stop=2)
data.head()

to check the unique value counts

In [None]:
# Frequency of each distinct Month string.
data['Month'].value_counts()

Here we can see the value "Or", which is not a month (most likely it comes from header rows repeated inside the CSV files), so we are going to remove the rows that have "Or" in the Month column.

In [None]:
# Rows whose Order_Date text begins with 'Or' (missing values count as no match).
starts_with_or = data['Order_Date'].str.startswith('Or', na=False)
temp_data = data[starts_with_or]
temp_data

In [None]:
# Keep only the rows whose Order_Date does not start with 'Or'.
is_or_row = data['Order_Date'].str[:2] == 'Or'
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
data = data[~is_or_row].copy()
data.head()

Now we should check the length of a dataset

In [None]:
data.shape  # (rows, columns); the author's run reported 185950 rows at this point

#### Converting Data types 

In [None]:
# Parse the Order_Date strings into datetime values and store them back in the same column.
parsed_order_dates = pd.to_datetime(data['Order_Date'])
data['Order_Date'] = parsed_order_dates

In [None]:
# Date: text -> datetime; Month: text -> 32-bit integer.
data['Date'] = data['Date'].pipe(pd.to_datetime)
data['Month'] = data['Month'].astype(np.int32)
data.info()

After the conversion, the Order_Date and Date columns have the datetime data type.

In [None]:
# How many rows share each Order_ID.
data['Order_ID'].value_counts()

Order IDs repeat, so we can't use Order_ID as a unique index.

In [None]:
# Distribution of the distinct Quantity_Ordered values.
data['Quantity_Ordered'].value_counts()

All the values in the Quantity_Ordered column are integers, so we convert that column to an integer type and the Price_Each column to float.

In [None]:
# Cast both numeric columns in a single astype call.
numeric_dtypes = {'Quantity_Ordered': 'int32', 'Price_Each': 'float64'}
data = data.astype(numeric_dtypes)
data.info()

So now we have 2 Integer Columns and 1 Float Column so now we can use describe method on our dataset

In [None]:
# Summary statistics for the frame's columns.
data.describe()

Maximum Quantity Ordered is 9

Now we have to make sales column for further analysis

In [None]:
# Revenue per row: units ordered multiplied by the unit price.
data['Sales'] = data['Quantity_Ordered'].mul(data['Price_Each'])
data.head()

In [None]:
# Sum only the numeric columns per month. Without numeric_only=True, recent
# pandas versions also try to sum the datetime and text columns, which raises
# a TypeError or concatenates strings.
data.groupby('Month').sum(numeric_only=True)

Month 12 (December) has the highest sales.

In [None]:
# Group by the calendar month taken from the parsed Date column and sum the
# numeric columns only (datetime and text columns cannot be summed meaningfully).
result_month = data.groupby(data['Date'].dt.month).sum(numeric_only=True)
result_month

Here we can see again that December has the highest sales, this time using the `dt` accessor.

In [None]:
# Yearly totals of the numeric columns; numeric_only=True keeps the datetime
# and text columns out of the sum.
result_year = data.groupby(data['Date'].dt.year).sum(numeric_only=True)
result_year

### Plotting

In [None]:
# Take the x positions from the grouped result itself, so the bars always line
# up with the months actually present instead of assuming all of 1..12 exist.
months = result_month.index
plt.bar(months, result_month['Sales'])
plt.title('Sales per Month')
plt.xticks(months)
plt.xlabel('Months')
plt.ylabel('Sales')
plt.show()

Create a City column to visualize which city had the highest sales in 2019.

In [None]:
# The city is the second comma-separated field of the address. Strip the
# surrounding whitespace: splitting on ',' alone would leave a leading space
# in every city name.
data['City'] = data['Purchase_Address'].str.split(',').str[1].str.strip()
data.head()

In [None]:
# Number of rows recorded for each city.
data['City'].value_counts()

San Francisco has the most order rows in 2019 (44732).

In [None]:
# Third ', '-separated field of the address, truncated to its first two characters.
state = data['Purchase_Address'].str.split(', ').str[2].str[:2]
data['State'] = state
data.head()

In [None]:
# Totals per (City, State) pair; numeric_only=True keeps the datetime and text
# columns out of the sum, which otherwise fails on recent pandas versions.
result_city = data.groupby(['City', 'State']).sum(numeric_only=True)
result_city

From this report we can see that San Francisco has the highest sales. The city with the lowest sales is Austin, Texas.

In [None]:
# Index of the grouped result: one (City, State) entry per group.
result_city.index

In [None]:
# Combine city and state into one label, separated by a single space.
data['Address'] = data['City'].str.cat(data['State'], sep=" ")
data.head()

In [None]:
# Totals per combined address label; restrict the sum to numeric columns so
# datetime and text columns do not break or pollute the aggregation.
result_address = data.groupby(['Address']).sum(numeric_only=True)
result_address

### Plotting

In [None]:
# Take the labels from the same grouped result that supplies the bar heights,
# so the two cannot get out of step and no second groupby is needed.
city = result_address.index.tolist()
plt.bar(city, result_address['Sales'])
plt.title('Sales per City')
plt.xticks(city, rotation='vertical')
plt.xlabel('Cities')
plt.ylabel('Sales')
plt.show()

Now let's further see which product is ordered the most

In [None]:
# Number of non-null Quantity_Ordered entries (i.e. rows) for each product.
data['Quantity_Ordered'].groupby(data['Product']).count()

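AAA batteries are ordered the most and the LG Dryer the least; we have to explore further why some products are ordered more or less than others. Note that `count()` counts order rows rather than units; the summed quantities are computed further below.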

In [None]:
# Hour of day (0-23) taken from the parsed order timestamp.
data['Hour'] = data['Order_Date'].dt.hour
data.head()

In [None]:
# For each hour, count the non-null values in every other column.
data.groupby('Hour').count()

7 PM (19:00) is the busiest hour of the day for orders.

In [None]:
# Plot one series holding the number of rows per hour. Passing the whole
# groupby().count() frame would draw one identical, overlapping line per column.
orders_per_hour = data.groupby('Hour').size()
hours = orders_per_hour.index.tolist()
plt.plot(hours, orders_per_hour)
plt.grid(True)
plt.xticks(hours)
plt.xlabel('Hours in 24 Hr Format')
plt.ylabel('Orders')
plt.show()

Here we can see the peak at 7 PM

In [None]:
# Preview the first rows of the current frame.
data.head()

Above we saw that Order IDs repeat, so now let's see which Order IDs are repeated.

In [None]:
# Every row whose Order_ID occurs more than once (all occurrences are kept).
data.loc[data.duplicated(subset='Order_ID', keep=False)]

In [None]:
# Work on an independent copy of the multi-row orders: adding a column to a
# filtered slice of `data` would raise SettingWithCopyWarning.
df = data[data['Order_ID'].duplicated(keep=False)].copy()
# For each order, join all of its product names into one comma-separated string.
df['Grouped'] = df.groupby('Order_ID')['Product'].transform(','.join)
# Every row of an order now carries the same string, so keep one row per order.
df = df[['Order_ID', 'Grouped']].drop_duplicates()
df.head()

Here we see the products grouped together for each repeated Order ID.

In [None]:
# Counter and combinations are imported in the notebook's import cell.
count = Counter()

for row in df['Grouped']:
    # Sort the products first: combinations() is order-sensitive, so the same
    # pair listed in a different row order would otherwise get a separate key.
    row_list = sorted(row.split(','))
    count.update(combinations(row_list, 2))

# Ten most frequent product pairs.
for key, value in count.most_common(10):
    print(key, value)

Among combinations of 2 products, iPhone and Lightning Charging Cable were sold together the most.

In [None]:
count = Counter()
for row in df['Grouped']:
    # Sort so that the same three products always map to the same key,
    # regardless of the order in which they appear in the order's rows.
    row_list = sorted(row.split(','))
    count.update(combinations(row_list, 3))

# Ten most frequent product triples.
for key, value in count.most_common(10):
    print(key, value)

Among combinations of 3 products, Google Phone, USB-C Charging Cable and Wired Headphones were sold together the most.

In [None]:
count = Counter()
for row in df['Grouped']:
    # Sort so that the same four products always map to the same key,
    # regardless of the order in which they appear in the order's rows.
    row_list = sorted(row.split(','))
    count.update(combinations(row_list, 4))

# Ten most frequent four-product combinations.
for key, value in count.most_common(10):
    print(key, value)

Among combinations of 4 products, Apple Airpods Headphones, Wired Headphones, Lightning Charging Cable and iPhone were sold together the most.

In [None]:
count = Counter()
for grouped_products in df['Grouped']:
    # Each product becomes a 1-tuple key, matching what combinations(..., 1) yields.
    count.update((product,) for product in grouped_products.split(','))

# Ten most frequent individual products.
for product_key, occurrences in count.most_common(10):
    print(product_key, occurrences)

The single product that appears most often in these multi-item orders is the USB-C Charging Cable.

In [None]:
count = Counter()
for row in df['Grouped']:
    # Sort so that the same five products always map to the same key,
    # regardless of the order in which they appear in the order's rows.
    row_list = sorted(row.split(','))
    count.update(combinations(row_list, 5))

# Ten most frequent five-product combinations.
for key, value in count.most_common(10):
    print(key, value)

In [None]:
# Preview the first rows of the current frame.
data.head()

In [None]:
# Total units ordered for each product.
result_product = data['Quantity_Ordered'].groupby(data['Product']).sum()
result_product

### Plotting

In [None]:
result_product = data.groupby('Product')
# Select the column before summing: summing every column first is wasteful and
# fails on the datetime columns in recent pandas versions.
quantity_ordered = result_product['Quantity_Ordered'].sum()
# Product labels in the same order as the summed quantities.
products = quantity_ordered.index.tolist()

plt.bar(products, quantity_ordered)
plt.ylabel("Num of Ordered")
plt.xlabel("Product Name")
plt.xticks(products, rotation='vertical')
plt.show()

In [None]:
# Mean unit price per product, in the same product order as `products`.
prices = data.groupby('Product')['Price_Each'].mean()

fig, ax1 = plt.subplots()

# Second y-axis sharing the same x-axis, so quantity and price can use separate scales.
ax2 = ax1.twinx()

ax1.bar(products, quantity_ordered, color='g')
ax2.plot(products, prices, 'b-')

ax1.set_xlabel('Product Name')
ax1.set_ylabel('Quantity Ordered', color='g')
ax2.set_ylabel('Price ($)', color='b')
# Rotate the existing tick labels rather than overwriting them with
# set_xticklabels, which warns when the tick positions have not been fixed.
ax1.tick_params(axis='x', labelrotation=90)

plt.show()

In [None]:
# Observation: higher-priced products tend to be ordered in lower quantities,
# and lower-priced products tend to be ordered in higher quantities.