In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
HR_Employees = pd.read_csv('HR_Employees.csv')

In [3]:
Sales_Orders = pd.read_csv('Sales_Orders.csv')

In [4]:
Sales_OrderDetails = pd.read_csv('Sales_OrderDetails.csv')

In [5]:
Production_Products = pd.read_csv('Production_Products.csv')

In [6]:
Production_Categories = pd.read_csv('Production_Categories.csv')

In [7]:
Production_Suppliers = pd.read_csv('Production_Suppliers.csv')

In [8]:
Sales_Customers = pd.read_csv('Sales_Customers.csv')

In [9]:
Sales_Shippers = pd.read_csv('Sales_Shippers.csv')

## 1- How many total customers does each employee hired after 2014 have?

In [10]:
HR_Employees.head(2)

Unnamed: 0,empid,lastname,firstname,title,titleofcourtesy,birthdate,hiredate,address,city,region,postalcode,country,phone,mgrid
0,1,Davis,Sara,CEO,Ms.,1968-12-08,2013-05-01,"7890 - 20th Ave. E., Apt. 2A",Seattle,WA,10003,USA,(206) 555-0101,
1,2,Funk,Don,"Vice President, Sales",Dr.,1972-02-19,2013-08-14,9012 W. Capital Way,Tacoma,WA,10001,USA,(206) 555-0100,1.0


In [11]:
Sales_Orders.head(2)

Unnamed: 0,orderid,custid,empid,orderdate,requireddate,shippeddate,shipperid,freight,shipname,shipaddress,shipcity,shipregion,shippostalcode,shipcountry
0,10248,85,5,2014-07-04,2014-08-01,2014-07-16,3,32.38,Ship to 85-B,6789 rue de l'Abbaye,Reims,,10345,France
1,10249,79,6,2014-07-05,2014-08-16,2014-07-10,1,11.61,Ship to 79-C,Luisenstr. 9012,Münster,,10328,Germany


In [12]:
# To join to tables, we need to find a common column in the two tables. 
#we need to merge Sales_Orders and HR_Employee tables via 'empid' column.

print(HR_Employees.empid.nunique())
print(Sales_Orders.empid.nunique())

9
9


In [13]:
HR_Employees.dtypes

empid                int64
lastname            object
firstname           object
title               object
titleofcourtesy     object
birthdate           object
hiredate            object
address             object
city                object
region              object
postalcode           int64
country             object
phone               object
mgrid              float64
dtype: object

In [14]:
Sales_Orders.dtypes

orderid             int64
custid              int64
empid               int64
orderdate          object
requireddate       object
shippeddate        object
shipperid           int64
freight           float64
shipname           object
shipaddress        object
shipcity           object
shipregion         object
shippostalcode      int64
shipcountry        object
dtype: object

In [15]:
# date data types should be changed from object to date time

HR_Employees[['hiredate', 'birthdate']] = HR_Employees[['hiredate', 'birthdate']].apply(pd.to_datetime)
Sales_Orders[['orderdate', 'requireddate', 'shippeddate']] = Sales_Orders[['orderdate', 'requireddate', 'shippeddate']].apply(pd.to_datetime)

In [16]:
# We are going to combine two tables by merge function

Merge1 = pd.merge(HR_Employees, Sales_Orders, how = 'inner', on = 'empid')
print('Shape of the Merege1 table is: ', Merge1.shape)
Merge1.head(2)

Shape of the Merege1 table is:  (830, 27)


Unnamed: 0,empid,lastname,firstname,title,titleofcourtesy,birthdate,hiredate,address,city,region,...,requireddate,shippeddate,shipperid,freight,shipname,shipaddress,shipcity,shipregion,shippostalcode,shipcountry
0,1,Davis,Sara,CEO,Ms.,1968-12-08,2013-05-01,"7890 - 20th Ave. E., Apt. 2A",Seattle,WA,...,2014-08-14,2014-07-23,1,140.51,Destination RVDMF,Kirchgasse 9012,Graz,,10157,Austria
1,1,Davis,Sara,CEO,Ms.,1968-12-08,2013-05-01,"7890 - 20th Ave. E., Apt. 2A",Seattle,WA,...,2014-08-29,2014-08-02,1,136.54,Ship to 87-B,Torikatu 2345,Oulu,,10351,Finland


In [17]:
# by pandas.loc function, we can exclude the specific values in a dataframe

Merge1_2 = Merge1.loc[Merge1['hiredate'].dt.year >= 2014]

In [18]:
# All the hire dates are later than 2014

Merge1_2.head(3)

Unnamed: 0,empid,lastname,firstname,title,titleofcourtesy,birthdate,hiredate,address,city,region,...,requireddate,shippeddate,shipperid,freight,shipname,shipaddress,shipcity,shipregion,shippostalcode,shipcountry
346,4,Peled,Yael,Sales Representative,Mrs.,1957-09-19,2014-05-03,5678 Old Redmond Rd.,Redmond,WA,...,2014-08-05,2014-07-12,2,65.83,Destination SCQXA,"Rua do Paço, 7890",Rio de Janeiro,RJ,10195,Brazil
347,4,Peled,Yael,Sales Representative,Mrs.,1957-09-19,2014-05-03,5678 Old Redmond Rd.,Redmond,WA,...,2014-08-06,2014-07-11,2,51.3,Ship to 76-B,"Boulevard Tirou, 9012",Charleroi,,10318,Belgium
348,4,Peled,Yael,Sales Representative,Mrs.,1957-09-19,2014-05-03,5678 Old Redmond Rd.,Redmond,WA,...,2014-08-13,2014-07-22,3,81.91,Destination JYDLM,Carrera1234 con Ave. Carlos Soublette #8-35,San Cristóbal,Táchira,10199,Venezuela


In [19]:
# we use groupby function to find the number of customers for each employee
# So, the groupby should be used for first name, last name and employee ID.

Merge1_2.groupby(['firstname','lastname','empid']).count()['custid'].sort_values(ascending = False).reset_index().rename(columns={'custid':'Total_Number'})

Unnamed: 0,firstname,lastname,empid,Total_Number
0,Yael,Peled,4,156
1,Maria,Cameron,8,104
2,Russell,King,7,72
3,Paul,Suurs,6,67
4,Patricia,Doyle,9,43
5,Sven,Mortensen,5,42


## 2- What is the total purchase amount by American, French, and German customers after 2007?

In [20]:
# we need to combine two tables to have all the values in a table

Merge2 = pd.merge(Sales_Orders,Sales_OrderDetails , on = 'orderid') 
Merge2.head(2)

Unnamed: 0,orderid,custid,empid,orderdate,requireddate,shippeddate,shipperid,freight,shipname,shipaddress,shipcity,shipregion,shippostalcode,shipcountry,productid,unitprice,qty,discount
0,10248,85,5,2014-07-04,2014-08-01,2014-07-16,3,32.38,Ship to 85-B,6789 rue de l'Abbaye,Reims,,10345,France,11,14.0,12,0.0
1,10248,85,5,2014-07-04,2014-08-01,2014-07-16,3,32.38,Ship to 85-B,6789 rue de l'Abbaye,Reims,,10345,France,42,9.8,10,0.0


In [21]:
# we need to exclude the values that we are looking for.

Merge2_1 = Merge2.loc[(Merge2['shipcountry'].isin(['France' , 'USA', 'Germany'])) & (Merge2['orderdate'].dt.year >= 2007)]
Merge2_1.head(2)

Unnamed: 0,orderid,custid,empid,orderdate,requireddate,shippeddate,shipperid,freight,shipname,shipaddress,shipcity,shipregion,shippostalcode,shipcountry,productid,unitprice,qty,discount
0,10248,85,5,2014-07-04,2014-08-01,2014-07-16,3,32.38,Ship to 85-B,6789 rue de l'Abbaye,Reims,,10345,France,11,14.0,12,0.0
1,10248,85,5,2014-07-04,2014-08-01,2014-07-16,3,32.38,Ship to 85-B,6789 rue de l'Abbaye,Reims,,10345,France,42,9.8,10,0.0


In [23]:
# based on the number and price, we need to add a column to calculate the total sale for each product.

Merge2_1.loc[:,('Amount')] = Merge2_1.loc[:, ('qty')] * Merge2_1.loc[:,('unitprice')] * (1-Merge2_1.loc[:,('discount')])

In [24]:
# Total sale

Merge2_1['Amount'].sum()

557227.5665000001

## 3- Which countries have made purchases exceeding 40,000 USD?

In [23]:
Merge3_1 = Merge2.copy()

In [24]:
Merge3_1.loc[:,('Amount')] = Merge2.loc[:, ('qty')] * Merge2.loc[:,('unitprice')] * (1-Merge2.loc[:,('discount')])

In [25]:
Merge3_2 = Merge3_1.groupby('shipcountry').sum()['Amount'].sort_values(ascending = False).to_frame().reset_index().rename(columns={'Amount':'Total_Amount'})
Merge3_2 = Merge3_2.loc[Merge3_2['Total_Amount'] >= 40000]
Merge3_2

Unnamed: 0,shipcountry,Total_Amount
0,USA,245584.6105
1,Germany,230284.6335
2,Austria,128003.8385
3,Brazil,106925.7765
4,France,81358.3225
5,UK,58971.31
6,Venezuela,56810.629
7,Sweden,54495.14
8,Canada,50196.29
9,Ireland,49979.905


## 4- What are the average, count, and sum of sales for each customer who purchased beverage and seafood products, and had them exported by shippers #2 and #3 to the US?

In [25]:
# for this question, we need to merge 5 tables to have all required values
Merge4 = pd.merge(Production_Categories, Production_Products, on = 'categoryid').merge(Production_Suppliers, how = 'inner', on = 'supplierid').merge(Sales_OrderDetails, how = 'inner', on = 'productid').merge(Sales_Orders, how = 'inner', on = 'orderid')
Merge4.shape

(2155, 35)

In [26]:
# unitprice_x is for sale price, and unitprice_y is for cost of production
Merge4.columns

Index(['categoryid', 'categoryname', 'description', 'productid', 'productname',
       'supplierid', 'unitprice_x', 'discontinued', 'companyname',
       'contactname', 'contacttitle', 'address', 'city', 'region',
       'postalcode', 'country', 'phone', 'fax', 'orderid', 'unitprice_y',
       'qty', 'discount', 'custid', 'empid', 'orderdate', 'requireddate',
       'shippeddate', 'shipperid', 'freight', 'shipname', 'shipaddress',
       'shipcity', 'shipregion', 'shippostalcode', 'shipcountry'],
      dtype='object')

In [27]:
Merge4_1 = Merge4.copy()

In [32]:
Merge4_1.loc[:,('Amount')] = Merge4_1.loc[:, ('qty')] * Merge4_1.loc[:,('unitprice_x')] * (1 - Merge4_1.loc[:,('discount')])

In [33]:
Merge4_2 = Merge4_1.loc[(Merge4_1.shipcountry == 'USA') & (Merge4_1.categoryid.isin([1,8])) & (Merge4_1.shipperid.isin([2,3]))]

In [34]:
Merge4_3 = Merge4_2.groupby(['custid' , 'categoryname']).agg({'Amount' : ['mean' , 'sum','count']}).reset_index()
Merge4_3

Unnamed: 0_level_0,custid,categoryname,Amount,Amount,Amount
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,sum,count
0,32,Beverages,2338.875,11694.375,5
1,32,Seafood,456.0,456.0,1
2,36,Seafood,60.0,60.0,1
3,43,Seafood,184.0,184.0,1
4,45,Beverages,162.75,162.75,1
5,45,Seafood,310.37625,620.7525,2
6,48,Beverages,108.0,216.0,2
7,48,Seafood,625.0,625.0,1
8,55,Beverages,785.0,2355.0,3
9,55,Seafood,730.7,2192.1,3


## 5- What is the latest order date for each customer?

In [35]:
customers = Sales_Orders.groupby('custid')['orderdate'].max().reset_index()
customers

Unnamed: 0,custid,orderdate
0,1,2016-04-09
1,2,2016-03-04
2,3,2016-01-28
3,4,2016-04-10
4,5,2016-03-04
...,...,...
84,87,2016-04-15
85,88,2016-03-09
86,89,2016-05-01
87,90,2016-04-07


## 6- What is the sales ranking of each customer? Please mention the ranking in a separate column.

In [36]:
Merge6 = Merge2.copy()

In [37]:
Merge6.loc[:,('Amount')] = Merge6.loc[:, ('qty')] * Merge6.loc[:,('unitprice')] * (1-Merge6.loc[:,('discount')])

In [38]:
Total = Merge6.groupby('custid')['Amount'].sum().to_frame().reset_index()
Total['Rank'] = Total.Amount.rank(ascending = False).astype(int)
Total

Unnamed: 0,custid,Amount,Rank
0,1,4273.0000,57
1,2,1402.9500,84
2,3,7023.9775,46
3,4,13390.6500,31
4,5,24927.5775,13
...,...,...,...
84,87,15648.7025,28
85,88,6068.2000,52
86,89,27363.6050,10
87,90,3161.3500,69
