## Part 1: Explore the Data

Import the data and use Pandas to learn more about the dataset.

In [132]:
import pandas as pd

df = pd.read_csv('Resources/client_dataset.csv')
df.head()

Unnamed: 0,first,last,job,phone,email,client_id,order_id,order_date,order_week,order_year,item_id,category,subcategory,unit_price,unit_cost,unit_weight,qty,line_number
0,Donald,Harding,Immunologist,793-904-7725x39308,harding.donald.7185@sullivan.com,58515,8953482,2023-04-28,17,2023,EUD29711-63-6U,decor,wall art,1096.8,762.71,7.5,105,1
1,Tiffany,Myers,Music therapist,201.442.4543x942,myers.t.6537@ferguson-johnson.net,37609,8069089,2023-05-19,20,2023,XDA18116-89-4A,consumables,pens,24.95,15.09,1.49,21,0
2,Shannon,Watson,Immunologist,687.737.9424x8503,swatson8146@payne.net,57113,1902144,2023-01-29,4,2023,ABE59463-05-7E,software,project management,13.52,7.86,1.68,39,6
3,Nathan,Baker,Accounting technician,827-788-8123x012,bakernathan@benson.com,46554,9031802,2023-04-25,17,2023,ZMM00836-65-0C,consumables,pens,36.42,24.85,1.23,29,3
4,Christina,Schwartz,Chiropractor,265-829-3643,christinaschwartz9252@mcconnell.com,92089,1322274,2023-05-28,21,2023,BZX55559-12-3X,consumables,misc,195.1,108.17,46.43,20,1


In [133]:
# View the column names in the data
print(df.columns)

Index(['first', 'last', 'job', 'phone', 'email', 'client_id', 'order_id',
       'order_date', 'order_week', 'order_year', 'item_id', 'category',
       'subcategory', 'unit_price', 'unit_cost', 'unit_weight', 'qty',
       'line_number'],
      dtype='object')


In [134]:
# Use the describe function to gather some basic statistics
stats = df.describe()
print(stats)

          client_id      order_id    order_week    order_year    unit_price  \
count  54639.000000  5.463900e+04  54639.000000  54639.000000  54639.000000   
mean   54837.869416  5.470190e+06     11.359139   2022.993064    136.267207   
std    25487.438231  2.599807e+06      7.023499      0.082997    183.873135   
min    10033.000000  1.000886e+06      1.000000   2022.000000      0.010000   
25%    33593.000000  3.196372e+06      6.000000   2023.000000     20.800000   
50%    53305.000000  5.496966e+06     11.000000   2023.000000     68.310000   
75%    78498.000000  7.733869e+06     17.000000   2023.000000    173.160000   
max    99984.000000  9.998480e+06     52.000000   2023.000000   1396.230000   

          unit_cost   unit_weight           qty   line_number  
count  54639.000000  54639.000000  5.463900e+04  54639.000000  
mean      99.446073      5.004116  5.702646e+02      2.979667  
std      133.164267      5.326599  1.879552e+04      2.436320  
min        0.010000      0.00000

In [135]:
# Use this space to do any additional research
# and familiarize yourself with the data.
df.dtypes
df['order_id'].value_counts

<bound method IndexOpsMixin.value_counts of 0        8953482
1        8069089
2        1902144
3        9031802
4        1322274
          ...   
54634    9021716
54635    6290153
54636    8692622
54637    7592730
54638    7489403
Name: order_id, Length: 54639, dtype: int64>

In [136]:
# What three item categories had the most entries?
#~Count non-null entries.
non_null_counts = df.count()

#~Select the three columns with the most entries.
top_3 = non_null_counts.nlargest(3)
print(top_3)

first    54639
last     54639
job      54639
dtype: int64


In [137]:
# For the category with the most entries, which subcategory had the most entries?
sub_entries = top_3.nlargest(1)
print(sub_entries)

first    54639
dtype: int64


In [138]:
# Which five clients had the most entries in the data?
#~order the counts based on the number of instances of the clients order data.
most_entries = df['client_id'].value_counts()
#~print the top 5 entries with the .head command.
most_entries.head()

client_id
33615    220
66037    211
46820    209
24741    207
38378    207
Name: count, dtype: int64

In [139]:
# Store the client ids of those top 5 clients in a list.
#~For some reason the earlier command is displaying the number of times the top 5 clients show up in the data. I need it to display the clients IDs not the quantities.
top_5_clients = list(df['client_id'].value_counts().head(5).index)
top_5_clients


[33615, 66037, 46820, 24741, 38378]

In [140]:
# How many total units (the qty column) did the client with the most entries order?
#~ Identify the current most frequently entered customer ID.
num1client = most_entries.idxmax()

#~ Select all of order quantities from every transaction with the number 1 client and assign them a variable.
counts = df.loc[df['client_id'] == num1client, 'qty'].tolist()

#~ Create a variable for the sum of all of the order quantities.
total_count = sum(counts)

#~ return the total number of units ordered by the number 1 client.
print(total_count)


64313


## Part 2: Transform the Data
Do we know that this client spent the more money than client 66037? If not, how would we find out? Transform the data using the steps below to prepare it for analysis.

In [141]:
# Create a column that calculates the subtotal for each line using the unit_price and the qty
df['subtotal'] = df['unit_price'] * df['qty']

print(df[['unit_price', 'qty', 'subtotal']])

       unit_price   qty   subtotal
0         1096.80   105  115164.00
1           24.95    21     523.95
2           13.52    39     527.28
3           36.42    29    1056.18
4          195.10    20    3902.00
...           ...   ...        ...
54634       83.13    33    2743.29
54635      206.59    47    9709.73
54636       65.66   475   31188.50
54637        1.48   112     165.76
54638        3.01  1031    3103.31

[54639 rows x 3 columns]


In [142]:
# Create a column for shipping price.
# Assume a shipping price of $7 per pound for orders over 50 pounds and $10 per pound for items 50 pounds or under.
df['shipping_weight'] = df['unit_weight'] * df['qty']

df['shipping_price'] = df.apply(
    lambda row: row['shipping_weight'] * 7 if row['shipping_weight'] > 50 
    else row['shipping_weight'] * 10, axis=1
    )

print(df[['unit_weight', 'qty', 'shipping_weight', 'shipping_price']])

       unit_weight   qty  shipping_weight  shipping_price
0             7.50   105           787.50         5512.50
1             1.49    21            31.29          312.90
2             1.68    39            65.52          458.64
3             1.23    29            35.67          356.70
4            46.43    20           928.60         6500.20
...            ...   ...              ...             ...
54634         2.25    33            74.25          519.75
54635        11.70    47           549.90         3849.30
54636         4.16   475          1976.00        13832.00
54637        18.04   112          2020.48        14143.36
54638         2.07  1031          2134.17        14939.19

[54639 rows x 4 columns]


In [143]:
# Create a column for the total price using the subtotal and the shipping price along with a sales tax of 9.25%.
#define tax rate.
df['tax_rate'] = float(.0925)

#Define calculated tax.
df['tax'] = df['tax_rate'] * (df['shipping_price'] + df['subtotal'])

#Define total.
df['total'] = df['tax'] + df['subtotal'] + df['shipping_price']

#Retun the total.
df["total"]

0        131839.076250
1           914.258625
2          1077.117600
3          1543.571400
4         11364.403500
             ...      
54634      3564.871200
54635     14813.240275
54636     49184.896250
54637     15632.713600
54638     19711.431250
Name: total, Length: 54639, dtype: float64

In [144]:
# Create a column for the cost of each line using unit cost, qty, and
# shipping price (assume the shipping cost is exactly what is charged to the client).
#define line total.
df['line_total'] = (df[
'subtotal'] + df['shipping_price'] + df['tax']) + df['subtotal']
#return the line total.
df[['unit_weight', 'qty', 'shipping_weight', 'shipping_price', 'subtotal', 'tax', 'line_total']]
df.describe()

Unnamed: 0,client_id,order_id,order_week,order_year,unit_price,unit_cost,unit_weight,qty,line_number,subtotal,shipping_weight,shipping_price,tax_rate,tax,total,line_total
count,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0
mean,54837.869416,5470190.0,11.359139,2022.993064,136.267207,99.446073,5.004116,570.2646,2.979667,77102.86,2487.416,17425.1,0.0925,8743.836,103271.8,180374.6
std,25487.438231,2599807.0,7.023499,0.082997,183.873135,133.164267,5.326599,18795.52,2.43632,2881926.0,55594.3,389159.5,0.0,278262.4,3286505.0,6156421.0
min,10033.0,1000886.0,1.0,2022.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0925,0.0,0.0,0.0
25%,33593.0,3196372.0,6.0,2023.0,20.8,14.84,1.45,32.0,1.0,1132.95,70.33,496.65,0.0925,244.4849,2887.565,4293.009
50%,53305.0,5496966.0,11.0,2023.0,68.31,49.89,3.24,68.0,3.0,4629.44,225.4,1577.8,0.0925,710.9152,8396.485,13402.57
75%,78498.0,7733869.0,17.0,2023.0,173.16,125.57,6.89,170.0,5.0,17369.13,718.44,5029.08,0.0925,2239.986,26456.05,44001.73
max,99984.0,9998480.0,52.0,2023.0,1396.23,846.27,46.43,3958244.0,9.0,584474300.0,9009056.0,63063390.0,0.0925,54294540.0,641262500.0,1225737000.0


In [145]:
# Create a column for the profit of each line using line cost and line price
df['profit'] = df['line_total'] - df['shipping_price'] - df['tax']

#Return the result.
print(df['profit'])
df.describe()

0        230328.00
1          1047.90
2          1054.56
3          2112.36
4          7804.00
           ...    
54634      5486.58
54635     19419.46
54636     62377.00
54637       331.52
54638      6206.62
Name: profit, Length: 54639, dtype: float64


Unnamed: 0,client_id,order_id,order_week,order_year,unit_price,unit_cost,unit_weight,qty,line_number,subtotal,shipping_weight,shipping_price,tax_rate,tax,total,line_total,profit
count,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0,54639.0
mean,54837.869416,5470190.0,11.359139,2022.993064,136.267207,99.446073,5.004116,570.2646,2.979667,77102.86,2487.416,17425.1,0.0925,8743.836,103271.8,180374.6,154205.7
std,25487.438231,2599807.0,7.023499,0.082997,183.873135,133.164267,5.326599,18795.52,2.43632,2881926.0,55594.3,389159.5,0.0,278262.4,3286505.0,6156421.0,5763852.0
min,10033.0,1000886.0,1.0,2022.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0925,0.0,0.0,0.0,0.0
25%,33593.0,3196372.0,6.0,2023.0,20.8,14.84,1.45,32.0,1.0,1132.95,70.33,496.65,0.0925,244.4849,2887.565,4293.009,2265.9
50%,53305.0,5496966.0,11.0,2023.0,68.31,49.89,3.24,68.0,3.0,4629.44,225.4,1577.8,0.0925,710.9152,8396.485,13402.57,9258.88
75%,78498.0,7733869.0,17.0,2023.0,173.16,125.57,6.89,170.0,5.0,17369.13,718.44,5029.08,0.0925,2239.986,26456.05,44001.73,34738.26
max,99984.0,9998480.0,52.0,2023.0,1396.23,846.27,46.43,3958244.0,9.0,584474300.0,9009056.0,63063390.0,0.0925,54294540.0,641262500.0,1225737000.0,1168949000.0


## Part 3: Confirm your work
You have email receipts showing that the total prices for 3 orders. Confirm that your calculations match the receipts. Remember, each order has multiple lines.

Order ID 2742071 had a total price of \$152,811.89

Order ID 2173913 had a total price of \$162,388.71

Order ID 6128929 had a total price of \$923,441.25


In [146]:
# Check your work using the totals above
order_totals_1 = df.groupby('order_id')
order_totals_1 = df[df['order_id'] == 2742071]
order_totals_2 = df[df['order_id'] == 2173913]
order_totals_3 = df[df['order_id'] == 6128929]
order_totals = pd.concat([order_totals_1, order_totals_2, order_totals_3, ])
order_totals_3.head()
order_totals_df = order_totals.groupby('order_id').sum()
order_totals_df


Unnamed: 0_level_0,first,last,job,phone,email,client_id,order_date,order_week,order_year,item_id,...,qty,line_number,subtotal,shipping_weight,shipping_price,tax_rate,tax,total,line_total,profit
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2173913,DariusDariusDariusDariusDariusDariusDarius,StevensStevensStevensStevensStevensStevensStevens,"Physicist, medicalPhysicist, medicalPhysicist,...",+1-220-240-7019x2774+1-220-240-7019x2774+1-220...,stevensdarius@rich-leach.netstevensdarius@rich...,154854,2023-02-192023-02-192023-02-192023-02-192023-0...,49,14161,CDB25978-21-3EXED42806-14-3MCXD77584-10-1EZEM1...,...,1221,21,109440.76,5576.57,39198.8,0.6475,13749.1593,162388.7193,271829.5,218881.52
2742071,KimberlyKimberlyKimberlyKimberlyKimberlyKimberly,BlairBlairBlairBlairBlairBlair,Geographical information systems officerGeogra...,477-680-9323x27378477-680-9323x27378477-680-93...,blair.kimberly.6618@nelson.bizblair.kimberly.6...,470682,2023-02-272023-02-272023-02-272023-02-272023-0...,54,12138,ACU90518-83-2BBMD72525-90-2XDEE52027-40-3UAEC6...,...,439,15,127799.33,1691.98,12074.26,0.555,12938.307075,152811.897075,280611.2,255598.66
6128929,DorothyDorothyDorothyDorothyDorothyDorothyDoro...,HallHallHallHallHallHallHallHallHallHall,"Journalist, broadcastingJournalist, broadcasti...",001-338-609-8495x03504001-338-609-8495x0350400...,dorothy.per.1343@mitchell-chambers.comdorothy....,882570,2023-03-122023-03-122023-03-122023-03-122023-0...,100,20230,DUX14130-38-6UXDX02308-74-0XUUX03397-27-4CEED3...,...,4594,45,688564.27,22364.97,156690.87,0.925,78186.10045,923441.24045,1612006.0,1377128.54


## Part 4: Summarize and Analyze
Use the new columns with confirmed values to find the following information.

In [147]:
# How much did each of the top 5 clients by quantity spend? Check your work from Part 1 for client ids.
def top_5(client_id, column): 
    client_df = df.loc[df['client_id'] == client_id, column]
    return round(client_df.sum(), 2)

for x in top_5_clients:
    print(f"{x}: ${top_5(x, 'total')}")


33615: $8377308.57
66037: $10259514.8
46820: $9743794.32
24741: $82268891.98
38378: $12906550.88


In [148]:
# Create a summary DataFrame showing the totals for the for the top 5 clients with the following information:
# total units purchased, total shipping price, total revenue, and total profit. 
data_dict = {'client_id': top_5_clients}
top_5_data = df[df['client_id'].isin(top_5_clients)]

summary_data = top_5_data.groupby('client_id').agg({
                                                    'qty':'sum',
                                                    'shipping_price':'sum',
                                                    'subtotal':'sum',
                                                    'unit_cost':'sum',
                                                    'profit':'sum'
                                                    }).reset_index()
summary_data

Unnamed: 0,client_id,qty,shipping_price,subtotal,unit_cost,profit
0,24741,239862,5126448.37,70176885.25,17900.04,140353800.0
1,33615,64313,1828984.89,5839032.11,21716.88,11678060.0
2,38378,73667,3429455.4,8384321.15,21702.21,16768640.0
3,46820,75768,1601448.84,7317356.03,18140.06,14634710.0
4,66037,43018,1395151.85,7995708.38,21508.98,15991420.0


In [149]:
summary_data.describe()

Unnamed: 0,client_id,qty,shipping_price,subtotal,unit_cost,profit
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,41918.2,99325.6,2676298.0,19942660.0,20193.634,39885320.0
std,15670.64535,79624.481118,1587849.0,28098540.0,1987.7094,56197090.0
min,24741.0,43018.0,1395152.0,5839032.0,17900.04,11678060.0
25%,33615.0,64313.0,1601449.0,7317356.0,18140.06,14634710.0
50%,38378.0,73667.0,1828985.0,7995708.0,21508.98,15991420.0
75%,46820.0,75768.0,3429455.0,8384321.0,21702.21,16768640.0
max,66037.0,239862.0,5126448.0,70176890.0,21716.88,140353800.0


In [150]:
summary_data.columns

Index(['client_id', 'qty', 'shipping_price', 'subtotal', 'unit_cost',
       'profit'],
      dtype='object')

In [151]:
# Format the data and rename the columns to names suitable for presentation.
summary_data = summary_data.rename(columns={
                                            'client_id' : 'Client ID',
                                            'qty' : 'Quantity'})
# Define the money columns.
money_columns = ['shipping_price','subtotal','unit_cost','profit']


# Define a function that converts a dollar amount to millions.
def currency_format_millions(x):
    return x / 1000000

# Apply the currency_format_millions function to only the money columns.
for column in money_columns:
    summary_data[column] = summary_data[column].apply(currency_format_millions)


# Rename the columns to reflect the change in the money format. 
summary_data = summary_data.rename(columns ={
                                            'shipping_price' : 'Shipping(millions)',
                                            'subtotal' : 'Total Revenue(millions)',
                                            'unit_cost' : 'Total Cost(millions)',
                                            'profit' : 'Total Profit(millions)'})

print(summary_data)

   Client ID  Quantity  Shipping(millions)  Total Revenue(millions)  \
0      24741    239862            5.126448                70.176885   
1      33615     64313            1.828985                 5.839032   
2      38378     73667            3.429455                 8.384321   
3      46820     75768            1.601449                 7.317356   
4      66037     43018            1.395152                 7.995708   

   Total Cost(millions)  Total Profit(millions)  
0              0.017900              140.353771  
1              0.021717               11.678064  
2              0.021702               16.768642  
3              0.018140               14.634712  
4              0.021509               15.991417  


In [152]:
# Sort the updated data by "Total Profit (millions)" form highest to lowest and assign the sort to a new DatFrame.
sorted_profit = summary_data.sort_values(by='Total Profit(millions)', ascending=False)
sorted_profit

Unnamed: 0,Client ID,Quantity,Shipping(millions),Total Revenue(millions),Total Cost(millions),Total Profit(millions)
0,24741,239862,5.126448,70.176885,0.0179,140.353771
2,38378,73667,3.429455,8.384321,0.021702,16.768642
4,66037,43018,1.395152,7.995708,0.021509,15.991417
3,46820,75768,1.601449,7.317356,0.01814,14.634712
1,33615,64313,1.828985,5.839032,0.021717,11.678064
