In [2]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

%matplotlib inline

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides pandas deprecation and copy warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

The data set used is the [Government of California's 2012-2015 Purchase Orders](https://data.ca.gov/dataset/purchase-order-data/resource/bb82edc5-9c78-44e2-8947-68ece26197c5), which are mapped to the United Nations Standard Products & Services Code (UNSPSC). A Group Title column (Goods or Services) was added in Excel (VLOOKUP) based on Segment Title.
 

In [3]:
# load the raw purchase-order export into a DataFrame
df = pd.read_csv(filepath_or_buffer='GoC Spend Data.csv', encoding='utf-8')

#### Initial Exploration

The dataset is made up of 344,504 rows (line items) with 33 columns comprised of 9 float and 23 object datatypes. The dataset has numerous columns specific to the Government of California (e.g. Acquisition Method), which are not relevant for this capstone and therefore will not be explored further.

In [4]:
# peek at the first five rows of the dataset
df.head()

Unnamed: 0,Creation Date,Purchase Date,Fiscal Year,LPA Number,Purchase Order Number,Requisition Number,Acquisition Type,Sub-Acquisition Type,Acquisition Method,Sub-Acquisition Method,...,Commodity Title,Class,Class Title,Family,Family Title,Segment,Segment Title,Group Title,Location,REMOVE AMERISOURCE
0,5/14/2014,5/14/2014,2013-2014,,W0000828,,NON-IT Goods,,Fair and Reasonable,,...,Nurses uniforms,53102700.0,Uniforms,53100000.0,Clothing,53000000.0,Apparel and Luggage and Personal Care Products,Goods,"91789\n(34.019668, -117.856008)",
1,8/2/2013,7/18/2013,2013-2014,,4500190668,,NON-IT Goods,,Informal Competitive,,...,Hats,53102500.0,Clothing accessories,53100000.0,Clothing,53000000.0,Apparel and Luggage and Personal Care Products,Goods,98733\n,
2,10/18/2013,10/18/2013,2013-2014,1-12-65-65-01-E,W0000487,,NON-IT Goods,,Statewide Contract,,...,Adult diapers,53102300.0,Undergarments,53100000.0,Clothing,53000000.0,Apparel and Luggage and Personal Care Products,Goods,,
3,3/6/2014,3/6/2014,2013-2014,,065D3010,,NON-IT Goods,,SB/DVBE Option,,...,Police uniforms,53102700.0,Uniforms,53100000.0,Clothing,53000000.0,Apparel and Luggage and Personal Care Products,Goods,"92101\n(32.72112, -117.166986)",
4,6/13/2014,5/22/2014,2013-2014,,13-501-0338,,NON-IT Goods,,Informal Competitive,,...,Adult diapers,53102300.0,Undergarments,53100000.0,Clothing,53000000.0,Apparel and Luggage and Personal Care Products,Goods,,


In [5]:
# check dataset shape as (rows, columns)
df.shape

(344504, 33)

In [6]:
# inspect dataset make-up: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344504 entries, 0 to 344503
Data columns (total 33 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Creation Date            344504 non-null  object 
 1   Purchase Date            327083 non-null  object 
 2   Fiscal Year              344504 non-null  object 
 3   LPA Number               90897 non-null   object 
 4   Purchase Order Number    344504 non-null  object 
 5   Requisition Number       14366 non-null   object 
 6   Acquisition Type         344504 non-null  object 
 7   Sub-Acquisition Type     68334 non-null   object 
 8   Acquisition Method       344504 non-null  object 
 9   Sub-Acquisition Method   30883 non-null   object 
 10  Department Name          344504 non-null  object 
 11  Supplier Code            344468 non-null  float64
 12  Supplier Name            344468 non-null  object 
 13  Supplier Qualifications  141745 non-null  object 
 14  Supp

# drop the taxonomy title columns that are not used later in the notebook.
# 'Segment Title' and 'Group Title' are deliberately kept: the column selection,
# the "not categorized" filter and the Goods/Services dummy further down all read
# them, so dropping them here makes a fresh top-to-bottom run fail with a KeyError.
drop_target_columns = ['Family Title', 'Class Title', 'Commodity Title']

# one drop call instead of re-assigning df once per column
df = df.drop(columns=drop_target_columns)

In [7]:
# summarise the object (string) columns: count, unique values, most frequent value and its frequency
df.describe(include=['object']).T

Unnamed: 0,count,unique,top,freq
Creation Date,344504,1015,6/27/2014,1531
Purchase Date,327083,2266,7/1/2014,4433
Fiscal Year,344504,3,2013-2014,120158
LPA Number,90897,1418,7-11-51-02,9267
Purchase Order Number,344504,196899,4500211314,602
Requisition Number,14366,5996,REQ0008872,123
Acquisition Type,344504,5,NON-IT Goods,213578
Sub-Acquisition Type,68334,25,Personal Services,16104
Acquisition Method,344504,20,Informal Competitive,82046
Sub-Acquisition Method,30883,16,Fleet,14148


In [8]:
# re-check the (rows, columns) shape before selecting columns
df.shape

(344504, 33)

Kept only the relevant columns to reduce the dataset size and the computational requirements for cleaning. While the model only uses Supplier Name, Item Name and Item Description as features and Segment as the target, Quantity, Unit Price and Total Price were kept for future analysis and cleaning.

In [9]:
# restrict the frame to the columns used for cleaning, modelling and later analysis
relevant_columns = [
    'Quantity',
    'Unit Price',
    'Total Price',
    'Supplier Name',
    'Item Name',
    'Item Description',
    'Segment Title',
    'Group Title',
]
df = df.loc[:, relevant_columns]

#### Cleaning Process

In [10]:
# display the frame after column selection (the rendered output is truncated with "..." rows)
df

Unnamed: 0,Quantity,Unit Price,Total Price,Supplier Name,Item Name,Item Description,Segment Title,Group Title
0,1.0,3593.00,3593.00,"A&M Uniforms, Inc.",uniforms,uniforms,Apparel and Luggage and Personal Care Products,Goods
1,1.0,995.04,995.04,IBHP INCORPORATED,hat,hat,Apparel and Luggage and Personal Care Products,Goods
2,1.0,2777.60,2777.60,McKesson Medical - Surgical Minnesota Su,adult diapers,adult diapers,Apparel and Luggage and Personal Care Products,Goods
3,100.0,4.22,422.00,SUCCOR INTERNATIONAL,"Flashlight Pouch, Black",Condor # MA48-002,Apparel and Luggage and Personal Care Products,Goods
4,60.0,24.00,1440.00,Edwards Medical,Large Pullup Breifs,Large Pullup Breifs - Covidien,Apparel and Luggage and Personal Care Products,Goods
...,...,...,...,...,...,...,...,...
344499,1.0,5784.95,5784.95,Pressman's Pride,Ink,Ink,,
344500,1.0,1047.00,1047.00,Capital Datacorp,Lansweeper Premium Subscription - 1 year,Lansweeper Premium Subscription - 1 year,,
344501,1.0,5237.45,5237.45,AVANT PRINTING,artwork frames,artwork frames,,
344502,1.0,6292.66,6292.66,Technology Integration Group,Dell Powervault 114X,Dell Powervault 114X,,


In [11]:
# Each rule below keeps the rows that pass it; the rules are applied in the
# same order as before so every intermediate frame is unchanged.

# keep only rows that have been categorized
df = df.loc[df['Segment Title'].notna()]

# remove rows whose total price is zero or negative
non_positive_price = df['Total Price'].astype(float) <= 0
df = df.loc[~non_positive_price]

# remove suppliers listed as 'Unknown'
df = df.loc[~(df['Supplier Name'] == 'Unknown')]

# remove rows whose item description is 'confidential'
df = df.loc[~(df['Item Description'] == 'confidential')]

# remove rows missing the item name, item description or supplier name
df = df.dropna(subset=['Item Name', 'Item Description', 'Supplier Name'])

In [12]:
# Replace the string 'Group Title' column with a binary 'Services' indicator.
#
# The indicator is built with an explicit comparison instead of
# pd.get_dummies(..., drop_first=True): get_dummies names the surviving column
# after whichever category sorts second, produces extra columns if a third
# category ever appears, and returns bool instead of 0/1 on pandas >= 2.0.
# Later cells hard-code the column name 'Services' and compare it to 1 / 0.

# binary column with Services as 1 and everything else (Goods, missing) as 0
dummy_group_title = (df['Group Title'] == 'Services').astype(int).to_frame(name='Services')

# drop the original string column and attach the indicator by row index
df = df.drop(columns='Group Title').join(dummy_group_title)

In [13]:
# display the frame to confirm the Services indicator replaced Group Title
df

Unnamed: 0,Quantity,Unit Price,Total Price,Supplier Name,Item Name,Item Description,Segment Title,Services
0,1.0,3593.00,3593.00,"A&M Uniforms, Inc.",uniforms,uniforms,Apparel and Luggage and Personal Care Products,0
1,1.0,995.04,995.04,IBHP INCORPORATED,hat,hat,Apparel and Luggage and Personal Care Products,0
2,1.0,2777.60,2777.60,McKesson Medical - Surgical Minnesota Su,adult diapers,adult diapers,Apparel and Luggage and Personal Care Products,0
3,100.0,4.22,422.00,SUCCOR INTERNATIONAL,"Flashlight Pouch, Black",Condor # MA48-002,Apparel and Luggage and Personal Care Products,0
4,60.0,24.00,1440.00,Edwards Medical,Large Pullup Breifs,Large Pullup Breifs - Covidien,Apparel and Luggage and Personal Care Products,0
...,...,...,...,...,...,...,...,...
341206,1408.0,0.95,1341.12,McGinty Sales Company,RMA Cheese Lunch,RMA Cheese Lunch,Travel and Food and Lodging and Entertainment ...,1
341207,160.0,24.51,3921.60,Scofield Catering & Management,Fire Meal-Dinner,Fire Meal Dinner,Travel and Food and Lodging and Entertainment ...,1
341208,1.0,180.00,180.00,"Class Act Alliance, Inc.",Interpreter,Contractor will provide a Captioner for a tele...,Travel and Food and Lodging and Entertainment ...,1
341209,27000.0,1.20,32400.00,McGinty Sales Company,Box Lunch Meat/Cheese,Box Lunch Meat/Cheese,Travel and Food and Lodging and Entertainment ...,1


In [14]:
# the segment title is the modelling target, so name it accordingly
df = df.rename({'Segment Title': 'Segment Target'}, axis='columns')

In [15]:
# display the frame to confirm 'Segment Title' now appears as 'Segment Target'
df

Unnamed: 0,Quantity,Unit Price,Total Price,Supplier Name,Item Name,Item Description,Segment Target,Services
0,1.0,3593.00,3593.00,"A&M Uniforms, Inc.",uniforms,uniforms,Apparel and Luggage and Personal Care Products,0
1,1.0,995.04,995.04,IBHP INCORPORATED,hat,hat,Apparel and Luggage and Personal Care Products,0
2,1.0,2777.60,2777.60,McKesson Medical - Surgical Minnesota Su,adult diapers,adult diapers,Apparel and Luggage and Personal Care Products,0
3,100.0,4.22,422.00,SUCCOR INTERNATIONAL,"Flashlight Pouch, Black",Condor # MA48-002,Apparel and Luggage and Personal Care Products,0
4,60.0,24.00,1440.00,Edwards Medical,Large Pullup Breifs,Large Pullup Breifs - Covidien,Apparel and Luggage and Personal Care Products,0
...,...,...,...,...,...,...,...,...
341206,1408.0,0.95,1341.12,McGinty Sales Company,RMA Cheese Lunch,RMA Cheese Lunch,Travel and Food and Lodging and Entertainment ...,1
341207,160.0,24.51,3921.60,Scofield Catering & Management,Fire Meal-Dinner,Fire Meal Dinner,Travel and Food and Lodging and Entertainment ...,1
341208,1.0,180.00,180.00,"Class Act Alliance, Inc.",Interpreter,Contractor will provide a Captioner for a tele...,Travel and Food and Lodging and Entertainment ...,1
341209,27000.0,1.20,32400.00,McGinty Sales Company,Box Lunch Meat/Cheese,Box Lunch Meat/Cheese,Travel and Food and Lodging and Entertainment ...,1


#### Feature Engineering

Segment Target was reduced from 55 to 26 segment categories. This was accomplished by grouping the bottom 20% of categories by total rows into either Other Goods or Other Services based on their commodity type. But first those categories needed to be found. A 47% reduction in spend categories resulted in aggregating only 7% of total spend. This was accomplished through the steps below.

In [16]:
# Per (Segment Target, Services) pair: total spend ('sum') and number of rows
# ('count') of Total Price, ordered from the most to the least frequent pair.
segment_target_df = (
    df.groupby(['Segment Target', 'Services'])['Total Price']
    .agg(['sum', 'count'])
    .reset_index()
    .sort_values(by='count', ascending=False)
)

# grand totals used as denominators for the cumulative percentages
total_row_count = segment_target_df['count'].sum()
total_spend = segment_target_df['sum'].sum()

# running totals and running percentages, for row counts first and spend second
segment_target_df = segment_target_df.assign(
    cum_sum_count=lambda t: t['count'].cumsum(),
    cum_perc_count=lambda t: (100 * t['cum_sum_count'] / total_row_count).round(1),
    cum_sum_sum=lambda t: t['sum'].cumsum(),
    cum_perc_sum=lambda t: (100 * t['cum_sum_sum'] / total_spend).round(1),
)

# display
segment_target_df

Unnamed: 0,Segment Target,Services,sum,count,cum_sum_count,cum_perc_count,cum_sum_sum,cum_perc_sum
26,Information Technology Broadcasting and Teleco...,0,1928122000.0,30275,30275,9.3,1928122000.0,1.3
19,Food Beverage and Tobacco Products,0,466071200.0,27682,57957,17.8,2394193000.0,1.6
39,Office Equipment and Accessories and Supplies,0,125090000.0,27007,84964,26.0,2519283000.0,1.7
14,Engineering and Research and Technology Based ...,1,4270134000.0,16589,101553,31.1,6789417000.0,4.6
33,Medical Equipment and Accessories and Supplies,0,299421100.0,16231,117784,36.1,7088838000.0,4.8
20,Fuels and Fuel Additives and Lubricants and An...,0,322231400.0,15989,133773,41.0,7411069000.0,5.0
41,Paper Materials and Products,0,133683600.0,11054,144827,44.4,7544753000.0,5.1
5,Commercial and Military and Private Vehicles a...,0,1097559000.0,10314,155141,47.5,8642312000.0,5.9
11,Education and Training Services,1,3289646000.0,9959,165100,50.6,11931960000.0,8.1
6,Defense and Law Enforcement and Security and S...,0,340619400.0,9943,175043,53.6,12272580000.0,8.3


Using a nested where clause on segments to group the bottom 20% of segments into either Other Goods or Other Services based on commodity type.

In [17]:
# Segments past the 80.5% cumulative row share are collapsed into a catch-all
# label chosen by commodity type; every other segment keeps its own title.
in_long_tail = segment_target_df['cum_perc_count'] > 80.5
tail_services = in_long_tail & (segment_target_df['Services'] == 1)
tail_goods = in_long_tail & (segment_target_df['Services'] == 0)

segment_target_df['Reduced Segment Target'] = (
    segment_target_df['Segment Target']
    .mask(tail_goods, 'Other Goods')
    .mask(tail_services, 'Other Services')
)



In [18]:
segment_target_df

Unnamed: 0,Segment Target,Services,sum,count,cum_sum_count,cum_perc_count,cum_sum_sum,cum_perc_sum,Reduced Segment Target
26,Information Technology Broadcasting and Teleco...,0,1928122000.0,30275,30275,9.3,1928122000.0,1.3,Information Technology Broadcasting and Teleco...
19,Food Beverage and Tobacco Products,0,466071200.0,27682,57957,17.8,2394193000.0,1.6,Food Beverage and Tobacco Products
39,Office Equipment and Accessories and Supplies,0,125090000.0,27007,84964,26.0,2519283000.0,1.7,Office Equipment and Accessories and Supplies
14,Engineering and Research and Technology Based ...,1,4270134000.0,16589,101553,31.1,6789417000.0,4.6,Engineering and Research and Technology Based ...
33,Medical Equipment and Accessories and Supplies,0,299421100.0,16231,117784,36.1,7088838000.0,4.8,Medical Equipment and Accessories and Supplies
20,Fuels and Fuel Additives and Lubricants and An...,0,322231400.0,15989,133773,41.0,7411069000.0,5.0,Fuels and Fuel Additives and Lubricants and An...
41,Paper Materials and Products,0,133683600.0,11054,144827,44.4,7544753000.0,5.1,Paper Materials and Products
5,Commercial and Military and Private Vehicles a...,0,1097559000.0,10314,155141,47.5,8642312000.0,5.9,Commercial and Military and Private Vehicles a...
11,Education and Training Services,1,3289646000.0,9959,165100,50.6,11931960000.0,8.1,Education and Training Services
6,Defense and Law Enforcement and Security and S...,0,340619400.0,9943,175043,53.6,12272580000.0,8.3,Defense and Law Enforcement and Security and S...


In [19]:
# encode each reduced segment label as an integer code (in order of first
# appearance) and store the code as a string for modelling
category_codes, _ = pd.factorize(segment_target_df['Reduced Segment Target'])
segment_target_df['Segment Target Cat'] = pd.Series(
    category_codes, index=segment_target_df.index
).astype(str)

In [20]:
# Attach the reduced segment label and its category code to every line item.
#
# segment_target_df holds one row per (Segment Target, Services) pair, so the
# join uses both keys. Joining on 'Segment Target' alone would duplicate line
# items for any segment that occurs under both Goods and Services.
# validate='many_to_one' makes pandas raise if the lookup keys are ever not unique.
df = pd.merge(
    df,
    segment_target_df[['Segment Target', 'Services', 'Reduced Segment Target', 'Segment Target Cat']],
    how='left',
    on=['Segment Target', 'Services'],
    validate='many_to_one',
)


Since the original dataset did not have a Goods/Services column, that column was dropped.

In [21]:
# remove the Goods/Services indicator and the un-reduced segment title
df = df.drop(columns=['Services', 'Segment Target'])

In [22]:
# the reduced label takes over the 'Segment Target' name
df = df.rename({'Reduced Segment Target': 'Segment Target'}, axis='columns')

#### Finalized Cleaned Dataset

Check to see if there are any nulls, the correct columns are present in the correct data type.  
  
The cleaned dataset has 326,308 rows with 8 columns, 3 float and 5 object.

In [29]:
# check info: row count, column names, non-null counts and dtypes of the cleaned frame
# NOTE(review): this cell's execution count (29) is out of sequence with its neighbours (22, 23) — re-run top to bottom before sharing
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 326308 entries, 0 to 326307
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Quantity            326308 non-null  float64
 1   Unit Price          326308 non-null  float64
 2   Total Price         326308 non-null  float64
 3   Supplier Name       326308 non-null  object 
 4   Item Name           326308 non-null  object 
 5   Item Description    326308 non-null  object 
 6   Segment Target      326308 non-null  object 
 7   Segment Target Cat  326308 non-null  object 
dtypes: float64(3), object(5)
memory usage: 22.4+ MB


In [23]:
# count missing values per column; every count should be zero
df.isna().sum()

Quantity              0
Unit Price            0
Total Price           0
Supplier Name         0
Item Name             0
Item Description      0
Segment Target        0
Segment Target Cat    0
dtype: int64

In [24]:
# summary statistics for the object (string) columns of the cleaned frame
df.describe(include=['object']).T

Unnamed: 0,count,unique,top,freq
Supplier Name,326308,24430,Voyager Fleet Systems Inc,13756
Item Name,326308,170898,Medical Supplies,2882
Item Description,326308,207616,Medical Supplies,1361
Segment Target,326308,26,Other Goods,49833
Segment Target Cat,326308,26,24,49833


In [25]:
# summary statistics for the numeric (float) columns, one row per column
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,326308.0,2043.375498,100873.9,0.004,1.0,1.0,7.0,20000000.0
Unit Price,326308.0,447573.094654,21833210.0,0.0,41.0,600.0,10189.35,7337038000.0
Total Price,326308.0,451793.696625,21833250.0,0.01,352.0,3851.955,14880.0,7337038000.0


In [26]:
# final look at the first five rows before saving
df.head(n=5)

Unnamed: 0,Quantity,Unit Price,Total Price,Supplier Name,Item Name,Item Description,Segment Target,Segment Target Cat
0,1.0,3593.0,3593.0,"A&M Uniforms, Inc.",uniforms,uniforms,Other Goods,24
1,1.0,995.04,995.04,IBHP INCORPORATED,hat,hat,Other Goods,24
2,1.0,2777.6,2777.6,McKesson Medical - Surgical Minnesota Su,adult diapers,adult diapers,Other Goods,24
3,100.0,4.22,422.0,SUCCOR INTERNATIONAL,"Flashlight Pouch, Black",Condor # MA48-002,Other Goods,24
4,60.0,24.0,1440.0,Edwards Medical,Large Pullup Breifs,Large Pullup Breifs - Covidien,Other Goods,24


In [27]:
# write the cleaned frame to disk without the index; the Modelling notebook reads this file
df.to_csv(
    path_or_buf='GoC Spend Data Cleaned (Quantity, Unit Price, Total Price, Supplier Name, Item Name, Item Description, UNSPSC Segment).csv',
    index=False,
)

#### Modelling process can be found in the Modelling notebook