# Aggie Reuse Store FQ '21: Sales Data Cleaning

Read in the entire sales data for the quarter, standardize it, and output a clean .csv for other analyses.

In [1]:
# Libraries
import pandas as pd

In [2]:
# Load the raw quarterly sales export into the main dataframe.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like an index
# that was saved with the CSV -- confirm whether it is needed downstream.
df = pd.read_csv(filepath_or_buffer='../Data/entire.csv')
df.head(n=5)

Unnamed: 0,Date,Item,Total Price
0,9/24/21,jeans,$5.00
1,9/24/21,boutique item,$6.00
2,9/24/21,boutique item,$7.00
3,9/24/21,button-down work shirt,$5.00
4,9/24/21,women's top,$3.00


In [3]:
# Inspect the categorical 'Item' column: every distinct raw spelling
# (sorted, so near-duplicates sit next to each other) and how often each occurs.
print('Unique Items:', '\n')
print(sorted(df['Item'].unique()), '\n')
print('Number of Occurrences:', '\n')
print(df['Item'].value_counts())

Unique Items: 

[' costume', 'UCD t-shirts', 'blouse', 'blouses', 'botique item', 'boutique Item', 'boutique item', 'button down shirt', 'button-down work shirt', 'chemistry set', 'costume', 'costume ', 'dresses (all)', 'dresses(all)', 'earing (pair)', 'earings', 'earring (pair)', 'earrings (pair)', 'erasers', 'flashlight', 'glue sticks', 'hair clip', 'handbags', 'heels', 'highlighters', 'jackets', 'jeans', 'leggings', 'long skirt', 'long sleeve shirt', 'long sleeve shirts', 'notebook', 'notepads', 'pants', 'pencil sharpener', 'shirt', 'shoes', 'short skirt', 'short sleeve', 'short sleeve top', 'short sleeve tops', 'shorts', 'slacks', 'sneakers', 'sun hat', 'sunglasses', 'sweater/cardigan', 'sweaters/cardigans', 't-shirt', 'tank-top', 'vase', 'water bottle', 'water bottle ', 'water bottles', 'windbreakers', "women's top"] 

Number of Occurrences: 

boutique item             18
water bottle              11
shorts                    11
t-shirt                   10
long sleeve shirt      

In [4]:
# Create dictionary to remap 'Item' column to match Main Price List.
# Keys are raw spellings seen in the sales data; values are the canonical
# item names (or, for items not on the list, the parent category).
# Leading/trailing-space variants (e.g. 'costume ', 'water bottle ') need no
# entries of their own: 'Item' is stripped before the mapping is applied.
d = {'blouse': 'blouses',
     'botique item': 'boutique item',
     'boutique Item': 'boutique item',
     'button down shirt': 'button-down work shirt',
     'chemistry set': 'school_office_supplies', # map unknown items to parent category
     'costume': 'boutique item', # costume (3 instances) not on Main Price List
     'dresses(all)': 'dresses (all)',
     'earing (pair)': 'earring (pair)',
     'earings': 'earring (pair)',
     'earrings (pair)': 'earring (pair)',
     'flashlight': 'household_goods',
     'hair clip': 'accessories',
     'jackets': 'Jackets',
     'long skirt': 'long skirts',
     'long sleeve shirt': 'long sleeve shirts',
     'notepads': 'notepad',
     'shirt': 'short_sleeve_tops',
     'short skirt': 'short skirts',
     'short sleeve': 't-shirt',
     'short sleeve top': 't-shirt',
     'short sleeve tops': 't-shirt',
     'sun hat': 'sun hats',
     'sweater/cardigan': 'sweaters/cardigans',
     'water bottle': 'water bottles',
     "women's top": 'short_sleeve_tops'}

# Strip stray whitespace, then replace whole values that match a key in `d`.
# The result is assigned back explicitly rather than mutating with inplace=True;
# re-running the cell is harmless because no canonical value is also a key.
df['Item'] = df['Item'].str.strip().replace(d)

# Re-print the same summary as before the remap to confirm the cleanup.
print('(Re-mapped) Unique Items:', '\n')
print(sorted(df.Item.unique()), '\n')
print('(Re-mapped) Number of Occurrences:', '\n')
print(df.Item.value_counts())

(Re-mapped) Unique Items: 

['Jackets', 'UCD t-shirts', 'accessories', 'blouses', 'boutique item', 'button-down work shirt', 'dresses (all)', 'earring (pair)', 'erasers', 'glue sticks', 'handbags', 'heels', 'highlighters', 'household_goods', 'jeans', 'leggings', 'long skirts', 'long sleeve shirts', 'notebook', 'notepad', 'pants', 'pencil sharpener', 'school_office_supplies', 'shoes', 'short skirts', 'short_sleeve_tops', 'shorts', 'slacks', 'sneakers', 'sun hats', 'sunglasses', 'sweaters/cardigans', 't-shirt', 'tank-top', 'vase', 'water bottles', 'windbreakers'] 

(Re-mapped) Number of Occurrences: 

boutique item             24
water bottles             14
sweaters/cardigans        13
t-shirt                   13
long sleeve shirts        11
shorts                    11
dresses (all)             11
earring (pair)             9
button-down work shirt     7
notebook                   6
blouses                    5
tank-top                   5
pants                      5
jeans           

In [5]:
# Read in the data dictionary and build an Item -> Category lookup from it;
# the lookup is used to add a 'Category' attribute to the main data frame.
data_dict = pd.read_csv('./data_dict.csv')
category_dict = data_dict.set_index('Item')['Category'].to_dict()

def categorize(row, categories=None):
    """Return the category label for one sales record.

    Parameters
    ----------
    row : mapping or pandas.Series
        A record exposing an 'Item' entry (e.g. a row passed by
        ``df.apply(categorize, axis=1)``).
    categories : dict, optional
        Item -> category lookup. When omitted, the notebook-level
        ``category_dict`` is used, so single-argument calls behave as before.

    Returns
    -------
    The mapped category when the item is a key of the lookup; the item itself
    when it is already one of the lookup's categories or is 'boutique item';
    otherwise the string 'uncategorized'.
    """
    if categories is None:
        # Fall back to the lookup built from the data dictionary.
        categories = category_dict

    item = row['Item']
    if item in categories:
        return categories[item]
    # Items that are already a category name, and boutique items, keep their label.
    if item in categories.values() or item == 'boutique item':
        return item
    return 'uncategorized'
    
# Label every row: categorize receives each row in turn and its result
# becomes that row's 'Category' value.
df['Category'] = df.apply(categorize, axis='columns')

In [6]:
# Preview the first 20 rows to spot-check the new 'Category' column.
df.iloc[:20]

Unnamed: 0,Date,Item,Total Price,Category
0,9/24/21,jeans,$5.00,pants
1,9/24/21,boutique item,$6.00,boutique item
2,9/24/21,boutique item,$7.00,boutique item
3,9/24/21,button-down work shirt,$5.00,long_sleeves_blouses
4,9/24/21,short_sleeve_tops,$3.00,short_sleeve_tops
5,9/24/21,short_sleeve_tops,$3.00,short_sleeve_tops
6,9/24/21,pants,$5.00,pants
7,9/24/21,t-shirt,$3.00,short_sleeve_tops
8,9/24/21,boutique item,$6.00,boutique item
9,9/24/21,dresses (all),$6.00,skirts_shorts_dresses


In [7]:
# Sanity check: show any rows that fell through to 'uncategorized'.
# Hopefully this comes back empty and everything is well classified.
df[df['Category'].eq('uncategorized')]

Unnamed: 0,Date,Item,Total Price,Category


In [8]:
# Write the cleaned, categorized sales data for use by other analyses.
# index=False leaves out the dataframe's own index; the 'Unnamed: 0' column
# seen in the previews is a regular column and is still written.
df.to_csv(path_or_buf='../Data/clean_entire.csv', index=False)