# Aggie Reuse Store FQ '21: Sales Data Cleaning

Read-in entire sales data for the quarter, standardize and output a clean .csv for other analyses.

In [1]:
# Libraries
import pandas as pd

In [2]:
# Dataframe
df = pd.read_csv('../Data/entire.csv')
df.head()

Unnamed: 0,Date,Item,Total Price
0,9/24/21,jeans,$5.00
1,9/24/21,boutique item,$6.00
2,9/24/21,boutique item,$7.00
3,9/24/21,button-down work shirt,$5.00
4,9/24/21,women's top,$3.00


In [3]:
# View categorical attributes
print('Unique Items:', '\n') 
print(sorted(df.Item.unique()), '\n')
print('Number of Occurrences:', '\n')
print(df.Item.value_counts())

Unique Items: 

[' costume', 'UCD t-shirts', 'blouse', 'blouses', 'botique item', 'boutique Item', 'boutique item', 'button down shirt', 'button-down work shirt', 'chemistry set', 'costume', 'costume ', 'dresses (all)', 'dresses(all)', 'earing (pair)', 'earings', 'earring (pair)', 'earrings (pair)', 'erasers', 'flashlight', 'glue sticks', 'hair clip', 'handbags', 'heels', 'highlighters', 'jackets', 'jeans', 'leggings', 'long skirt', 'long sleeve shirt', 'long sleeve shirts', 'notebook', 'notepads', 'pants', 'pencil sharpener', 'shirt', 'shoes', 'short skirt', 'short sleeve', 'short sleeve top', 'short sleeve tops', 'shorts', 'slacks', 'sneakers', 'sun hat', 'sunglasses', 'sweater/cardigan', 'sweaters/cardigans', 't-shirt', 'tank-top', 'vase', 'water bottle', 'water bottle ', 'water bottles', 'windbreakers', "women's top"] 

Number of Occurrences: 

boutique item             18
shorts                    11
water bottle              11
t-shirt                   10
long sleeve shirt      

In [7]:
# Create dictionary to remap 'Item' column to match Main Price List
d = {'blouse': 'blouses', 
     'botique item': 'boutique item',
     'boutique Item': 'boutique item', 
     'button down shirt': 'flannels', 
     # ^^ button down shirts aren't on Main Price List? Could be typo bc 'flannel' appears twice
     'button-down work shirt': 'flannels',
     'chemistry set': 'school_office_supplies', # map unknown items to parent category
     'costume': 'boutique item', # costume (3 instances) also not on Main Price List
     ' costume': 'boutique item',
     'costume ': 'boutique item',
     'dresses(all)': 'dresses (all)',
     'earing (pair)':'earring (pair)',
     'earings':'earring (pair)',
     'earrings (pair)':'earring (pair)',
     'flashlight': 'household_goods', 
     'hair clip': 'accessories',
     'jackets': 'Jackets',
     'long skirt': 'long skirts',
     'long sleeve shirt': 'long sleeve shirts',
     'notepads': 'notepad',
     'short skirt': 'short skirts',
     'short sleeve': 't-shirt',
     'short sleeve tops': 't-shirt',
     'sweater/cardigan': 'sweaters/cardigans',
     'water bottle': 'water bottles',
     'water bottle ': 'water bottles',
     "women's top": 'short_sleeve_tops'}

df.replace({'Item': d}, inplace = True)

print('Unique Items:', '\n') 
print(sorted(df.Item.unique()), '\n')
print('Number of Occurrences:', '\n')
print(df.Item.value_counts())

Unique Items: 

['Jackets', 'UCD t-shirts', 'blouses', 'boutique item', 'bracelet', 'dresses (all)', 'earring (pair)', 'erasers', 'flannels', 'glue sticks', 'handbags', 'heels', 'highlighters', 'jeans', 'leggings', 'long skirts', 'long sleeve shirts', 'miscellaneous items', 'notebook', 'notepad', 'pants', 'pencil sharpener', 'school_office_supplies', 'shirt', 'shoes', 'short skirts', 'short sleeve top', 'short_sleeve_tops', 'shorts', 'slacks', 'sneakers', 'sun hat', 'sunglasses', 'sweaters/cardigans', 't-shirt', 'tank-top', 'vase', 'water bottles', 'windbreakers'] 

Number of Occurrences: 

boutique item             24
water bottles             14
sweaters/cardigans        13
t-shirt                   12
shorts                    11
long sleeve shirts        11
dresses (all)             11
earring (pair)             9
flannels                   7
notebook                   6
tank-top                   5
blouses                    5
pants                      5
jeans                    

In [8]:
# TODO: read-in data dictionary and merge with 'Category' attribute, see if prices are correct, etc.
data_dict = pd.read_csv('./data_dict.csv')



Unnamed: 0,Category,Item,Clothing,Price
0,headwear,sun hats,1,3.0
1,headwear,beanies,1,3.0
2,headwear,baseball caps,1,3.0
3,hoodies_coats,Jackets,1,5.0
4,hoodies_coats,windbreakers,1,5.0
