# Aggie Reuse Store FQ '21: Sales Data Cleaning

Read in the entire sales data for the quarter, standardize it, and output a clean .csv for other analyses.

In [22]:
# Libraries
import pandas as pd

In [26]:
# Load the quarter's raw sales export into `df` and preview the first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column -- presumably a row index
# that was written into the CSV; confirm whether it should be dropped.
df = pd.read_csv(filepath_or_buffer='../Data/entire.csv')
df.head(n=5)

Unnamed: 0,Date,Item,Total Price
0,10/1/21,short skirt,$3.00
1,10/1/21,blouses,$5.00
2,10/1/21,short sleeve,$3.00
3,10/1/21,earrings (pair),$3.00
4,10/1/21,blouses,$5.00


In [27]:
# Survey the categorical 'Item' column before cleaning:
# first the distinct labels, then how often each label occurs.
print('Unique Items:', '\n')
print(sorted(df['Item'].unique()), '\n')  # sorted list of distinct labels
print('Number of Occurrences:', '\n')
print(df['Item'].value_counts())          # per-label frequency, most common first

Unique Items: 

['UCD t-shirts', 'blouse', 'blouses', 'botique item', 'boutique Item', 'boutique item', 'button down shirt', 'button-down work shirt', 'costume', 'dresses (all)', 'dresses(all)', 'earing (pair)', 'earings', 'earring (pair)', 'earrings (pair)', 'erasers', 'flashlight', 'glue sticks', 'hair clip', 'handbags', 'heels', 'highlighters', 'jackets', 'jeans', 'leggings', 'long skirt', 'long sleeve shirt', 'long sleeve shirts', 'notebook', 'notepads', 'pants', 'pencil sharpener', 'shirt', 'shoes', 'short skirt', 'short sleeve', 'short sleeve tops', 'shorts', 'slacks', 'sneakers', 'sun hat', 'sunglasses', 'sweater/cardigan', 'sweaters/cardigans', 't-shirt', 'tank-top', 'vase', 'water bottle', 'water bottles', 'windbreakers'] 

Number of Occurrences: 

boutique item             15
water bottle              12
long sleeve shirt         10
t-shirt                   10
shorts                    10
sweaters/cardigans         7
sweater/cardigan           5
notebook                   5


In [28]:
# Lookup table used to remap the 'Item' column so it matches the Main Price List.
# Each key is a label as it appears in the raw sales data; each value is the
# label it is rewritten to.
d = {
    'blouse':                 'blouses',
    'botique item':           'boutique item',
    'boutique Item':          'boutique item',
    # Button-down shirts aren't on the Main Price List? Could be a typo there,
    # because 'flannel' appears twice.
    'button down shirt':      'flannels',
    'button-down work shirt': 'flannels',
    # Costume (3 instances) is also not on the Main Price List.
    'costume':                'boutique item',
    'dresses(all)':           'dresses (all)',
    'earing (pair)':          'earring (pair)',
    'earings':                'earring (pair)',
    'earrings (pair)':        'earring (pair)',
    'flashlight':             'miscellaneous items',
    'jackets':                'Jackets',
    'long skirt':             'long skirts',
    'long sleeve shirt':      'long sleeve shirts',
    'notepads':               'notepad',
    'short skirt':            'short skirts',
    'short sleeve':           't-shirt',
    'short sleeve tops':      't-shirt',
    'sweater/cardigan':       'sweaters/cardigans',
    'water bottle':           'water bottles',
}

# Rewrite only the 'Item' column of `df`; labels with no entry in `d` are left as-is.
df['Item'] = df['Item'].replace(d)

# Re-run the same survey as before the remap to confirm the labels were merged.
print('Unique Items:', '\n')
print(sorted(df['Item'].unique()), '\n')
print('Number of Occurrences:', '\n')
print(df['Item'].value_counts())

Unique Items: 

['Jackets', 'UCD t-shirts', 'blouses', 'boutique item', 'dresses (all)', 'earring (pair)', 'erasers', 'flannels', 'glue sticks', 'hair clip', 'handbags', 'heels', 'highlighters', 'jeans', 'leggings', 'long skirts', 'long sleeve shirts', 'miscellaneous items', 'notebook', 'notepad', 'pants', 'pencil sharpener', 'shirt', 'shoes', 'short skirts', 'shorts', 'slacks', 'sneakers', 'sun hat', 'sunglasses', 'sweaters/cardigans', 't-shirt', 'tank-top', 'vase', 'water bottles', 'windbreakers'] 

Number of Occurrences: 

boutique item          21
water bottles          14
sweaters/cardigans     12
t-shirt                12
long sleeve shirts     11
shorts                 10
earring (pair)          7
flannels                6
dresses (all)           6
notebook                5
tank-top                5
blouses                 5
pants                   4
jeans                   3
shoes                   2
slacks                  2
long skirts             2
sunglasses              2


In [None]:
# TODO: read in the data dictionary and merge with the 'Category' attribute, check whether prices are correct, etc.