# Aggie Reuse Store FQ '21: Sales Data Cleaning

Read in the entire sales data for the quarter, standardize it, and output a clean .csv for other analyses.

In [1]:
# Libraries
import pandas as pd

In [2]:
# Load the Fall 2021 sales export and give its three columns standard names.
df = pd.read_csv('../Data/Fall2021_sales_toread.csv')
df.columns = ['Date', 'Item', 'Price']

# Preview goes last: only a cell's final expression is displayed, and this
# way the preview also reflects the renamed columns.
df.head()

In [3]:
# Inspect the raw 'Item' labels: the distinct names first, then how often each occurs
item_labels = df['Item']

print('Unique Items:', '\n')
distinct_labels = sorted(item_labels.unique())
print(distinct_labels, '\n')

print('Number of Occurrences:', '\n')
label_counts = item_labels.value_counts()
print(label_counts)

('Unique Items:', '\n')
([' special patch', ' t-shirt', 'UCD t-shirt', 'athletic shirt', 'backpack', 'bag', 'blazer', 'blouse', 'boutique item', 'bracelet', 'button-down shirt', 'coat', 'costume', 'dress', 'dress shoes', 'figurine', 'hand bag', 'hat', 'headwear', 'hoodie', 'jacket', 'jeans', 'laptop case', 'long skirt', 'long sleeve shirt', 'long-sleeve shirt', 'misc', 'necklace', 'pants', 'paperback book', 'pencil', 'pencil case', 'ring', 'ruler', 'shoes', 'short skirt', 'shorts', 'slacks', 'sneakers', 'special patch', 'sunglasses', 'sweater', 'sweatshirt', 't-shirt', 'tank-top', 'textbook', 'water bottle', 'windbreaker'], '\n')
('Number of Occurrences:', '\n')
t-shirt              38
pants                15
button-down shirt     6
dress                 5
sweater               5
long sleeve shirt     4
tank-top              4
water bottle          4
jeans                 4
paperback book        3
short skirt           3
misc                  2
shorts                2
ring             

In [4]:
# Load the price-list lookup table (columns: Category, Item, Clothing, Price)
# and preview its last five rows.
data_dict = pd.read_csv(filepath_or_buffer='./data_dict.csv')
data_dict.tail(n=5)

Unnamed: 0,Category,Item,Clothing,Price
160,sporting_goods,U bike lock,0,10.0
161,sporting_goods,bike seat,0,10.0
162,sporting_goods,tennis rackets,0,10.0
163,miscellaneous,misc,0,
164,boutique_item,boutique item,1,


In [5]:
# Collect every sold item label that matches neither an 'Item' nor a 'Category'
# entry of the price list; these are the labels that need remapping.
known_items = data_dict['Item'].values
known_categories = data_dict['Category'].values
to_rep = [
    label
    for label in df['Item'].unique()
    if not (label in known_items or label in known_categories)
]
to_rep
        
    

['figurine',
 'long-sleeve shirt',
 'costume',
 'backpack',
 'hat',
 ' special patch',
 'special patch',
 'bag',
 ' t-shirt']

In [6]:
# Create dictionary to remap 'Item' column to match Main Price List.
# Each key is a raw label that was not found in the price list; each value is
# the price-list label it should be folded into. Two keys carry a leading
# space on purpose: they match labels exactly as they appear in the raw data.
d = {
    'figurine': 'misc',
    'long-sleeve shirt': 'long sleeve shirt',
    'costume': 'boutique item',
    'backpack': 'misc',
    'hat': 'baseball cap',
    'bag': 'hand bag',  # was listed twice in the literal; one entry is enough
    ' special patch': 'misc',
    'special patch': 'misc',
    ' t-shirt': 't-shirt',
}

# Assign the remapped column back instead of using inplace=True; labels that
# are not keys of `d` pass through unchanged.
df['Item'] = df['Item'].replace(d)

print('(Re-mapped) Unique Items:', '\n')
print(sorted(df.Item.unique()), '\n')
print('(Re-mapped) Number of Occurrences:', '\n')
print(df.Item.value_counts())

('(Re-mapped) Unique Items:', '\n')
(['UCD t-shirt', 'athletic shirt', 'baseball cap', 'blazer', 'blouse', 'boutique item', 'bracelet', 'button-down shirt', 'coat', 'dress', 'dress shoes', 'hand bag', 'headwear', 'hoodie', 'jacket', 'jeans', 'laptop case', 'long skirt', 'long sleeve shirt', 'misc', 'necklace', 'pants', 'paperback book', 'pencil', 'pencil case', 'ring', 'ruler', 'shoes', 'short skirt', 'shorts', 'slacks', 'sneakers', 'sunglasses', 'sweater', 'sweatshirt', 't-shirt', 'tank-top', 'textbook', 'water bottle', 'windbreaker'], '\n')
('(Re-mapped) Number of Occurrences:', '\n')
t-shirt              40
pants                15
button-down shirt     6
long sleeve shirt     6
misc                  6
sweater               5
dress                 5
water bottle          4
jeans                 4
tank-top              4
short skirt           3
paperback book        3
hand bag              2
boutique item         2
sunglasses            2
hoodie                2
shorts                

In [7]:
# Re-check after remapping: list any item label that still matches neither an
# 'Item' nor a 'Category' entry of the price list.
item_column = data_dict['Item'].values
category_column = data_dict['Category'].values
to_rep = [
    label
    for label in df['Item'].unique()
    if not (label in item_column or label in category_column)
]
to_rep
        
    

[]

In [8]:
# Build an Item -> Category lookup from the already-loaded data dictionary;
# it is used to add a 'Category' attribute to the main data frame.
category_dict = data_dict.set_index('Item')['Category'].to_dict()

def categorize(row):
    """Return the category label for one sales row.

    Resolution order:
      1. the row's 'Item' is a key of ``category_dict`` -> its mapped category;
      2. the row's 'Item' is itself one of the category names -> returned as is;
      3. the row's 'Item' is 'boutique item' -> 'boutique_item';
      4. anything else -> 'uncategorized'.

    row: mapping-like object with an 'Item' entry (a DataFrame row when used
         with ``df.apply(..., axis=1)``).
    """
    name = row['Item']
    if name in category_dict:
        return category_dict[name]
    if name in category_dict.values():
        return name
    return 'boutique_item' if name == 'boutique item' else 'uncategorized'
    
# Classify each row (apply runs categorize once per row) and store the result
df['Category'] = df.apply(categorize, axis='columns')

In [9]:
# Preview the first 20 rows with the new 'Category' column
df.head(n=20)

Unnamed: 0,Date,Item,Price,Category
0,10/29/2021,tank-top,$3.00,short_sleeve_tops
1,10/29/2021,t-shirt,$3.00,short_sleeve_tops
2,10/29/2021,misc,$9.00,miscellaneous
3,10/29/2021,t-shirt,$3.00,short_sleeve_tops
4,10/29/2021,pants,$5.00,pants
5,10/29/2021,button-down shirt,$5.00,long_sleeves_blouses
6,10/29/2021,button-down shirt,$5.00,long_sleeves_blouses
7,10/29/2021,misc,$6.00,miscellaneous
8,10/29/2021,hoodie,$5.00,hoodies_coats
9,10/29/2021,pants,$5.00,pants


In [10]:
# Sanity check: show any rows the lookup could not classify.
# Hopefully this comes back empty and everything is well classified.
is_uncategorized = df['Category'] == 'uncategorized'
df.loc[is_uncategorized]

Unnamed: 0,Date,Item,Price,Category


In [11]:
# Add clothing y/n and suggested price from Main Price List to data frame.
# Step 1: Category -> Clothing flag lookup built from the data dictionary.
clothing_dict = data_dict.set_index('Category')['Clothing'].to_dict()
# Assuming boutique items are usually clothing (?), force their flag to 1
clothing_dict.update({'boutique_item': 1})

def clothing(row):
    """Return the clothing flag stored in ``clothing_dict`` for the row's 'Category'.

    Raises KeyError when the row's category has no entry in ``clothing_dict``.
    """
    category = row['Category']
    return clothing_dict[category]
    
# Look up the clothing flag row by row and store it as a new column
df['Clothing'] = df.apply(clothing, axis='columns')


# Item -> Price lookup built from the data dictionary
price_dict = data_dict.set_index('Item')['Price'].to_dict()

def suggested_price(row):
    """Return the price stored in ``price_dict`` for the row's 'Item'.

    Falls back to the string 'N/A' when the item has no entry in ``price_dict``.

    NOTE(review): the fallback is a string while the looked-up prices appear to
    be numeric, so the resulting column can hold mixed types. Confirm that
    downstream consumers expect this.
    """
    return price_dict.get(row['Item'], 'N/A')
    
# Attach the suggested price to every row, then preview the first 20 rows
df['Suggested Price'] = df.apply(suggested_price, axis='columns')
df.head(n=20)

Unnamed: 0,Date,Item,Price,Category,Clothing,Suggested Price
0,10/29/2021,tank-top,$3.00,short_sleeve_tops,1,3.0
1,10/29/2021,t-shirt,$3.00,short_sleeve_tops,1,3.0
2,10/29/2021,misc,$9.00,miscellaneous,0,
3,10/29/2021,t-shirt,$3.00,short_sleeve_tops,1,3.0
4,10/29/2021,pants,$5.00,pants,1,
5,10/29/2021,button-down shirt,$5.00,long_sleeves_blouses,1,5.0
6,10/29/2021,button-down shirt,$5.00,long_sleeves_blouses,1,5.0
7,10/29/2021,misc,$6.00,miscellaneous,0,
8,10/29/2021,hoodie,$5.00,hoodies_coats,1,5.0
9,10/29/2021,pants,$5.00,pants,1,


In [12]:
# Load the pop-up sales file and preview its first five rows
popup = pd.read_csv(filepath_or_buffer='../Data/popup.csv')
popup.head(n=5)

Unnamed: 0,Date,Item,Price,Category,Clothing,Suggested Price
0,9/24/21,jeans,$5.00,pants,1,5.0
1,9/24/21,boutique item,$6.00,boutique_item,1,
2,9/24/21,boutique item,$7.00,boutique_item,1,
3,9/24/21,button-down shirt,$5.00,long_sleeves_blouses,1,5.0
4,9/24/21,t-shirt,$3.00,short_sleeve_tops,1,3.0


In [13]:
# Compare (rows, columns) of the two frames before combining them
for frame in (popup, df):
    print(frame.shape)

(168, 6)
(138, 6)


In [14]:
# Stack the pop-up rows on top of the store rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# gives the same result (same row order, original index labels kept).
# Caution: df is reassigned from itself, so re-running this cell adds the
# pop-up rows again.
df = pd.concat([popup, df])


In [15]:
# Confirm the (rows, columns) of the combined frame after adding the popup rows
df.shape

(306, 6)

In [16]:
# Write the cleaned, combined sales data for other analyses; the row index is
# left out of the file.
df.to_csv(path_or_buf='../Data/clean_entire.csv', index=False)