In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dask.dataframe as dd
import dask.array as da
import dask.bag as db

In [4]:
def data_prep(columns, filepath="./data/parquet/data-*.parquet"):
    ddf = dd.read_parquet("./data/parquet/data-*.parquet")
    
    return ddf.loc[:, columns]

In [99]:
ddf = dd.read_parquet("./data/parquet/data-*.parquet")

In [3]:
ddf.columns

Index(['index', 'Invoice/Item Number', 'Date', 'Store Number', 'Store Name',
       'Address', 'City', 'Zip Code', 'Store Location', 'County Number',
       'County', 'Category', 'Category Name', 'Vendor Number', 'Vendor Name',
       'Item Number', 'Item Description', 'Pack', 'Bottle Volume (ml)',
       'State Bottle Cost', 'State Bottle Retail', 'Bottles Sold',
       'Sale (Dollars)', 'Volume Sold (Liters)', 'Volume Sold (Gallons)'],
      dtype='object')

In [3]:
ddf.shape[0].compute()

23346088

In [380]:
invoice_info = ['Invoice/Item Number', 'Date', 'Store Number', 'County Number', 'Vendor Number', 'Item Number', 'Bottles Sold']

In [6]:
county_info = ['County Number', 'County']
vendor_info = ['Vendor Name', 'Vendor Number']
product_info = ['Item Number', 'Item Description', 'Pack',
                'Bottle Volume (ml)']
price_info = ['Item Number', 'State Bottle Cost', 'State Bottle Retail', "Date"]
store_info = ['Store Number', 'Store Name', 'Address', 'City', 'Zip Code', 
              'Store Location', 'County Number', "Date"]


In [381]:
ddf = data_prep(invoice_info)

In [383]:
ddf.to_parquet("./data/invoice/")

(None,)

## County Information Cleaning

In [5]:
ddf = ddf[county_info]
ddf = ddf.drop_duplicates().compute()

In [21]:
# changes to pd dataframe when size is small enough
# ddf[ddf['County'].isna()] = pd.Series(['None', 'None'])

In [15]:
# strange to see EL PASO? not a county in Iowa.
ddf[ddf['County Number'].isna()]

Unnamed: 0,County Number,County
2405,,
15387,,EL PASO


In [8]:
# remove this, add an NA county number just in case
ddf = ddf[~ddf['County Number'].isna()]
ddf = ddf.append(pd.Series({'County Number': 999, 'County': 'Unknown'}), ignore_index=True)
ddf

Unnamed: 0,County Number,County
0,92.0,Washington
1,63.0,Marion
2,17.0,Cerro Gordo
3,82.0,Scott
4,57.0,Linn
...,...,...
196,26.0,DAVIS
197,13.0,CALHOUN
198,93.0,WAYNE
199,2.0,ADAMS


In [12]:
# just checking other stuff, one is missing the o in Cerro Gordo.
ddf[ddf['County'].str.contains('Cerro')]

Unnamed: 0,County Number,County
2,17.0,Cerro Gordo
158,17.0,Cerro Gord


In [13]:
# check if any other ones have that same issue
ddf[ddf['County'].str.contains(' ')]

Unnamed: 0,County Number,County
2,17.0,Cerro Gordo
5,29.0,Des Moines
8,11.0,Buena Vista
15,74.0,Palo Alto
19,7.0,Black Hawk
88,89.0,Van Buren
118,7.0,BLACK HAWK
130,17.0,CERRO GORD
131,74.0,PALO ALTO
139,29.0,DES MOINES


In [11]:
# they do not, just drop the duplicates, making sure it behaves before saving
ddf.loc[ddf['County Number'].drop_duplicates().index]

Unnamed: 0,County Number,County
0,92.0,Washington
1,63.0,Marion
2,17.0,Cerro Gordo
3,82.0,Scott
4,57.0,Linn
...,...,...
95,41.0,Hancock
96,61.0,Madison
97,87.0,Taylor
98,36.0,Fremont


In [14]:
# go through with dropping the rest, then change the type to int, no need to save the float info
ddf = ddf.loc[ddf['County Number'].drop_duplicates().index]
ddf.loc[:, 'County Number'] = ddf['County Number'].astype(int)
ddf.columns = ['CountyNumber', 'County']

In [16]:
ddf.to_csv('./data/county.csv')

Now onto the next sub-dataframe, we repeat a lot of steps but with every one we have to at least take the time to make sure we're not missing anything as we chop the data up.

## Vendor Information Cleaning

In [21]:
ddf = data_prep(vendor_info)
ddf = ddf.drop_duplicates().compute()
ddf.columns = ['VendorName', 'VendorNumber']

In [60]:
idx_na = ddf['VendorNumber'].isna()
name_na = ddf['VendorName'].isna()

In [61]:
ddf[idx_na]

Unnamed: 0,VendorName,VendorNumber
10224,Reservoir Distillery,


In [62]:
orderly_idx = ddf[~idx_na]['VendorNumber'].sort_values()

orderly_idx

19765     10.0
26872     10.0
22376     14.0
37747     27.0
36267     33.0
         ...  
4760     977.0
8        978.0
29948    978.0
38618    987.0
6001     999.0
Name: VendorNumber, Length: 555, dtype: float64

In [65]:
# you can see from above that I could have chosen any number 20 or below,
# that was not 10, or 14. We choose 20, because we feel like it.
ddf[idx_na] = pd.Series(['Reservoir Distillery', 20])
ddf[idx_na]

Unnamed: 0,VendorName,VendorNumber
10224,Reservoir Distillery,20.0


In [66]:
ddf['VendorNumber'] = ddf['VendorNumber'].astype(int)

In [1]:
# seems like there are a lot more names than numbers
len(ddf['VendorName'].unique()) - len(ddf['VendorNumber'].unique())

NameError: name 'ddf' is not defined

In [83]:
# making sure that this works
np.where(ddf.VendorNumber.value_counts() > 1)
ddf.VendorNumber.value_counts().index[np.where(ddf.VendorNumber.value_counts() > 1)]

Int64Index([803, 214, 391, 192,  79, 114, 478, 389, 255, 154,
            ...
            208, 977, 198, 459, 107, 226, 381, 978, 495, 346],
           dtype='int64', length=126)

In [84]:
# look through the duped ids to make sure that 
dup_idx = ddf.VendorNumber.value_counts().index[np.where(ddf.VendorNumber.value_counts() > 1)]
sorted_ddf = ddf[ddf['VendorNumber'].isin(dup_idx)].sort_values('VendorNumber')

In [88]:
sorted_ddf = ddf[ddf['VendorNumber'].isin(dup_idx)].sort_values('VendorNumber')

In [94]:
# doing some manual peeking here, just to check how the data looks
sorted_ddf.iloc[100:120,:]

Unnamed: 0,VendorName,VendorNumber
27644,RUSSIAN STANDARD VODKA,239
23,"WILLIAM GRANT AND SONS, INC.",240
10594,"William Grant and Sons, Inc.",240
27053,William Grant & Sons Inc,240
24858,Filibuster Barrels LLC,244
41545,Dilawri Barrels LLC,244
207,Wilson Daniels Ltd.,255
22023,WILSON DANIELS LTD,255
28722,Infinium Spirits,255
14985,HAAS BROTHERS,256


In [95]:
ddf = ddf.drop_duplicates('VendorNumber')

In [35]:
# throw in a good ol' "missing" Unknown value.
ddf[name_na] = pd.Series(['Unknown', 999])

In [36]:
ddf[name_na]

Unnamed: 0,VendorName,VendorNumber


In [49]:
ddf[ddf['VendorName'].str.contains('Llc')]

Unnamed: 0,VendorName,VendorNumber


In [48]:
# fix the LLC to be the same as the other formatting
ddf[ddf['VendorName'].str.contains('Llc')] = pd.Series(['Fire Tail Brands, LLC', 194])

In [97]:
ddf.to_csv('./data/vendor.csv')

And onto the next one!

## Price Information Cleaning
- This is still needed even if you are using the Iowa Liquor Products db, since there will be discontinued items in the invoices if you look at the invoices for all time.

In [7]:
ddf = data_prep(price_info)
ddf.columns = ['ItemNumber', 'StateBottleCost', 'StateBottleRetail', 'Date']

In [8]:
ddf.columns

Index(['ItemNumber', 'StateBottleCost', 'StateBottleRetail', 'Date'], dtype='object')

#ddf.drop_duplicates()

want unique prices for each item
then for each unique row look back and get a subset of the data that has that price
look for earliest and latest date to give a range / idea of what the price was
update again with range over the value in date

In [9]:
sub_ddf = ddf.drop_duplicates(ddf.columns[:3]).compute()

In [14]:
sub_ddf.Date = pd.to_datetime(sub_ddf['Date'])

Unnamed: 0,ItemNumber,StateBottleCost,StateBottleRetail,Date
0,35926,3.37,5.06,2013-05-30
1,23824,2.00,2.99,2014-03-27
2,12888,8.98,13.47,2014-09-04
3,48106,18.99,28.49,2015-04-15
4,53216,6.29,9.44,2013-08-15
...,...,...,...,...
9912,903245,3.38,5.07,2012-05-02
18427,904664,59.00,88.50,2012-02-13
27314,69661,2.67,4.00,2012-01-30
35285,900624,4.93,7.39,2013-10-24


In [112]:
sub_ddf.columns = ['ItemNumber', 'StateBottleCost', 'StateBottleRetail']

In [18]:
# looks like we can just go with the most recent data, but we'll save this df out just
# in case we want to do an analysis based on time periods of different prices.
sub_ddf[sub_ddf.ItemNumber.duplicated(keep=False)].sort_values(['ItemNumber', 'Date'], ascending = False)

Unnamed: 0,ItemNumber,StateBottleCost,StateBottleRetail,Date
23244,998546,22.50,33.75,2019-10-30
39375,998546,21.32,31.98,2018-10-02
17179,998074,24.00,36.00,2021-12-13
3143,998074,23.32,34.98,2021-10-19
14637,998071,24.00,36.00,2021-12-13
...,...,...,...,...
13401,100040,15.46,34.97,2019-09-26
13497,100040,14.50,34.97,2019-09-25
41041,100040,5.23,34.97,2019-09-24
14729,100017,23.62,35.43,2017-12-27


In [20]:
sub_ddf.to_csv('./data/unused/price_by_dates.csv')

In [21]:
sub_ddf.sort_values(['ItemNumber', 'Date'], ascending = False).drop_duplicates('ItemNumber')

Unnamed: 0,ItemNumber,StateBottleCost,StateBottleRetail,Date
25963,x904631,14.55,21.83,2016-10-11
34718,999995,17.00,25.50,2022-01-31
23430,999994,521.66,782.49,2022-02-08
12284,999993,21.25,31.88,2021-11-04
2798,999992,45.00,67.50,2021-09-10
...,...,...,...,...
14729,100017,23.62,35.43,2017-12-27
28373,100015,30.00,45.00,2019-03-11
38559,100006,23.32,34.98,2016-07-06
9973,100005,21.98,32.97,2016-10-05


In [None]:
# we want to make sure that it runs for a small section before unleashing it on the larger db.
# with so much data this cell will run for a while, but for the extra date information we can
# wait for the code to resolve.
%time
empty = pd.DataFrame()

for idx in unique_itemId:
    replace = ddf[ddf.ItemNumber == idx].compute()
    replace.Date = pd.to_datetime(replace['Date'])
    reduce = replace.groupby(["ItemNumber", "StateBottleCost", "StateBottleRetail"]).agg(['min', 'max']).reset_index()
    
    empty = empty.append(reduce)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.96 µs


In [155]:
len(empty.ItemNumber.unique())

54

In [157]:
trimmed = [x for x in unique_itemId if x not in empty.ItemNumber.unique()]

In [159]:
len(trimmed) + len(empty.ItemNumber.unique()) == len(unique_itemId)

True

In [163]:
for idx in trimmed[20:]:
    replace = ddf[ddf.ItemNumber == idx].compute()
    replace.Date = pd.to_datetime(replace['Date'])
    reduce = replace.groupby(["ItemNumber", "StateBottleCost", "StateBottleRetail"]).agg(['min', 'max']).reset_index()
    
    empty = empty.append(reduce)

KeyboardInterrupt: 

In [164]:
empty

Unnamed: 0_level_0,ItemNumber,StateBottleCost,StateBottleRetail,Date,Date
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,min,max
0,100017,22.04,33.06,2016-06-01,2016-08-23
1,100017,23.62,35.43,2016-09-08,2018-04-17
0,100040,5.23,34.97,2019-09-24,2019-09-24
1,100040,14.50,34.97,2019-09-25,2019-09-25
2,100040,15.46,34.97,2019-09-26,2019-09-26
...,...,...,...,...,...
1,433,9.97,14.96,2012-11-05,2012-12-31
0,43302,10.00,15.00,2016-08-01,2022-02-28
1,43302,11.00,16.50,2012-03-12,2016-07-29
0,43308,4.80,7.20,2015-11-03,2016-03-30


In [172]:
if "":
    print('yes')

In [168]:
empty.to_csv('./data/price_part_1.csv')

In [165]:
trimmed = [x for x in unique_itemId if x not in empty.ItemNumber.unique()]

In [169]:
len(empty.ItemNumber.unique())

1368

In [167]:
len(trimmed)

2563

Just going to run this one on the desktop and hope for the best. If I don't get back to this section I don't get back to this section.

In [183]:
ddf = data_prep(store_info)

In [184]:
sub_ddf = ddf.drop('Date', axis = 1)

In [185]:
sub_ddf = sub_ddf.drop_duplicates().compute()

In [187]:
sub_ddf.columns = sub_ddf.columns.str.replace(" ", "")

In [188]:
len(sub_ddf.StoreNumber.unique()) == sub_ddf.shape[0]

False

In [190]:
print(len(sub_ddf.StoreNumber.unique()))

2705


In [194]:
sub_ddf.StoreNumber.value_counts().sort_values(ascending = False)

4378    10
3682     8
3420     8
4478     7
2238     7
        ..
6104     1
6225     1
5874     1
5906     1
5629     1
Name: StoreNumber, Length: 2705, dtype: int64

In [196]:
sum(sub_ddf.StoreNumber.value_counts().sort_values(ascending = False) > 1)

1860

In [206]:
# brief look at the data shows there are a couple of stores... but only the Kum & Go on 650W Hickman seems like the one that exists
sub_ddf[sub_ddf.StoreNumber == 4378]

Unnamed: 0,StoreNumber,StoreName,Address,City,ZipCode,StoreLocation,CountyNumber
3307,4378,Kum & Go #202 / 4th St Waukee,85 4TH ST,WAUKEE,50263.0,POINT (-93.882532 41.613918),25.0
31582,4378,Kum & Go #1202 / Waukee,650 W Hickman Rd,Waukee,50263.0,POINT (-93.887586 41.614927),25.0
11597,4378,Kum & Go #202 / 4th St Waukee,,,,,
14442,4378,Kum & Go #1202 / Waukee,650 W Hickman Rd,Waukee,50263.0,,25.0
29506,4378,Kum & Go #1202 / Waukee,650 W Hickman Rd,Waukee,50263.0,POINT (-93.886466 41.61492100000001),25.0
5782,4378,Kum & Go #1202 / Waukee,650 W Hickman Rd,Waukee,50263.0,POINT (-93.886466 41.614921),25.0
11153,4378,KUM & GO #95 / DE SOTO,650 W Hickman Rd,Waukee,50263.0,POINT (-93.887586 41.614927),25.0
34758,4378,Kum & Go #202 / 4th St Waukee,85 4th St,Waukee,50263.0,POINT (-93.882532 41.613918),25.0
16548,4378,Kum & Go #1202,650 W Hickman Rd,Waukee,50263.0,POINT (-93.887586 41.614927),25.0
21960,4378,Kum & Go #202 / 4th St Waukee,650 W Hickman Rd,Waukee,50263.0,POINT (-93.887586 41.614927),25.0


In [207]:
sub_ddf.merge(ddf, how = "left", left_index = True, right_index = True)

TypeError: Can only merge Series or DataFrame objects, a <class 'dask.dataframe.core.DataFrame'> was passed

In [223]:
date_ddf = ddf[['Date']].merge(sub_ddf, how = "right", left_index = True, right_index = True).compute()

In [224]:
date_ddf = date_ddf.reset_index()

In [225]:
date_ddf.Date = pd.to_datetime(date_ddf.Date)

In [226]:
date_ddf

Unnamed: 0,index,Date,StoreNumber,StoreName,Address,City,ZipCode,StoreLocation,CountyNumber
0,554,2015-11-10,5028,Midtown Liquor,"1100, E 5th St",Storm Lake,50588,POINT (-95.186938 42.644563),11.0
1,554,2013-09-25,5028,Midtown Liquor,"1100, E 5th St",Storm Lake,50588,POINT (-95.186938 42.644563),11.0
2,554,2012-08-14,5028,Midtown Liquor,"1100, E 5th St",Storm Lake,50588,POINT (-95.186938 42.644563),11.0
3,554,2012-03-05,5028,Midtown Liquor,"1100, E 5th St",Storm Lake,50588,POINT (-95.186938 42.644563),11.0
4,554,2013-03-28,5028,Midtown Liquor,"1100, E 5th St",Storm Lake,50588,POINT (-95.186938 42.644563),11.0
...,...,...,...,...,...,...,...,...,...
3471757,40501,2013-02-26,5417,Casey's General Store # 2792/Cedar Rapids,9001 6th Street SW,Cedar Rapids,52404,POINT (-91.675979 41.88924),57.0
3471758,40501,2013-10-28,5417,Casey's General Store # 2792/Cedar Rapids,9001 6th Street SW,Cedar Rapids,52404,POINT (-91.675979 41.88924),57.0
3471759,40501,2012-02-29,5417,Casey's General Store # 2792/Cedar Rapids,9001 6th Street SW,Cedar Rapids,52404,POINT (-91.675979 41.88924),57.0
3471760,40501,2012-11-12,5417,Casey's General Store # 2792/Cedar Rapids,9001 6th Street SW,Cedar Rapids,52404,POINT (-91.675979 41.88924),57.0


In [228]:
date_ddf[['index', 'StoreNumber', 'Date', 'StoreName']]

Unnamed: 0,index,StoreNumber,Date,StoreName
0,554,5028,2015-11-10,Midtown Liquor
1,554,5028,2013-09-25,Midtown Liquor
2,554,5028,2012-08-14,Midtown Liquor
3,554,5028,2012-03-05,Midtown Liquor
4,554,5028,2013-03-28,Midtown Liquor
...,...,...,...,...
3471757,40501,5417,2013-02-26,Casey's General Store # 2792/Cedar Rapids
3471758,40501,5417,2013-10-28,Casey's General Store # 2792/Cedar Rapids
3471759,40501,5417,2012-02-29,Casey's General Store # 2792/Cedar Rapids
3471760,40501,5417,2012-11-12,Casey's General Store # 2792/Cedar Rapids


In [234]:
current_names = date_ddf.groupby(['StoreNumber', 'StoreName'])['Date'].agg(['max']).reset_index()

In [244]:
current_names[current_names.StoreNumber == 4378]

Unnamed: 0,StoreNumber,StoreName,max
886,4378,KUM & GO #95 / DE SOTO,2022-02-18
887,4378,Kum & Go #1202,2022-02-23
888,4378,Kum & Go #1202 / Waukee,2022-02-26
889,4378,Kum & Go #202 / 4th St Waukee,2022-02-25


In [251]:
sub_df = date_ddf[date_ddf.StoreNumber == 4378].dropna()

In [256]:
sub_df.sort_values('Date', ascending = False).iloc[0]

index                                    5782
Date                      2022-02-26 00:00:00
StoreNumber                              4378
StoreName             Kum & Go #1202 / Waukee
Address                      650 W Hickman Rd
City                                   Waukee
ZipCode                                 50263
StoreLocation    POINT (-93.886466 41.614921)
CountyNumber                             25.0
Name: 990713, dtype: object

In [261]:
date_ddf[date_ddf.StoreNumber == 4776]

Unnamed: 0,index,Date,StoreNumber,StoreName,Address,City,ZipCode,StoreLocation,CountyNumber
3396,8701,2013-07-15,4776,The Market on 30 / Carroll,420 WEST HWY 30,CARROLL,51401,,14.0
3397,8701,2014-03-24,4776,The Market on 30 / Carroll,420 WEST HWY 30,CARROLL,51401,,14.0
3398,8701,2013-07-25,4776,The Market on 30 / Carroll,420 WEST HWY 30,CARROLL,51401,,14.0
3399,8701,2015-06-22,4776,The Market on 30 / Carroll,420 WEST HWY 30,CARROLL,51401,,14.0
3400,8701,2013-07-10,4776,The Market on 30 / Carroll,420 WEST HWY 30,CARROLL,51401,,14.0
...,...,...,...,...,...,...,...,...,...
3957,8701,2012-04-18,4776,The Market on 30 / Carroll,420 WEST HWY 30,CARROLL,51401,,14.0
3958,8701,2015-10-28,4776,The Market on 30 / Carroll,420 WEST HWY 30,CARROLL,51401,,14.0
3959,8701,2012-10-22,4776,The Market on 30 / Carroll,420 WEST HWY 30,CARROLL,51401,,14.0
3960,8701,2013-03-19,4776,The Market on 30 / Carroll,420 WEST HWY 30,CARROLL,51401,,14.0


In [267]:
dump = pd.DataFrame()

for storeIdx in date_ddf.StoreNumber.unique():
    #print(storeIdx)
    sub_df = date_ddf[date_ddf.StoreNumber == storeIdx].dropna()
    
    if sub_df.shape[0] == 0:
        sub_df = date_ddf[date_ddf.StoreNumber == storeIdx].dropna(how='all')
    
    row = sub_df.sort_values('Date', ascending = False).iloc[0]
    
    dump = dump.append(row)

In [269]:
dump.to_csv('store_data.csv')

In [277]:
dump[~(dump['StoreLocation'].isna() & dump['Address'].isna())]

Unnamed: 0,index,Date,StoreNumber,StoreName,Address,City,ZipCode,StoreLocation,CountyNumber
2864239,31325.0,2022-02-27,5028.0,Midtown Liquor,"1100, E 5TH ST",STORM LAKE,50588,POINT (-95.186938 42.644563),11.0
2150651,9463.0,2022-02-28,4924.0,Abby Lea's,2757 Charles City Rd,Nashua,50658,POINT (-92.547524 42.959023),19.0
2430005,31764.0,2022-02-28,2844.0,CVS Pharmacy #8547 / Iowa City,2425 Muscatine Ave,Iowa City,52240,POINT (-91.50008800000002 41.649547),52.0
1857587,10068.0,2022-02-28,4320.0,Fareway Stores #470 / Perry,1315 Willis Ave,Perry,50220,POINT (-94.103123 41.838522),8.0
2501889,1764.0,2022-02-28,4267.0,Main Street Market Of Anita,735 Main St,Anita,50020,POINT (-94.765439 41.444849),15.0
...,...,...,...,...,...,...,...,...,...
3432484,6840.0,2022-02-28,9934.0,"Cats Eye Distillery, LLC",4860 Heatherstone Rd.,Bettendorf,52722,POINT (-90.455001 41.571059),82.0
3443802,29564.0,2022-02-22,5258.0,Oky Doky # 8 Foods,"535, Hill Street",Dubuque,52001,POINT (-90.673985 42.496794),31.0
3444936,175.0,2022-02-25,4038.0,No Frills Supermarkets / Denison,501 HIGHWAY 39 NORTH,DENISON,51442,POINT (-95.368652 42.02183),24.0
3454533,34445.0,2022-02-25,6233.0,Quick Stop / Cedar Rapids,1430 1st Ave NE,Cedar Rapids,52402,POINT (-91.652696 41.987875),57.0


In [275]:
dump[dump.StoreName == 'Central Grocery']

Unnamed: 0,index,Date,StoreNumber,StoreName,Address,City,ZipCode,StoreLocation,CountyNumber
31963,36118.0,2022-02-24,5320.0,Central Grocery,,,,,


In [278]:
dump = dump[~(dump['StoreLocation'].isna() & dump['Address'].isna())]

In [372]:
dump[dump.CountyNumber.isna()]

Unnamed: 0,index,Date,StoreNumber,StoreName,Address,City,ZipCode,StoreLocation,CountyNumber


In [377]:
dump.loc[:, 'StoreLocation'] = dump.StoreLocation.fillna("TBD")

## manual cleaning stuff

This section takes a while, you are just shuffling data around and checking via the internet if places exist or not.

In [371]:
dump[dump.StoreNumber == 9936]

Unnamed: 0,index,Date,StoreNumber,StoreName,Address,City,ZipCode,StoreLocation,CountyNumber


In [370]:
# Dash Events LLC is in Colorado Springs, not Iowa
dump = dump.drop(2704521)

In [364]:
# Liquor Tobacco & Grocery / Mason Cit
dump.loc[2673960,'CountyNumber'] = 17

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [367]:
dump[dump.City == 'Colorado Springs']

Unnamed: 0,index,Date,StoreNumber,StoreName,Address,City,ZipCode,StoreLocation,CountyNumber
2704521,15387.0,2022-02-22,9936,Dash Events LLC,1685 W Uintah St. #101,Colorado Springs,80904,POINT (-104.845334 38.848017),


In [366]:
dump[dump.ZipCode == '80904']

Unnamed: 0,index,Date,StoreNumber,StoreName,Address,City,ZipCode,StoreLocation,CountyNumber
2704521,15387.0,2022-02-22,9936,Dash Events LLC,1685 W Uintah St. #101,Colorado Springs,80904,POINT (-104.845334 38.848017),


In [355]:
# Station Mart Liquor & Tobacco
dump.loc[2725465, 'CountyNumber'] = 7
dump.loc[2725465, 'Address'] = '3594 Lafayette Road'
dump.loc[2725465, 'StoreLocation'] = 'POINT (-92.287994 42.47724500000001)'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [359]:
# MAD Ave Quik Shop
dump.loc[1845701, 'CountyNumber'] = 90
dump.loc[1845701, 'StoreLocation'] = 'POINT (-92.41041400000002 40.999034)'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [343]:
# Sauce
dump.loc[74979, 'CountyNumber'] = 52
dump.loc[74979, 'Address'] = '108 East College Street'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [314]:
# one that I forget
dump.loc[2928763, 'CountyNumber'] = 94

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [337]:
dump[dump.City == 'Brooklyn']

Unnamed: 0,index,Date,StoreNumber,StoreName,Address,City,ZipCode,StoreLocation,CountyNumber
229895,8308.0,2022-02-28,5713,Brooklyn Grocery Liquor LLC,122 East Front Street,Brooklyn,52211,POINT (-92.444377 41.728699),79.0
853729,10389.0,2022-02-28,5284,Brooklyn Grocery,"122, Front St.",Brooklyn,52211,POINT (-92.444377 41.728699),79.0
1481356,24122.0,2022-02-24,6061,Brooklyn Grocery,122 E Front St,Brooklyn,52211,POINT (-92.444377 41.728699),79.0


In [335]:
dump.loc[853729,'CountyNumber']= 79

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [336]:
# Brooklyn Grocery
dump.loc[853729,'StoreLocation'] = dump.loc[229895,:].StoreLocation
dump.loc[853729,'CountyNumber']= 79

In [378]:
dump.StoreNumber = dump.StoreNumber.astype(int)
dump.CountyNumber = dump.CountyNumber.astype(int)

In [379]:
dump.to_csv("./data/store_data.csv")

## Store Data Done! All data cleaning Complete!

Things we can still do to the data:
    
- add point locations for missing StoreLocations
- verify missingness