In [109]:
#Libraries and modules
import pandas as pd
import numpy as np
import dask.dataframe as dd
import csv
import altair as alt
import matplotlib.pyplot as plt

In [110]:
#Settings:
# Raise pandas' display limits (rows, columns, total line width) for printed frames.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 1000)
# Show the result of every top-level expression in a cell, not only the last one;
# later cells rely on this to display several bare expressions at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

I had a lot of issues loading this data set. I will build up incrementally.
First, let's lazy load the file and read in columns directly. We will do it for a file that is restricted to about 18MB.

In [112]:
# Lazily open the size-restricted (~18MB) split of the original CSV with Dask.
df = dd.read_csv("./data/originalcsv/splitfile.csv")

In [117]:
# Inspect the inferred schema and preview the first rows.
# All three expressions are displayed because ast_node_interactivity is set to "all".
df.dtypes
df.columns
df.head(3)

Invoice/Item Number       object
Date                      object
Store Number               int64
Store Name                object
Address                   object
City                      object
Zip Code                  object
Store Location            object
County Number            float64
County                    object
Category                 float64
Category Name             object
Vendor Number              int64
Vendor Name               object
Item Number                int64
Item Description          object
Pack                       int64
Bottle Volume (ml)         int64
State Bottle Cost         object
State Bottle Retail       object
Bottles Sold               int64
Sale (Dollars)            object
Volume Sold (Liters)     float64
Volume Sold (Gallons)    float64
dtype: object

Index(['Invoice/Item Number', 'Date', 'Store Number', 'Store Name', 'Address', 'City', 'Zip Code', 'Store Location', 'County Number', 'County', 'Category', 'Category Name', 'Vendor Number', 'Vendor Name', 'Item Number', 'Item Description', 'Pack', 'Bottle Volume (ml)', 'State Bottle Cost', 'State Bottle Retail', 'Bottles Sold', 'Sale (Dollars)', 'Volume Sold (Liters)', 'Volume Sold (Gallons)'], dtype='object')

Unnamed: 0,Invoice/Item Number,Date,Store Number,Store Name,Address,City,Zip Code,Store Location,County Number,County,Category,Category Name,Vendor Number,Vendor Name,Item Number,Item Description,Pack,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons)
0,S29198800001,11/20/2015,2191,Keokuk Spirits,1013 MAIN,KEOKUK,52632,"1013 MAIN\nKEOKUK 52632\n(40.39978, -91.387531)",56.0,Lee,,,255,Wilson Daniels Ltd.,297,Templeton Rye w/Flask,6,750,$18.09,$27.14,6,$162.84,4.5,1.19
1,S29195400002,11/21/2015,2205,Ding's Honk And Holler,900 E WASHINGTON,CLARINDA,51632,"900 E WASHINGTON\nCLARINDA 51632\n(40.739238, ...",73.0,Page,,,255,Wilson Daniels Ltd.,297,Templeton Rye w/Flask,6,750,$18.09,$27.14,12,$325.68,9.0,2.38
2,S29050300001,11/16/2015,3549,Quicker Liquor Store,1414 48TH ST,FORT MADISON,52627,"1414 48TH ST\nFORT MADISON 52627\n(40.624226, ...",56.0,Lee,,,130,Disaronno International LLC,249,Disaronno Amaretto Cavalli Mignon 3-50ml Pack,20,150,$6.40,$9.60,2,$19.20,0.3,0.08


The following issues exist with the Data:

- Column Names need to be standardized (take out slashes).
- Store Location contains newlines, and has a nested GPS coordinate.
- The GPS coordinate could be split out and put into its own column.
- Some columns have NAs, OK.
- The cost fields have a Dollar Sign, so these are probably typed as strings/objects instead of floats.

Can pandas even read our dataset? Let's try it on the 10% reduced version.

In [121]:
# Eagerly load the reduced file with plain pandas (default parser options) and preview it.
dfPand = pd.read_csv("./data/originalcsv/Iowa_Liquor_Sales_reduced.csv") #,na_filter=True,low_memory=False)
dfPand.head(3)

Unnamed: 0,Date,Store Number,City,Zip Code,County Number,County,Category,Category Name,Vendor Number,Item Number,Item Description,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons)
0,03/31/2016,5029,DAVENPORT,52806,82.0,Scott,1022100.0,TEQUILA,370,87152,Avion Silver,375,$9.99,$14.99,12,$179.88,4.5,1.19
1,03/31/2016,5029,DAVENPORT,52806,82.0,Scott,1022100.0,TEQUILA,395,89197,Jose Cuervo Especial Reposado Tequila,1000,$12.50,$18.75,2,$37.50,2.0,0.53
2,03/31/2016,4959,CEDAR FALLS,50613,7.0,Black Hawk,1071100.0,AMERICAN COCKTAILS,380,63959,Uv Blue Raspberry Lemonade Pet,1750,$5.97,$8.96,6,$53.76,10.5,2.77


Now let's try Dask. It should wrap pandas (in theory), and manage chunks around it.

Here we go (first with a set we know is clean, the reduced set). There was a mismatch of data types (probably between pandas and dask), so you have to specify them. Dask tells you what to write.

In [138]:
# Lazily read the reduced file with Dask, pinning the dtypes of 'Category' and
# 'Zip Code' explicitly instead of relying on type inference.
df = dd.read_csv("./data/originalcsv/Iowa_Liquor_Sales_reduced.csv",na_filter=True,dtype={'Category': 'float64',
       'Zip Code': 'object'})


In [139]:
# Preview the last rows and list the columns Dask parsed from the reduced file.
df.tail(3)
df.columns

Unnamed: 0,Date,Store Number,City,Zip Code,County Number,County,Category,Category Name,Vendor Number,Item Number,Item Description,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons)
411427,01/05/2015,3631,AUDUBON,50025,5.0,Audubon,1081330.0,PEACH SCHNAPPS,65,82847,Dekuyper Peachtree,1000,$7.62,$11.43,2,$22.86,2.0,0.53
411428,01/05/2015,2517,NEWTON,50208,50.0,Jasper,1071100.0,AMERICAN COCKTAILS,395,58868,Jose Cuervo Authentic Strawberry Margarita,1750,$8.20,$12.30,6,$73.80,10.5,2.77
411429,01/05/2015,2643,WATERLOO,50701,7.0,Black Hawk,1012200.0,SCOTCH WHISKIES,260,5329,Johnnie Walker Blue,750,$130.00,$195.00,1,$195.00,0.75,0.2


Index(['Date', 'Store Number', 'City', 'Zip Code', 'County Number', 'County', 'Category', 'Category Name', 'Vendor Number', 'Item Number', 'Item Description', 'Bottle Volume (ml)', 'State Bottle Cost', 'State Bottle Retail', 'Bottles Sold', 'Sale (Dollars)', 'Volume Sold (Liters)', 'Volume Sold (Gallons)'], dtype='object')

First, let's check: are some of the columns missing or not? **They are!** There is no Invoice/Item Number column, for example. Six columns are missing. They are:
- Invoice/Item Number
- Store Name
- Store Location
- Address
- Vendor Name
- Pack

These columns are interesting because most of them are difficult string columns. They contain single quotes, newlines and, in one case, even GPS coordinates. It is strange that Pack is missing as well, since it is a plain integer column.

What happens when we specify the dtypes of the columns explicitly? They still don't load.




In [1]:
# Explicit dtype for each of the 24 columns of the original schema, keyed by CSV header.
typeDict = {
    "Invoice/Item Number": "object",
    "Date": "object",
    "Store Number": "int64",
    "Store Name": "object",
    "Address": "object",
    "City": "object",
    "Zip Code": "object",
    "Store Location": "object",
    "County Number": "float64",
    "County": "object",
    "Category": "float64",
    "Category Name": "object",
    "Vendor Number": "int64",
    "Vendor Name": "object",
    "Item Number": "int64",
    "Item Description": "object",
    "Pack": "float64",
    "Bottle Volume (ml)": "int64",
    "State Bottle Cost": "object",
    "State Bottle Retail": "object",
    "Bottles Sold": "int64",
    "Sale (Dollars)": "object",
    "Volume Sold (Liters)": "float64",
    "Volume Sold (Gallons)": "float64",
}

# Lazily read the reduced CSV with the explicit dtypes, then list the parsed columns.
df = dd.read_csv(
    "./data/originalcsv/Iowa_Liquor_Sales_reduced.csv",
    dtype=typeDict,
)  # na_filter=True
df.columns

NameError: name 'dd' is not defined

The real test: the full dataset. Dask threw a type mismatch error again; the typeDict corrects the problem. Columns with problems were upcast, essentially. The `engine="python"` option works around a C parser EOF error. Somewhere in the file there is a parsing error (unknown cause), so I skip the offending lines with `error_bad_lines=False`.

We can see that Dask is lazy, because it returns quickly. In order to get Dask to *do anything*, you have to use the compute() method!

In [135]:
#The real test: Full data set.
typeDict = {'Category': 'float64',
       'Zip Code': 'object',
       'Bottle Volume (ml)': 'float64',
       'Bottles Sold': 'float64',
       'Item Number': 'float64',
       'Pack': 'float64',
       'Store Number': 'float64',
       'Vendor Number': 'object'}

df = dd.read_csv("./data/originalcsv/Iowa_Liquor_Sales.csv", #dask args
                 engine="python", dtype=typeDict,
                 error_bad_lines=False) #pandas args na_filter=True

#Dask again, does not read the whole file!
#df.tail(10)
#https://stackoverflow.com/questions/49274161/read-tail-by-partition-from-csv-file-with-dask-dataframe
#you have to specify the partition, thats why!

df['Pack'].describe().compute()

Skipping line 232331: unexpected end of data
Skipping line 232318: unexpected end of data
Skipping line 232363: unexpected end of data
Skipping line 232346: unexpected end of data
Skipping line 232347: unexpected end of data
Skipping line 232424: unexpected end of data
Skipping line 2: ',' expected after '"'
Skipping line 232285: unexpected end of data
Skipping line 232419: unexpected end of data
Skipping line 232329: unexpected end of data
Skipping line 232335: unexpected end of data
Skipping line 232320: unexpected end of data
Skipping line 232355: unexpected end of data
Skipping line 232384: unexpected end of data
Skipping line 232354: unexpected end of data
Skipping line 232389: unexpected end of data
Skipping line 231812: unexpected end of data
Skipping line 232775: unexpected end of data
Skipping line 232342: unexpected end of data
Skipping line 232401: unexpected end of data
Skipping line 232175: unexpected end of data
Skipping line 233342: unexpected end of data
Skipping line 2

Traceback (most recent call last):
  File "/home/user/Documents/Workspace/CodeProjects/Python3/DSDEenv/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-136-dc97b872e0d9>", line 20, in <module>
    df['Pack'].describe().compute()
  File "/home/user/Documents/Workspace/CodeProjects/Python3/DSDEenv/lib/python3.7/site-packages/dask/base.py", line 175, in compute
    (result,) = compute(self, traverse=False, **kwargs)
  File "/home/user/Documents/Workspace/CodeProjects/Python3/DSDEenv/lib/python3.7/site-packages/dask/base.py", line 446, in compute
    results = schedule(dsk, keys, **kwargs)
  File "/home/user/Documents/Workspace/CodeProjects/Python3/DSDEenv/lib/python3.7/site-packages/dask/threaded.py", line 82, in get
    **kwargs
  File "/home/user/Documents/Workspace/CodeProjects/Python3/DSDEenv/lib/python3.7/site-packages/dask/local.py", line 480, in get_async
    key, res_i

  File "/home/user/Documents/Workspace/CodeProjects/Python3/DSDEenv/lib/python3.7/site-packages/IPython/core/ultratb.py", line 313, in wrapped
    return f(*args, **kwargs)
  File "/home/user/Documents/Workspace/CodeProjects/Python3/DSDEenv/lib/python3.7/site-packages/IPython/core/ultratb.py", line 347, in _fixed_getinnerframes
    records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
  File "/usr/local/lib/python3.7/inspect.py", line 1502, in getinnerframes
    frameinfo = (tb.tb_frame,) + getframeinfo(tb, context)
  File "/usr/local/lib/python3.7/inspect.py", line 1460, in getframeinfo
    filename = getsourcefile(frame) or getfile(frame)
  File "/usr/local/lib/python3.7/inspect.py", line 693, in getsourcefile
    if os.path.exists(filename):
  File "/usr/local/lib/python3.7/genericpath.py", line 19, in exists
    os.stat(path)
KeyboardInterrupt


TypeError: can only concatenate str (not "list") to str

OK, so Dask does its delayed computations. Let's first check the data with describe():

In [133]:
# Summary statistics for 'Store Number' (forced with .compute()), followed by
# the number of Dask partitions and the frame's dimensionality.
df['Store Number'].describe().compute()
df.npartitions
df.ndim

count    2.709552e+06
mean     3.589071e+03
std      9.491083e+02
min      2.106000e+03
25%      2.606000e+03
50%      3.749000e+03
75%      4.480000e+03
max      9.023000e+03
Name: Store Number, dtype: float64

6

2

So there are quite a few issues with the output, above. 

- Our csv file has about 37768482/3 = 12,589,494 rows of data (each row takes up 3 lines). Yet we only have ~2,700,000 entries in our count. 2.7M * 6 is also > 13M, so it's not that we are reporting one partition (unless they are the same size).
- Columns are missing. It shows 18, but there are 24 of them altogether! Where are the other 6?



Index(['Date', 'Store Number', 'City', 'Zip Code', 'County Number', 'County', 'Category', 'Category Name', 'Vendor Number', 'Item Number', 'Item Description', 'Bottle Volume (ml)', 'State Bottle Cost', 'State Bottle Retail', 'Bottles Sold', 'Sale (Dollars)', 'Volume Sold (Liters)', 'Volume Sold (Gallons)'], dtype='object')

### References:

1) Dealing with C Tokenizer EOF Error: https://www.shanelynn.ie/pandas-csv-error-error-tokenizing-data-c-error-eof-inside-string-starting-at-line/
https://stackoverflow.com/questions/18016037/pandas-parsererror-eof-character-when-reading-multiple-csv-files-to-hdf5/53173373#53173373
