In [71]:
#Imports:
import pandas as pd
import numpy as np
import altair as alt
import matplotlib.pyplot as plt
import time
import dask.dataframe as dd
import os


In [3]:
#Lets make our console outputs more nice, by applying some settings.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
alt.renderers.enable('notebook')
alt.data_transformers.enable('default', max_rows=None)
%matplotlib inline 
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 1000)

Our Dataset is about 3.6GB on disk, and "wc -l datafile.csv" tells us that we have 37 768 482 rows. Each row in the file takes up about 3 lines, so we have about 12589494 potential rows to access, in the dataset.

In [64]:
#our type dictionary
typeDict = {"Invoice/Item Number":"object"
,"Date":"object"
,"Store Number":"int64"
,"Store Name":"object"
,"Address":"object"
,"City":"object"
,"Zip Code":"object"
,"Store Location":"object"
,"County Number":"float64"
,"County":"object"
,"Category":"float64"
,"Category Name":"object"
,"Vendor Number":"int64"
,"Vendor Name":"object"
,"Item Number":"int64"
,"Item Description":"object"
,"Pack":"int64"
,"Bottle Volume (ml)":"float64"
,"State Bottle Cost":"object"
,"State Bottle Retail":"object"
,"Bottles Sold":"int64"
,"Sale (Dollars)":"object"
,"Volume Sold (Liters)":"float64"
,"Volume Sold (Gallons)":"float64"}


#lets just read it in and see what happens.
dfILS = dd.read_csv("./data/originalcsv/Iowa_Liquor_Sales.csv", engine="python", dtype=typeDict,
                 error_bad_lines=False,encoding="utf-8") #pandas args na_filter=True

dfILS.head(10)

Unnamed: 0,Invoice/Item Number,Date,Store Number,Store Name,Address,City,Zip Code,Store Location,County Number,County,Category,Category Name,Vendor Number,Vendor Name,Item Number,Item Description,Pack,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons)
0,S29198800001,11/20/2015,2191,Keokuk Spirits,1013 MAIN,KEOKUK,52632,"1013 MAIN\nKEOKUK 52632\n(40.39978, -91.387531)",56.0,Lee,,,255,Wilson Daniels Ltd.,297,Templeton Rye w/Flask,6,750.0,$18.09,$27.14,6,$162.84,4.5,1.19
1,S29195400002,11/21/2015,2205,Ding's Honk And Holler,900 E WASHINGTON,CLARINDA,51632,"900 E WASHINGTON\nCLARINDA 51632\n(40.739238, ...",73.0,Page,,,255,Wilson Daniels Ltd.,297,Templeton Rye w/Flask,6,750.0,$18.09,$27.14,12,$325.68,9.0,2.38
2,S29050300001,11/16/2015,3549,Quicker Liquor Store,1414 48TH ST,FORT MADISON,52627,"1414 48TH ST\nFORT MADISON 52627\n(40.624226, ...",56.0,Lee,,,130,Disaronno International LLC,249,Disaronno Amaretto Cavalli Mignon 3-50ml Pack,20,150.0,$6.40,$9.60,2,$19.20,0.3,0.08
3,S28867700001,11/04/2015,2513,Hy-Vee Food Store #2 / Iowa City,812 S 1ST AVE,IOWA CITY,52240,812 S 1ST AVE\nIOWA CITY 52240\n,52.0,Johnson,,,65,Jim Beam Brands,237,Knob Creek w/ Crystal Decanter,3,1750.0,$35.55,$53.34,3,$160.02,5.25,1.39
4,S29050800001,11/17/2015,3942,Twin Town Liquor,104 HIGHWAY 30 WEST,TOLEDO,52342,"104 HIGHWAY 30 WEST\nTOLEDO 52342\n(41.985887,...",86.0,Tama,,,130,Disaronno International LLC,249,Disaronno Amaretto Cavalli Mignon 3-50ml Pack,20,150.0,$6.40,$9.60,2,$19.20,0.3,0.08
5,S28869200001,11/11/2015,3650,"Spirits, Stogies and Stuff",118 South Main St.,HOLSTEIN,51025,118 South Main St.\nHOLSTEIN 51025\n(42.490073...,47.0,Ida,,,65,Jim Beam Brands,237,Knob Creek w/ Crystal Decanter,3,1750.0,$35.55,$53.34,1,$53.34,1.75,0.46
6,S28865700001,11/09/2015,2538,Hy-Vee Food Store #3 / Waterloo,1422 FLAMMANG DR,WATERLOO,50702,"1422 FLAMMANG DR\nWATERLOO 50702\n(42.459938, ...",7.0,Black Hawk,1701100.0,DECANTERS & SPECIALTY PACKAGES,962,Duggan's Distillers Products Corp,238,Forbidden Secret Coffee Pack,6,1500.0,$11.62,$17.43,6,$104.58,9.0,2.38
7,S28869500001,11/10/2015,3942,Twin Town Liquor,104 HIGHWAY 30 WEST,TOLEDO,52342,"104 HIGHWAY 30 WEST\nTOLEDO 52342\n(41.985887,...",86.0,Tama,,,65,Jim Beam Brands,237,Knob Creek w/ Crystal Decanter,3,1750.0,$35.55,$53.34,2,$106.68,3.5,0.92
8,S29339300091,11/30/2015,2662,Hy-Vee Wine & Spirits / Muscatine,"522 MULBERRY, SUITE A",MUSCATINE,52761,"522 MULBERRY, SUITE A\nMUSCATINE 52761\n",70.0,Muscatine,1701100.0,DECANTERS & SPECIALTY PACKAGES,65,Jim Beam Brands,173,Laphroaig w/ Whiskey Stones,12,750.0,$19.58,$29.37,4,$117.48,3.0,0.79
9,S29050900001,11/16/2015,4307,Crossroads Wine and Liquor,117 IOWA AVE,DUNLAP,712-2,"117 IOWA AVE\nDUNLAP 712-2\n(41.854728, -95.60...",43.0,Harrison,,,130,Disaronno International LLC,249,Disaronno Amaretto Cavalli Mignon 3-50ml Pack,20,150.0,$6.40,$9.60,2,$19.20,0.3,0.08


In [65]:
dfILS["Store Number"].describe().compute()

Skipping line 232318: unexpected end of data
Skipping line 232331: unexpected end of data


ValueError: Unable to convert column Store Number to type int64

In [9]:
#function support:

#From: https://stackoverflow.com/questions/2081836/reading-specific-lines-only. Nice solution!
def yieldlines(thefile, whatlines):
  return (x for i, x in enumerate(thefile) if i in whatlines)

We see NaNs (that is fine). Lets try summarizing one column of the dataset (across all partitions).

Running: dfILS["Store Number"].describe().compute(), we get:

**"Skipping line 232331: unexpected end of data...Skipping line 232318"**

So around row 70k we have our first set of issues. I manually go through the file to see what is up.


In [54]:
ourFP = open("./data/originalcsv/Iowa_Liquor_Sales.csv")
linegen = yieldlines(ourFP,list(range(232210,232250)))
#linegen = yieldlines(["a","b","c","d","e","f","g","h","i"],[2,5,6])

In [55]:
for line in linegen:
    print(line)


S12113900061,05/08/2013,2626,Hy-Vee Drugstore / University / DSM,4100 UNIVERSITY AVE,DES MOINES,50311,"4100 UNIVERSITY AVE

DES MOINES 50311

(41.600361, -93.673223)",77,Polk,1011200,STRAIGHT BOURBON WHISKIES,65,Jim Beam Brands,19068,Jim Beam,6,1750,$19.42,$29.14,6,$174.84,10.50,2.77

S27023400005,07/29/2015,3825,Shop N Save #2 / E 14th,1372 E 14TH ST,DES MOINES,50316,"1372 E 14TH ST

DES MOINES 50316

(41.604893, -93.600499)",77,Polk,1031080,VODKA 80 PROOF,300,Mccormick Distilling Company,36903,Mccormick Vodka,48,200,$1.13,$1.70,48,$81.60,9.60,2.54

S18556100019,04/23/2014,4912,Casey's General Store #2531 / Eldrid,"840, E LE CLAIRE RD",ELDRIDGE,52748,"840, E LE CLAIRE RD

ELDRIDGE 52748

(41.654692, -90.572547)",82,Scott,1031080,VODKA 80 PROOF,260,Diageo Americas,37994,Smirnoff Vodka 80 Prf,24,375,$4.75,$7.13,12,$85.56,4.50,1.19

S27807300002,09/09/2015,4463,Casey's General Store #3031 / Garner,145 US HWY 18 W,GARNER,50438,"145 US HWY 18 W

GARNER 50438

(43.105833, -93.603007)",41,Ha

Check: Are there premature EOFs before the true end of the file?

In [57]:
#So our EOF error is not correct. Loop would have terminated before it hit 37M lines.
def eoflinecheck():
    ourFP = open("./data/originalcsv/Iowa_Liquor_Sales.csv")
    i = 0
    for line in ourFP:
        i += 1
    ourFP.close()
    return i

eoflinecheck()

37768482

Hypothesis: our line parser goes out of wack due to control characters, or some bad formatting of lines. Each new record starts with an invoice number that starts with the letter S. Is every record on three lines? Lets find out

In [126]:
#For all of our lines, they are constrained within three lines. OK.
def threelinecheck(path):
    ourFP = open(path)
    result=True
    
    
    for i,line in enumerate(ourFP):
        if ((i > 0) and (i+1 % 3 == 0) and (line[0] != "S")):
            result=i
            break
    ourFP.close()
    return result

threelinecheck("./data/originalcsv/Iowa_Liquor_Sales.csv")


True

So, we have to manually clip out bad lines from the file. Are the lines that throw errors the actual lines causing the problem? Or is the parser out of phase, and drifting down the file until it finally crashes? I actually dont know! I looked at the offending lines and I didn't see any bad formatting. I am going to make a quarentine radius about the lines, and just cut out entires. We will lose more data this way, but looking for dirty control characters is an even worse way to spend my day.

In [123]:
#Lets design our clipping funciton.

#Signature: Path[String], Line Number[Integer], UpperRowRadius[Integer] --> Tuple(LB,UB)
#Purpose, for a given offending LINE, define a number of ROWS above it to clip, and then append the 
#rest of the file. This function will only perform the clip operation for ONE offending row.
#Use dask to iteratively go through the file, and find all of the anomolies. Dask will eventually crash on line numbers, 
#so lets just dump them.

def clipappend(pathTo,fileName,lineNum,upRowRad):
    #if (not os.path.exists(pathto + "/temphold.csv")):
    ourFP = open(pathTo + "/" + fileName)
    tempFP = open(pathTo + "/temp.csv","w+") #will just create the file for us

    #endpoint set: 3,6,9,12 ... 3k
    #startpoint set: 1,4,7,10... 3k+1 I choose to calculate from ENDPOINT basis.
    
    shiftUp = 0
    #is our line Number an endLine for a given row?
    modShift = lineNum % 3 

    # If we are at first or middle position of a row.
    if (modShift == 1): #at start position
        shiftUp = 2 #to endpoint
    elif (modShift == 2): #middle position of a row
        shiftUp = 1 #to endpoint
    lowerBound = lineNum+shiftUp-((upRowRad+1)*3) #also remove the row itself from endpoint - thats why +1
    upperBound = lineNum+shiftUp 
    excludeRange = list(range(lowerBound+1,upperBound+1))
    #at this point, we have the cutoffs in the endpoint basis.
    for i,line in enumerate(ourFP):
        if (i in excludeRange):
            pass
        else:
            tempFP.write(line)
    ourFP.close()
    tempFP.close()
    
    os.remove(pathTo + "/" + fileName)
    os.rename(pathTo + "/temp.csv",pathTo + "/" + fileName)
    
    return (lowerBound,upperBound)

hold = clipappend("./data/originalcsv","splitfile.csv", 15, 1)


Now we have to reform datafile, using dask to tell us where the erroneous lines are! This takes a number of iterations by hand. For a set of numbers that are thrown (before crash) we calculate a range between the extremes, and extend it more to be safe.

Our start file is called "A.csv", our temporary file is called "temp.csv". We run dask on A until it crashes on a range of rows. We then call clipappend() on a, and give it the row number and epsilon rollback. When the function returns, a should be replaced by b, and b should be deleted. We keep moving forward until the entire file is generated.

In [124]:
#First test it out on the splitfile: "head -n 30001 ./Iowa_Liquor_Sales.csv > splitfile.csv"
"
clipappend("./data/originalcsv","splitfile.csv", 20000, 2000)

(13998, 20001)

In [127]:
#Now for a real test: do we have a properly formatted file? Does
#every third line have an invoice number? or did it get garbled?
threelinecheck("./data/originalcsv/splitfile.csv")

True

In [128]:
#Final Test: Lets make a splitfile: "head -n 300 ./Iowa_Liquor_Sales.csv > splitfile.csv"
#and manually check in a text editor that the rows are formatted correctly.
clipappend("./data/originalcsv","splitfile.csv", 220, 10)

(189, 222)

In [129]:
#So the file looked OK in a text editor. What about reading with pandas?
testDF = pd.read_csv("./data/originalcsv/splitfile.csv", engine="python", dtype=typeDict,
                 encoding="utf-8")


In [130]:
testDF #Looks good!

Unnamed: 0,Invoice/Item Number,Date,Store Number,Store Name,Address,City,Zip Code,Store Location,County Number,County,Category,Category Name,Vendor Number,Vendor Name,Item Number,Item Description,Pack,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons)
0,S29198800001,11/20/2015,2191,Keokuk Spirits,1013 MAIN,KEOKUK,52632,"1013 MAIN\nKEOKUK 52632\n(40.39978, -91.387531)",56.0,Lee,,,255,Wilson Daniels Ltd.,297,Templeton Rye w/Flask,6,750.0,$18.09,$27.14,6,$162.84,4.5,1.19
1,S29195400002,11/21/2015,2205,Ding's Honk And Holler,900 E WASHINGTON,CLARINDA,51632,"900 E WASHINGTON\nCLARINDA 51632\n(40.739238, ...",73.0,Page,,,255,Wilson Daniels Ltd.,297,Templeton Rye w/Flask,6,750.0,$18.09,$27.14,12,$325.68,9.0,2.38
2,S29050300001,11/16/2015,3549,Quicker Liquor Store,1414 48TH ST,FORT MADISON,52627,"1414 48TH ST\nFORT MADISON 52627\n(40.624226, ...",56.0,Lee,,,130,Disaronno International LLC,249,Disaronno Amaretto Cavalli Mignon 3-50ml Pack,20,150.0,$6.40,$9.60,2,$19.20,0.3,0.08
3,S28867700001,11/04/2015,2513,Hy-Vee Food Store #2 / Iowa City,812 S 1ST AVE,IOWA CITY,52240,812 S 1ST AVE\nIOWA CITY 52240\n,52.0,Johnson,,,65,Jim Beam Brands,237,Knob Creek w/ Crystal Decanter,3,1750.0,$35.55,$53.34,3,$160.02,5.25,1.39
4,S29050800001,11/17/2015,3942,Twin Town Liquor,104 HIGHWAY 30 WEST,TOLEDO,52342,"104 HIGHWAY 30 WEST\nTOLEDO 52342\n(41.985887,...",86.0,Tama,,,130,Disaronno International LLC,249,Disaronno Amaretto Cavalli Mignon 3-50ml Pack,20,150.0,$6.40,$9.60,2,$19.20,0.3,0.08
5,S28869200001,11/11/2015,3650,"Spirits, Stogies and Stuff",118 South Main St.,HOLSTEIN,51025,118 South Main St.\nHOLSTEIN 51025\n(42.490073...,47.0,Ida,,,65,Jim Beam Brands,237,Knob Creek w/ Crystal Decanter,3,1750.0,$35.55,$53.34,1,$53.34,1.75,0.46
6,S28865700001,11/09/2015,2538,Hy-Vee Food Store #3 / Waterloo,1422 FLAMMANG DR,WATERLOO,50702,"1422 FLAMMANG DR\nWATERLOO 50702\n(42.459938, ...",7.0,Black Hawk,1701100.0,DECANTERS & SPECIALTY PACKAGES,962,Duggan's Distillers Products Corp,238,Forbidden Secret Coffee Pack,6,1500.0,$11.62,$17.43,6,$104.58,9.0,2.38
7,S28869500001,11/10/2015,3942,Twin Town Liquor,104 HIGHWAY 30 WEST,TOLEDO,52342,"104 HIGHWAY 30 WEST\nTOLEDO 52342\n(41.985887,...",86.0,Tama,,,65,Jim Beam Brands,237,Knob Creek w/ Crystal Decanter,3,1750.0,$35.55,$53.34,2,$106.68,3.5,0.92
8,S29339300091,11/30/2015,2662,Hy-Vee Wine & Spirits / Muscatine,"522 MULBERRY, SUITE A",MUSCATINE,52761,"522 MULBERRY, SUITE A\nMUSCATINE 52761\n",70.0,Muscatine,1701100.0,DECANTERS & SPECIALTY PACKAGES,65,Jim Beam Brands,173,Laphroaig w/ Whiskey Stones,12,750.0,$19.58,$29.37,4,$117.48,3.0,0.79
9,S29050900001,11/16/2015,4307,Crossroads Wine and Liquor,117 IOWA AVE,DUNLAP,712-2,"117 IOWA AVE\nDUNLAP 712-2\n(41.854728, -95.60...",43.0,Harrison,,,130,Disaronno International LLC,249,Disaronno Amaretto Cavalli Mignon 3-50ml Pack,20,150.0,$6.40,$9.60,2,$19.20,0.3,0.08


In [137]:
#Its time to setup our forward feeding file cleanup method...

#Signature: LineTuple[Tuple], TupleDict[List], {Input to clipappend} -> NoneType
#Purpose: pass in the last line number, and a range of lines to cut out. Also pass in a list to modify.
#Call clip append. The output of clipappend is appended to the list, and mutation changes are maintained after the 
#function ends.
def forwardclean(startLine, cutRows, tupleList, pathTo, fileName):
    tupleList.append(clipappend(pathTo,fileName, startLine, cutRows))
    return 

tupleList = []


In [138]:
#Quick Test of the method:
forwardclean(100,10,tupleList,"./data/originalcsv","splitfile.csv")
tupleList

[(69, 102)]

In [139]:
forwardclean(100,10,tupleList,"./data/originalcsv","splitfile.csv")
tupleList

[(69, 102), (69, 102)]

In [140]:
forwardclean(100,10,tupleList,"./data/originalcsv","splitfile.csv")
tupleList

[(69, 102), (69, 102), (69, 102)]