In [1]:
#Imports:
import re
import time

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [23]:
# Notebook-wide display settings: make cell output richer and tables wider.
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Render Altair charts with the classic-notebook renderer.
alt.renderers.enable('notebook')
# max_rows=None lifts Altair's row limit so larger frames can be charted.
alt.data_transformers.enable('default', max_rows=None)
# Draw matplotlib figures inline, below the cell that creates them.
%matplotlib inline 
# Widen pandas' console output: up to 100 rows, 40 columns, 1000 characters per line.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 1000)



RendererRegistry.enable('notebook')

DataTransformerRegistry.enable('default')

First, let's load the data in. I avoid the reduced dataset because it is missing 6 of the 24 original columns. This dataset is a list of liquor sales transactions from all over Iowa. What analytics can we produce?

In [31]:
# Explicit dtype for every CSV column. The money columns are kept as strings
# because the raw values carry a "$" prefix; "Date" is kept as a string too.
typeDict = {
    "Invoice/Item Number": "object",
    "Date": "object",
    "Store Number": "int64",
    "Store Name": "object",
    "Address": "object",
    "City": "object",
    "Zip Code": "object",
    "Store Location": "object",
    "County Number": "float64",
    "County": "object",
    "Category": "float64",
    "Category Name": "object",
    "Vendor Number": "int64",
    "Vendor Name": "object",
    "Item Number": "int64",
    "Item Description": "object",
    "Pack": "int64",
    "Bottle Volume (ml)": "float64",
    "State Bottle Cost": "object",
    "State Bottle Retail": "object",
    "Bottles Sold": "int64",
    "Sale (Dollars)": "object",
    "Volume Sold (Liters)": "float64",
    "Volume Sold (Gallons)": "float64",
}

# Load the raw transactions and preview the first ten rows.
df = pd.read_csv("./data/originalcsv/splitfile.csv", dtype=typeDict)
df.head(10)

Unnamed: 0,Invoice/Item Number,Date,Store Number,Store Name,Address,City,Zip Code,Store Location,County Number,County,Category,Category Name,Vendor Number,Vendor Name,Item Number,Item Description,Pack,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons)
0,S29198800001,11/20/2015,2191,Keokuk Spirits,1013 MAIN,KEOKUK,52632,"1013 MAIN\nKEOKUK 52632\n(40.39978, -91.387531)",56.0,Lee,,,255,Wilson Daniels Ltd.,297,Templeton Rye w/Flask,6,750.0,$18.09,$27.14,6,$162.84,4.5,1.19
1,S29195400002,11/21/2015,2205,Ding's Honk And Holler,900 E WASHINGTON,CLARINDA,51632,"900 E WASHINGTON\nCLARINDA 51632\n(40.739238, ...",73.0,Page,,,255,Wilson Daniels Ltd.,297,Templeton Rye w/Flask,6,750.0,$18.09,$27.14,12,$325.68,9.0,2.38
2,S29050300001,11/16/2015,3549,Quicker Liquor Store,1414 48TH ST,FORT MADISON,52627,"1414 48TH ST\nFORT MADISON 52627\n(40.624226, ...",56.0,Lee,,,130,Disaronno International LLC,249,Disaronno Amaretto Cavalli Mignon 3-50ml Pack,20,150.0,$6.40,$9.60,2,$19.20,0.3,0.08
3,S28867700001,11/04/2015,2513,Hy-Vee Food Store #2 / Iowa City,812 S 1ST AVE,IOWA CITY,52240,812 S 1ST AVE\nIOWA CITY 52240\n,52.0,Johnson,,,65,Jim Beam Brands,237,Knob Creek w/ Crystal Decanter,3,1750.0,$35.55,$53.34,3,$160.02,5.25,1.39
4,S29050800001,11/17/2015,3942,Twin Town Liquor,104 HIGHWAY 30 WEST,TOLEDO,52342,"104 HIGHWAY 30 WEST\nTOLEDO 52342\n(41.985887,...",86.0,Tama,,,130,Disaronno International LLC,249,Disaronno Amaretto Cavalli Mignon 3-50ml Pack,20,150.0,$6.40,$9.60,2,$19.20,0.3,0.08
5,S28869200001,11/11/2015,3650,"Spirits, Stogies and Stuff",118 South Main St.,HOLSTEIN,51025,118 South Main St.\nHOLSTEIN 51025\n(42.490073...,47.0,Ida,,,65,Jim Beam Brands,237,Knob Creek w/ Crystal Decanter,3,1750.0,$35.55,$53.34,1,$53.34,1.75,0.46
6,S28865700001,11/09/2015,2538,Hy-Vee Food Store #3 / Waterloo,1422 FLAMMANG DR,WATERLOO,50702,"1422 FLAMMANG DR\nWATERLOO 50702\n(42.459938, ...",7.0,Black Hawk,1701100.0,DECANTERS & SPECIALTY PACKAGES,962,Duggan's Distillers Products Corp,238,Forbidden Secret Coffee Pack,6,1500.0,$11.62,$17.43,6,$104.58,9.0,2.38
7,S28869500001,11/10/2015,3942,Twin Town Liquor,104 HIGHWAY 30 WEST,TOLEDO,52342,"104 HIGHWAY 30 WEST\nTOLEDO 52342\n(41.985887,...",86.0,Tama,,,65,Jim Beam Brands,237,Knob Creek w/ Crystal Decanter,3,1750.0,$35.55,$53.34,2,$106.68,3.5,0.92
8,S29339300091,11/30/2015,2662,Hy-Vee Wine & Spirits / Muscatine,"522 MULBERRY, SUITE A",MUSCATINE,52761,"522 MULBERRY, SUITE A\nMUSCATINE 52761\n",70.0,Muscatine,1701100.0,DECANTERS & SPECIALTY PACKAGES,65,Jim Beam Brands,173,Laphroaig w/ Whiskey Stones,12,750.0,$19.58,$29.37,4,$117.48,3.0,0.79
9,S29050900001,11/16/2015,4307,Crossroads Wine and Liquor,117 IOWA AVE,DUNLAP,712-2,"117 IOWA AVE\nDUNLAP 712-2\n(41.854728, -95.60...",43.0,Harrison,,,130,Disaronno International LLC,249,Disaronno Amaretto Cavalli Mignon 3-50ml Pack,20,150.0,$6.40,$9.60,2,$19.20,0.3,0.08


So it loads OK. Let's get some basic information about the data:

In [21]:
# Quick structural overview of the freshly loaded frame. All four results are
# shown because ast_node_interactivity is set to "all" in the settings cell.
df.shape  # (rows, columns)
df.info()  # per-column dtype and non-null count (prints, returns None)
df.describe() # summary statistics for the numeric columns
df.isnull().sum()  # number of missing values per column

(79998, 24)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79998 entries, 0 to 79997
Data columns (total 24 columns):
Invoice/Item Number      79998 non-null object
Date                     79998 non-null object
Store Number             79998 non-null int64
Store Name               79998 non-null object
Address                  79998 non-null object
City                     79998 non-null object
Zip Code                 79998 non-null object
Store Location           79998 non-null object
County Number            79894 non-null float64
County                   79894 non-null object
Category                 79896 non-null float64
Category Name            79874 non-null object
Vendor Number            79998 non-null int64
Vendor Name              79998 non-null object
Item Number              79998 non-null int64
Item Description         79998 non-null object
Pack                     79998 non-null float64
Bottle Volume (ml)       79998 non-null int64
State Bottle Cost        79998 non-null object

Unnamed: 0,Store Number,County Number,Category,Vendor Number,Item Number,Pack,Bottle Volume (ml),Bottles Sold,Volume Sold (Liters),Volume Sold (Gallons)
count,79998.0,79894.0,79896.0,79998.0,79998.0,79998.0,79998.0,79998.0,79998.0,79998.0
mean,3489.160842,57.027286,1044543.0,256.628653,45747.753606,12.160379,932.444924,9.703818,8.935812,2.360756
std,865.324238,27.244974,50398.75,143.166204,50836.477788,7.455412,478.295439,23.035683,26.982981,7.128135
min,2106.0,1.0,1011100.0,10.0,173.0,1.0,50.0,1.0,0.1,0.03
25%,2602.0,31.0,1012210.0,115.0,27102.0,6.0,750.0,3.0,1.75,0.46
50%,3650.0,61.0,1032080.0,260.0,38176.0,12.0,750.0,6.0,5.62,1.49
75%,4191.0,77.0,1062310.0,380.0,62097.0,12.0,1000.0,12.0,10.5,2.77
max,9018.0,99.0,1701100.0,978.0,993880.0,336.0,6000.0,2328.0,2328.0,614.99


Invoice/Item Number        0
Date                       0
Store Number               0
Store Name                 0
Address                    0
City                       0
Zip Code                   0
Store Location             0
County Number            104
County                   104
Category                 102
Category Name            124
Vendor Number              0
Vendor Name                0
Item Number                0
Item Description           0
Pack                       0
Bottle Volume (ml)         0
State Bottle Cost          0
State Bottle Retail        0
Bottles Sold               0
Sale (Dollars)             0
Volume Sold (Liters)       0
Volume Sold (Gallons)      0
dtype: int64

Inspection shows that there is a lot of redundant data in our dataframe. Let's drop the redundant or otherwise unhelpful columns, and 
also drop any row that contains an NA value - we lose less than 1 percent of the data doing this.

In [33]:
# Columns that are redundant or not useful for the analysis.
dropList = ["Pack","Volume Sold (Gallons)","Vendor Name","Vendor Number","Address","Store Name"]
# Build a new frame instead of mutating in place, so the cell can be re-run:
# - `columns=` already selects the axis, so the redundant `axis=1` is removed;
# - errors="ignore" skips columns that a previous run of this cell already dropped.
# Rows containing any NA value are then discarded.
df = df.drop(columns=dropList, errors="ignore").dropna()
df.shape #confirmation that it worked.

(79770, 18)

Next, let's do some data cleaning: 
- The column names need to be tidied. 
- Some columns are too broadly typed (float instead of integer).
- We can also separate the GPS coordinates in Store Location into latitude and longitude columns. 
- Drop the Store Location column after the GPS coordinates have been extracted.

In [36]:
# Map the raw CSV headers to short, lowercase, space-free column names.
# Keys that do not match a column of the frame are ignored by rename().
nameDict = {
    "Invoice/Item Number": "invoicenumber",
    "Date": "date",
    "Store Number": "storenumber",
    "Store Name": "storename",
    "City": "city",
    "Zip Code": "zipcode",
    "Store Location": "storelocation",
    "County Number": "countynumber",
    "County": "county",
    "Category": "category",
    "Category Name": "categoryname",
    "Item Number": "itemnumber",
    "Item Description": "itemdescription",
    "Bottle Volume (ml)": "bottlevolumeml",
    "State Bottle Cost": "statebottlecost",
    "State Bottle Retail": "statebottleretail",
    "Bottles Sold": "bottlessold",
    "Sale (Dollars)": "saleprice",
    "Volume Sold (Liters)": "volumesoldlitre",
}

# Apply the new names to the frame in place.
df.rename(columns=nameDict, inplace=True)

In [39]:
# Basic type changes for a set of columns.
# "countynumber" was read as float64 because the raw column contained missing
# values; those rows were removed by dropna(), so an integer cast is now safe.
# DataFrame.astype returns a new frame and has no `inplace` parameter, so the
# result is simply re-assigned.
df = df.astype({"countynumber": int})
df['countynumber'].head(5)

In [47]:
# Sanity checks on the store-location text before extracting coordinates.
import re

# Count rows whose location is an empty string (none are expected).
df[df['storelocation'].eq("")].shape

# The coordinates cannot be assumed to sit at a fixed position after splitting
# on newlines, so search for a parenthesised, comma-separated pair instead.
# The outer capture group makes findall return the pair with its parentheses.
myregex = r"(\(.+,.+\))"
for location in df['storelocation'].head(20).tolist():
    matches = re.findall(myregex, location)
    if matches:
        print(matches)


(0, 18)

['(42.459938, -92.327917)']
['(42.490073, -95.544793)']
['(42.025841, -96.095845)']
['(41.397023, -92.899722)']
['(43.29355, -94.218)']
['(41.699173, -93.035654)']
['(41.586319, -93.664182)']
['(41.325428, -93.109494)']
['(41.676203, -91.518536)']
['(41.45135, -91.035137)']
['(41.557404, -95.899334)']
['(41.45135, -91.035137)']
['(41.598514, -93.808855)']
['(41.325428, -93.109494)']
['(41.538207, -90.611972)']
['(40.809556, -91.141395)']


In [28]:
# First, let's extract the GPS coordinates.
# We can't assume every cell is a string, or that it contains GPS coordinates.
# Matches "(<number>, <number>)" anywhere in the text; the two groups capture
# the first and second number of the pair.
_GPS_PATTERN = re.compile(r"\(\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*\)")


def getlatlong(cell, pos):
    """Return one GPS coordinate parsed from a store-location cell.

    The coordinate pair is located with a regular expression rather than by
    splitting on newlines, because the pair is not guaranteed to sit at a
    fixed position in the text (some cells have no coordinates at all).

    Parameters
    ----------
    cell : object
        Raw store-location value, e.g.
        "1013 MAIN\\nKEOKUK 52632\\n(40.39978, -91.387531)".
    pos : int
        0 for the first number of the pair (latitude), 1 for the second
        (longitude).

    Returns
    -------
    float
        The requested coordinate, or NaN when `cell` is not a string or
        contains no "(lat, long)" pair.

    Raises
    ------
    ValueError
        If `pos` is neither 0 nor 1.
    """
    if pos not in (0, 1):
        raise ValueError("pos must be 0 (latitude) or 1 (longitude)")
    # Non-string cells (e.g. NaN) cannot contain coordinates.
    if not isinstance(cell, str):
        return np.nan
    found = _GPS_PATTERN.search(cell)
    if found is None:
        return np.nan
    # Regex groups are 1-based, hence the offset.
    return float(found.group(pos + 1))


# The column is called 'storelocation' after the rename step. Series.apply
# forwards extra positional arguments through `args`, so 0 is passed as `pos`.
latitude = df['storelocation'].apply(getlatlong, args=(0,))







Index(['Invoice/Item Number', 'Date', 'Store Number', 'City', 'Zip Code', 'Store Location', 'County Number', 'County', 'Category', 'Category Name', 'Item Number', 'Item Description', 'Bottle Volume (ml)', 'State Bottle Cost', 'State Bottle Retail', 'Bottles Sold', 'Sale (Dollars)', 'Volume Sold (Liters)'], dtype='object')