In [35]:
#Imports:
import pandas as pd
import numpy as np
import altair as alt
import matplotlib.pyplot as plt



In [36]:
#Lets make our console outputs more nice, by applying some settings.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
alt.renderers.enable('notebook')
alt.data_transformers.enable('default', max_rows=None)
%matplotlib inline 
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 1000)

RendererRegistry.enable('notebook')

DataTransformerRegistry.enable('default')

First, let's load the data in. I avoid the reduced dataset as it is missing 6 of the 24 original rows. This dataset has a list of transactions from all over Iowa. What analytics can we produce?

In [58]:
typeDict = {"Invoice/Item Number":"object"
,"Date":"object"
,"Store Number":"int64"
,"Store Name":"object"
,"Address":"object"
,"City":"object"
,"Zip Code":"object"
,"Store Location":"object"
,"County Number":"float64"
,"County":"object"
,"Category":"float64"
,"Category Name":"object"
,"Vendor Number":"int64"
,"Vendor Name":"object"
,"Item Number":"int64"
,"Item Description":"object"
,"Pack":"int64"
,"Bottle Volume (ml)":"float64"
,"State Bottle Cost":"object"
,"State Bottle Retail":"object"
,"Bottles Sold":"int64"
,"Sale (Dollars)":"object"
,"Volume Sold (Liters)":"float64"
,"Volume Sold (Gallons)":"float64"}

df = pd.read_csv("./data/originalcsv/splitfile.csv", dtype=typeDict) #,,low_memory=False)
#df.head(10)

So it loads OK. Lets get some basic data:

In [38]:
df.shape
df.info()
df.describe() #For numeric columns
df.isnull().sum()

(79998, 24)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79998 entries, 0 to 79997
Data columns (total 24 columns):
Invoice/Item Number      79998 non-null object
Date                     79998 non-null object
Store Number             79998 non-null int64
Store Name               79998 non-null object
Address                  79998 non-null object
City                     79998 non-null object
Zip Code                 79998 non-null object
Store Location           79998 non-null object
County Number            79894 non-null float64
County                   79894 non-null object
Category                 79896 non-null float64
Category Name            79874 non-null object
Vendor Number            79998 non-null int64
Vendor Name              79998 non-null object
Item Number              79998 non-null int64
Item Description         79998 non-null object
Pack                     79998 non-null int64
Bottle Volume (ml)       79998 non-null float64
State Bottle Cost        79998 non-null object

Unnamed: 0,Store Number,County Number,Category,Vendor Number,Item Number,Pack,Bottle Volume (ml),Bottles Sold,Volume Sold (Liters),Volume Sold (Gallons)
count,79998.0,79894.0,79896.0,79998.0,79998.0,79998.0,79998.0,79998.0,79998.0,79998.0
mean,3489.160842,57.027286,1044543.0,256.628653,45747.753606,12.160379,932.444924,9.703818,8.935812,2.360756
std,865.324238,27.244974,50398.75,143.166204,50836.477788,7.455412,478.295439,23.035683,26.982981,7.128135
min,2106.0,1.0,1011100.0,10.0,173.0,1.0,50.0,1.0,0.1,0.03
25%,2602.0,31.0,1012210.0,115.0,27102.0,6.0,750.0,3.0,1.75,0.46
50%,3650.0,61.0,1032080.0,260.0,38176.0,12.0,750.0,6.0,5.62,1.49
75%,4191.0,77.0,1062310.0,380.0,62097.0,12.0,1000.0,12.0,10.5,2.77
max,9018.0,99.0,1701100.0,978.0,993880.0,336.0,6000.0,2328.0,2328.0,614.99


Invoice/Item Number        0
Date                       0
Store Number               0
Store Name                 0
Address                    0
City                       0
Zip Code                   0
Store Location             0
County Number            104
County                   104
Category                 102
Category Name            124
Vendor Number              0
Vendor Name                0
Item Number                0
Item Description           0
Pack                       0
Bottle Volume (ml)         0
State Bottle Cost          0
State Bottle Retail        0
Bottles Sold               0
Sale (Dollars)             0
Volume Sold (Liters)       0
Volume Sold (Gallons)      0
dtype: int64

There is a lot of redundant data in our dataframe, via inspection. Lets drop useless columns and 
also dump any row that contains an NA value - we lose less than 1 percent of the data doing this (row-wise).

In [59]:
dropList = ["Pack","Volume Sold (Gallons)","Vendor Name","Vendor Number","Address","Store Name"]
df.drop(columns=dropList, axis=1,inplace=True)
df.dropna(inplace=True)
df.shape #confirmation that it worked.

(79770, 18)

Next, lets do some data cleaning: 
- The column names need to be tidied. 
- some columns are too broadly typed (float instead of integer)
- Also, we can separate the GPS coordinate from the Store Location into latitude and longitude columns. 
- drop the Store Locaiton after GPS has been extracted.

In [60]:
nameDict = {"Invoice/Item Number":"invoicenumber"
,"Date":"date"
,"Store Number":"storenumber"
,"Store Name":"storename"
,"City":"city"
,"Zip Code":"zipcode"
,"Store Location":"storelocation"
,"County Number":"countynumber"
,"County":"county"
,"Category":"category"
,"Category Name":"categoryname"
,"Item Number":"itemnumber"
,"Item Description":"itemdescription"
,"Bottle Volume (ml)":"bottlevolumeml"
,"State Bottle Cost":"statebottlecost"
,"State Bottle Retail":"statebottleretail"
,"Bottles Sold":"bottlessold"
,"Sale (Dollars)":"saleprice"
,"Volume Sold (Liters)":"volumesoldlitre"}

df.rename(columns = nameDict,inplace=True)

In [72]:
#Basic type changes, and extractions for a set of columns.
df = df.astype({"countynumber": int},inplace=True)
df = df.astype({"category": int}, inplace=True)

In [62]:
#lets clean up all the price columns. We need string -> float.
def cleanup(cell):
    return float(cell.replace("$",""))

for columnname in ["statebottlecost","statebottleretail","saleprice"]:
    df[columnname] = df[columnname].apply(cleanup)
    df.astype({columnname:float})

#df.head(5)

In [63]:
#Doing some basic checks first
#first, we replace the Store Location with just the GPS coordinate.
import re

df[df['storelocation'] == ""].shape #we know that there are no empty store locations.

#basic extraction, and its results:
#cant assume that the gps coordinates will always be in the 3rd position of a split :(
#Trick: I needed an extra pair of parentheses to get the match to work.
myregex = r"(\(.+,.+\))"

def cutgps(cell,myregex):
    store = re.findall(myregex,cell)
    if (store): #empty
         retVal = store[0]
    else:
         retVal = "NA" #Some of the entries are missing their lat/long data
    return retVal
    
df['storelocation'] = df['storelocation'].apply(cutgps,args=(myregex,))
#df['storelocation'].tail(10)


(0, 18)

In [64]:
df.head(10)

Unnamed: 0,invoicenumber,date,storenumber,city,zipcode,storelocation,countynumber,county,category,categoryname,itemnumber,itemdescription,bottlevolumeml,statebottlecost,statebottleretail,bottlessold,saleprice,volumesoldlitre
6,S28865700001,11/09/2015,2538,WATERLOO,50702,"(42.459938, -92.327917)",7,Black Hawk,1701100.0,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,6,104.58,9.0
8,S29339300091,11/30/2015,2662,MUSCATINE,52761,,70,Muscatine,1701100.0,DECANTERS & SPECIALTY PACKAGES,173,Laphroaig w/ Whiskey Stones,750.0,19.58,29.37,4,117.48,3.0
13,S28866900001,11/11/2015,3650,HOLSTEIN,51025,"(42.490073, -95.544793)",47,Ida,1701100.0,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,1,17.43,1.5
18,S29134300126,11/18/2015,3723,ONAWA,51040,"(42.025841, -96.095845)",67,Monona,1081200.0,CREAM LIQUEURS,258,"Rumchata ""GoChatas""",6000.0,99.0,148.5,1,148.5,6.0
21,S29282800048,11/23/2015,2642,PELLA,50219,"(41.397023, -92.899722)",63,Marion,1701100.0,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,6,104.58,9.0
25,S28867000001,11/04/2015,3842,BANCROFT,50517,"(43.29355, -94.218)",55,Kossuth,1701100.0,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,3,52.29,4.5
29,S28865800001,11/09/2015,2539,IOWA FALLS,50126,,42,Hardin,1701100.0,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,6,104.58,9.0
38,S28867100001,11/09/2015,4604,NEWTON,50208,"(41.699173, -93.035654)",50,Jasper,1701100.0,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,2,34.86,3.0
42,S29191200001,11/19/2015,2248,DES MOINES,50312,"(41.586319, -93.664182)",77,Polk,1701100.0,DECANTERS & SPECIALTY PACKAGES,173,Laphroaig w/ Whiskey Stones,750.0,19.58,29.37,36,1057.32,27.0
50,S29137200001,11/18/2015,2566,KNOXVILLE,50138,"(41.325428, -93.109494)",63,Marion,1701100.0,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,12,209.16,18.0


In [65]:
#first, lets extract the GPS coordinates:
#What can we assume?
#each column is a non-empty string, at least.

def getlatlong(cell, pos):
    if (cell == "NA"):
        return cell
    sectionList = cell.replace("(","").replace(")","").split(",")
    return float(sectionList[pos]) #0 or 1. Float will clip out spaces for us!
 
#df['storelocation'].head(10).apply(getlatlong,args=(0,))    
    
df['latitude'] = df['storelocation'].apply(getlatlong,args=(0,))
df['longitude'] = df['storelocation'].apply(getlatlong,args=(1,))

df.head(5)

Unnamed: 0,invoicenumber,date,storenumber,city,zipcode,storelocation,countynumber,county,category,categoryname,itemnumber,itemdescription,bottlevolumeml,statebottlecost,statebottleretail,bottlessold,saleprice,volumesoldlitre,latitude,longitude
6,S28865700001,11/09/2015,2538,WATERLOO,50702,"(42.459938, -92.327917)",7,Black Hawk,1701100.0,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,6,104.58,9.0,42.4599,-92.3279
8,S29339300091,11/30/2015,2662,MUSCATINE,52761,,70,Muscatine,1701100.0,DECANTERS & SPECIALTY PACKAGES,173,Laphroaig w/ Whiskey Stones,750.0,19.58,29.37,4,117.48,3.0,,
13,S28866900001,11/11/2015,3650,HOLSTEIN,51025,"(42.490073, -95.544793)",47,Ida,1701100.0,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,1,17.43,1.5,42.4901,-95.5448
18,S29134300126,11/18/2015,3723,ONAWA,51040,"(42.025841, -96.095845)",67,Monona,1081200.0,CREAM LIQUEURS,258,"Rumchata ""GoChatas""",6000.0,99.0,148.5,1,148.5,6.0,42.0258,-96.0958
21,S29282800048,11/23/2015,2642,PELLA,50219,"(41.397023, -92.899722)",63,Marion,1701100.0,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,6,104.58,9.0,41.397,-92.8997


In [67]:
#finally, destroy the storelocation column, and delete any other rows that are NA (specifically, lat or long)
df.drop(columns=["storelocation"], axis=1,inplace=True)

KeyError: "['storelocation'] not found in axis"

## Finally, we have nice clean data \o/ !

Lets write it to a file, and take a look at it in tableau!

In [68]:
df.head(10)

Unnamed: 0,invoicenumber,date,storenumber,city,zipcode,countynumber,county,category,categoryname,itemnumber,itemdescription,bottlevolumeml,statebottlecost,statebottleretail,bottlessold,saleprice,volumesoldlitre,latitude,longitude
6,S28865700001,11/09/2015,2538,WATERLOO,50702,7,Black Hawk,1701100.0,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,6,104.58,9.0,42.4599,-92.3279
8,S29339300091,11/30/2015,2662,MUSCATINE,52761,70,Muscatine,1701100.0,DECANTERS & SPECIALTY PACKAGES,173,Laphroaig w/ Whiskey Stones,750.0,19.58,29.37,4,117.48,3.0,,
13,S28866900001,11/11/2015,3650,HOLSTEIN,51025,47,Ida,1701100.0,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,1,17.43,1.5,42.4901,-95.5448
18,S29134300126,11/18/2015,3723,ONAWA,51040,67,Monona,1081200.0,CREAM LIQUEURS,258,"Rumchata ""GoChatas""",6000.0,99.0,148.5,1,148.5,6.0,42.0258,-96.0958
21,S29282800048,11/23/2015,2642,PELLA,50219,63,Marion,1701100.0,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,6,104.58,9.0,41.397,-92.8997
25,S28867000001,11/04/2015,3842,BANCROFT,50517,55,Kossuth,1701100.0,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,3,52.29,4.5,43.2936,-94.218
29,S28865800001,11/09/2015,2539,IOWA FALLS,50126,42,Hardin,1701100.0,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,6,104.58,9.0,,
38,S28867100001,11/09/2015,4604,NEWTON,50208,50,Jasper,1701100.0,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,2,34.86,3.0,41.6992,-93.0357
42,S29191200001,11/19/2015,2248,DES MOINES,50312,77,Polk,1701100.0,DECANTERS & SPECIALTY PACKAGES,173,Laphroaig w/ Whiskey Stones,750.0,19.58,29.37,36,1057.32,27.0,41.5863,-93.6642
50,S29137200001,11/18/2015,2566,KNOXVILLE,50138,63,Marion,1701100.0,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,12,209.16,18.0,41.3254,-93.1095


After inserting the spreadsheet into tableau, I realized that there are some issues. First, tableau doesn't just accept GPS coordinates, it won't easily associate them with other data. To get a generated map of regions, you need to have a Hiearchy of named places. Here: Country (US), State: Iowa, then County, then City. So we need to generate
dummy rows filled with these values, to get tableau to parse each point correctly.

In [75]:
df['country'] = "US"
df['state'] = "Iowa"
df.head(3)

Unnamed: 0,invoicenumber,date,storenumber,city,zipcode,countynumber,county,category,categoryname,itemnumber,itemdescription,bottlevolumeml,statebottlecost,statebottleretail,bottlessold,saleprice,volumesoldlitre,latitude,longitude,country,state
6,S28865700001,11/09/2015,2538,WATERLOO,50702,7,Black Hawk,1701100,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,6,104.58,9.0,42.4599,-92.3279,US,Iowa
8,S29339300091,11/30/2015,2662,MUSCATINE,52761,70,Muscatine,1701100,DECANTERS & SPECIALTY PACKAGES,173,Laphroaig w/ Whiskey Stones,750.0,19.58,29.37,4,117.48,3.0,,,US,Iowa
13,S28866900001,11/11/2015,3650,HOLSTEIN,51025,47,Ida,1701100,DECANTERS & SPECIALTY PACKAGES,238,Forbidden Secret Coffee Pack,1500.0,11.62,17.43,1,17.43,1.5,42.4901,-95.5448,US,Iowa


In [76]:
df.to_csv("./data/tidy_reduced_data.csv",header=True,sep=",",index=False,encoding="utf-8")

In [2]:
import pandas as pd
testDF = pd.DataFrame({'a':['809809.2','809809.2','809809.2']})

In [None]:
testDF[]