# Getting started

Once you've chosen your scenario, download the data from [the Iowa website](https://data.iowa.gov/Economy/Iowa-Liquor-Sales/m3tr-qhgy) in csv format. Start by loading the data with pandas. You may need to parse the date columns appropriately.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline

from sklearn import datasets
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

### Collecting Data

In [2]:
# Named csv 'Iowa"
Iowa = pd.read_csv('/Users/macbook/GA-DSI/projects/projects-weekly/project-03/Iowa_Liquor_sales_sample_10pct.csv')

In [3]:
# Observe Data 
Iowa.head()


Unnamed: 0,Date,Store Number,City,Zip Code,County Number,County,Category,Category Name,Vendor Number,Item Number,Item Description,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons)
0,11/04/2015,3717,SUMNER,50674,9.0,Bremer,1051100.0,APRICOT BRANDIES,55,54436,Mr. Boston Apricot Brandy,750,$4.50,$6.75,12,$81.00,9.0,2.38
1,03/02/2016,2614,DAVENPORT,52807,82.0,Scott,1011100.0,BLENDED WHISKIES,395,27605,Tin Cup,750,$13.75,$20.63,2,$41.26,1.5,0.4
2,02/11/2016,2106,CEDAR FALLS,50613,7.0,Black Hawk,1011200.0,STRAIGHT BOURBON WHISKIES,65,19067,Jim Beam,1000,$12.59,$18.89,24,$453.36,24.0,6.34
3,02/03/2016,2501,AMES,50010,85.0,Story,1071100.0,AMERICAN COCKTAILS,395,59154,1800 Ultimate Margarita,1750,$9.50,$14.25,6,$85.50,10.5,2.77
4,08/18/2015,3654,BELMOND,50421,99.0,Wright,1031080.0,VODKA 80 PROOF,297,35918,Five O'clock Vodka,1750,$7.20,$10.80,12,$129.60,21.0,5.55


### Fixing types, NaN values, Cleaning names

In [12]:
# Remove all NaN values

Iowa = Iowa.dropna()

In [9]:
# Checking types
Iowa.dtypes

date                    object
store_number             int64
city                    object
zip_code                object
county_number          float64
county                  object
category               float64
category_name           object
vendor_number            int64
item_number              int64
item_description        object
bottle_volume            int64
state_bottle_cost       object
state_bottle_retail     object
bottles_sold             int64
sales                   object
volume_sold            float64
volume_sold_gallons    float64
dtype: object

In [6]:
# Clean up column names 1: remove units of measurement
Iowa.rename(columns = {'Bottle Volume (ml)': 'Bottle Volume', 'Sale (Dollars)': 'Sales', \
                      'Volume Sold (Liters)': 'Volume Sold', \
                       'Volume Sold (Gallons)': 'Volume Sold Gallons'}, inplace=True)

# Clean up column names 2: convert all to lowercase letters
import string 
lower = string.ascii_lowercase
Iowa.rename(columns = lambda x: x.lower(), inplace=True)

# Clean up column names 3: replace '', '(' and ')'
Iowa.rename(columns = lambda x: x.replace(' ', '_'), inplace=True)
Iowa.rename(columns = lambda x: x.replace('(', ''), inplace=True)
Iowa.rename(columns = lambda x: x.replace(')', ''), inplace=True)

In [8]:
# Change other object types
Iowa['city'] = Iowa['city'].astype(str)
Iowa['county'] = Iowa['county'].astype(str)
Iowa['category_name'] = Iowa['category_name'].astype(str)
Iowa['item_description'] = Iowa['item_description'].astype(str)

In [10]:
#Change Date to datetime
Iowa['date'] = pd.to_datetime(Iowa['date'])

In [13]:
# Force zip codes to floats, change to ints
Iowa['zip_code'] = pd.to_numeric(Iowa['zip_code'], errors = 'coerce')
Iowa['zip_code'] = Iowa.loc[:, ['zip_code']].astype(int)

In [14]:
# Change County Number to int
Iowa['county_number'] = Iowa.loc[:, ['county_number']].astype(int)

In [15]:
Iowa.head()

Unnamed: 0,date,store_number,city,zip_code,county_number,county,category,category_name,vendor_number,item_number,item_description,bottle_volume,state_bottle_cost,state_bottle_retail,bottles_sold,sales,volume_sold,volume_sold_gallons
0,2015-11-04,3717,SUMNER,50674,9,Bremer,1051100.0,APRICOT BRANDIES,55,54436,Mr. Boston Apricot Brandy,750,$4.50,$6.75,12,$81.00,9.0,2.38
1,2016-03-02,2614,DAVENPORT,52807,82,Scott,1011100.0,BLENDED WHISKIES,395,27605,Tin Cup,750,$13.75,$20.63,2,$41.26,1.5,0.4
2,2016-02-11,2106,CEDAR FALLS,50613,7,Black Hawk,1011200.0,STRAIGHT BOURBON WHISKIES,65,19067,Jim Beam,1000,$12.59,$18.89,24,$453.36,24.0,6.34
3,2016-02-03,2501,AMES,50010,85,Story,1071100.0,AMERICAN COCKTAILS,395,59154,1800 Ultimate Margarita,1750,$9.50,$14.25,6,$85.50,10.5,2.77
4,2015-08-18,3654,BELMOND,50421,99,Wright,1031080.0,VODKA 80 PROOF,297,35918,Five O'clock Vodka,1750,$7.20,$10.80,12,$129.60,21.0,5.55


In [16]:
# Checking for NaNs
Iowa.isnull().sum()

date                   0
store_number           0
city                   0
zip_code               0
county_number          0
county                 0
category               0
category_name          0
vendor_number          0
item_number            0
item_description       0
bottle_volume          0
state_bottle_cost      0
state_bottle_retail    0
bottles_sold           0
sales                  0
volume_sold            0
volume_sold_gallons    0
dtype: int64

In [17]:
# Changing types of all dollar columns

Iowa['state_bottle_retail'] = Iowa['state_bottle_retail'].str.replace('$', '')
Iowa['state_bottle_cost'] = Iowa['state_bottle_cost'].str.replace('$', '')
Iowa['sales'] = Iowa['sales'].str.replace('$', '')

Iowa['state_bottle_retail'] = Iowa.loc[:, ['state_bottle_retail']].astype(float)
Iowa['state_bottle_cost'] = Iowa.loc[:, ['state_bottle_cost']].astype(float)
Iowa['sales'] = Iowa.loc[:, ['sales']].astype(float)

### Observing, Exploring Data. Creating useful columns, seperating data

In [18]:
# Create category table to observe
category_df = Iowa.groupby('category')[['category', 'category_name']]
desc_df = Iowa.groupby('item_number')[['item_number', 'item_description']]
county_df = Iowa.groupby('county_number')[['county_number', 'county']]

In [19]:
category_df.head(3)

Unnamed: 0,category,category_name
0,1051100.0,APRICOT BRANDIES
1,1011100.0,BLENDED WHISKIES
2,1011200.0,STRAIGHT BOURBON WHISKIES
3,1071100.0,AMERICAN COCKTAILS
4,1031080.0,VODKA 80 PROOF
5,1041100.0,AMERICAN DRY GINS
6,1051010.0,AMERICAN GRAPE BRANDIES
7,1012100.0,CANADIAN WHISKIES
8,1032080.0,IMPORTED VODKA
9,1081315.0,CINNAMON SCHNAPPS


In [20]:
# Create a Year column.

Iowa['year'] = Iowa['date'].map(lambda x: x.year)

In [37]:
# Create table with just the year 2015
store_sales_2015 = store_sales[store_sales['year'] == 2015]
store_sales_2015.head(3)

Unnamed: 0_level_0,store_number,city,year,volume_sold,volume_sold,profit_per_bottle,profit_per_bottle,sales,sales
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,sum,mean,sum,mean,sum,mean
0,2106,CEDAR FALLS,2015,9719.85,18.478802,48742.2,92.665779,146038.7,277.640114
2,2113,GOWRIE,2015,659.85,4.488776,3109.04,21.149932,9310.22,63.33483
4,2130,WATERLOO,2015,6879.37,17.594297,37229.32,95.215652,111583.91,285.380844


In [24]:
# Create year, quarter, and month
def extract_month(x):
    month = str('{:02d}'.format(x.month)) + "-" + str((x.year))
    return month
def extract_quarter(x):
    quarter = "Q" + str(x.quarter) + "-" + str(x.year)
    return quarter

Iowa["year"] = Iowa["date"].dt.year
Iowa["quarter"] = Iowa["date"].apply(extract_quarter)
Iowa["month"] = Iowa["date"].apply(extract_month)

### Remove Duplicate County Names & Fill in Missing Counties

In [25]:
# Match missing County with City
df_county = pd.pivot_table(Iowa, index=['city', 'county'], values=['sales'], aggfunc=sum)
df_county.drop('sales', axis=1, inplace=True)
df_county.reset_index(inplace=True)
df_county.head()

Unnamed: 0,city,county
0,ACKLEY,Hardin
1,ACKLEY,Webster
2,ADAIR,Adair
3,ADEL,Dallas
4,AFTON,Union


In [26]:
print df_county['city'].duplicated().sum()
a = df_county[df_county['city'].duplicated() == True].index.tolist()
df_county['city'].loc[a]

11


1               ACKLEY
33          BETTENDORF
66               CLIVE
74             CORNING
134         FORT DODGE
259             NEWTON
263      NORTH LIBERTY
276          OSKALOOSA
288              PERRY
336       STATE CENTER
379    WEST DES MOINES
Name: city, dtype: object

In [27]:
# Dropped incorrect city-county mappings
# Remove from iowa dataframe: index [32, 74, 134, 260, 263, 276, 288, 336]

b = [i+1 for i in a]
c = [i-1 for i in a]
d = a + b + c
e = sorted(d)

df_county.loc[e, :]
county_dict = {'bettendorf':'scott', 'corning':'adams', 'fort dodge': 'webster','newton':'jasper', \
               'north libery':'johnson', 'OSKALOOSA': 'mahaska', 'PERRY':'dallas', 'STATE CENTER': 'marshall'}

In [28]:
# Remove from iowa dataframe: index [32, 74, 134, 260, 263, 276, 288, 336]
df_county.drop([32, 74, 134, 260, 263, 276, 288, 336], inplace=True)

# Rename Ackley, Clive, and West Des Moines to account for parts of town in different counties
df_county.set_value([1,66,380], 'city', ['ACKLEY_WEBSTER', 'CLIVE_POLK', 'WEST_DES_MOINES_POLK'])

Unnamed: 0,city,county
0,ACKLEY,Hardin
1,ACKLEY_WEBSTER,Webster
2,ADAIR,Adair
3,ADEL,Dallas
4,AFTON,Union
5,AKRON,Plymouth
6,ALBIA,Monroe
7,ALDEN,Hardin
8,ALGONA,Kossuth
9,ALLISON,Butler


In [29]:
# Merge df_county into original iowa df
# Note: Original "county" renamed "county_x" and new "county_y" col created
Iowa = pd.merge(Iowa,df_county, on = 'city', how = 'left')
Iowa.head(3)

Unnamed: 0,date,store_number,city,zip_code,county_number,county_x,category,category_name,vendor_number,item_number,...,sales,volume_sold,volume_sold_gallons,year,profit_per_bottle,rev_per_ml,price_per_ml,quarter,month,county_y
0,2015-11-04,3717,SUMNER,50674,9,Bremer,1051100.0,APRICOT BRANDIES,55,54436,...,81.0,9.0,2.38,2015,27.0,9.0,0.75,Q4-2015,11-2015,Bremer
1,2016-03-02,2614,DAVENPORT,52807,82,Scott,1011100.0,BLENDED WHISKIES,395,27605,...,41.26,1.5,0.4,2016,13.76,27.506667,13.753333,Q1-2016,03-2016,Scott
2,2016-02-11,2106,CEDAR FALLS,50613,7,Black Hawk,1011200.0,STRAIGHT BOURBON WHISKIES,65,19067,...,453.36,24.0,6.34,2016,151.2,18.89,0.787083,Q1-2016,02-2016,Black Hawk


In [30]:
# Fill missing county names in "county_x" with county names from "county_y"
Iowa['county_x'].fillna(Iowa['county_y'], inplace=True)
Iowa.isnull().sum()

date                     0
store_number             0
city                     0
zip_code                 0
county_number            0
county_x                 0
category                 0
category_name            0
vendor_number            0
item_number              0
item_description         0
bottle_volume            0
state_bottle_cost        0
state_bottle_retail      0
bottles_sold             0
sales                    0
volume_sold              0
volume_sold_gallons      0
year                     0
profit_per_bottle        0
rev_per_ml               0
price_per_ml             0
quarter                  0
month                    0
county_y               316
dtype: int64

### Metrics and Analysis

In [31]:
# Checking to make that that the 'sales' columns represents total revenue
Iowa['sales_check'] = Iowa['state_bottle_retail'] * Iowa['bottles_sold']
print sum(Iowa['sales_check']-Iowa['sales'])
Iowa.drop('sales_check', axis=1, inplace=True)

2.37783126522e-11


In [39]:
# Create Pivot Table of store profit. Not what I am looking for but it gave me ideas
store_profit = pd.pivot_table(Iowa, values=['city'], index=['store_number', 'profit_per_bottle', 'year'], aggfunc=np.sum)
store_profit.reset_index(inplace=True)
store_profit.drop('city', axis=1, inplace=True)
store_profit.head()

Unnamed: 0,store_number,profit_per_bottle,year
0,2106,1.65,2015
1,2106,1.87,2015
2,2106,2.35,2016
3,2106,2.4,2015
4,2106,2.45,2015


In [38]:
#Create profit per bottle and total profit per category
Iowa["profit_per_bottle"] = (Iowa["state_bottle_retail"] - Iowa["state_bottle_cost"]) * Iowa["bottles_sold"]
Iowa['rev_per_ml'] = Iowa['sales'] / Iowa['volume_sold']
Iowa['price_per_ml'] = Iowa['state_bottle_retail'] / Iowa['volume_sold']
Iowa['profit_per_ml'] = Iowa['profit_per_bottle'] / Iowa['volume_sold']
Iowa['profit_%per_ml'] = Iowa['profit_per_ml'] / Iowa['rev_per_ml']
Iowa.head()

Unnamed: 0,date,store_number,city,zip_code,county_number,county_x,category,category_name,vendor_number,item_number,...,volume_sold_gallons,year,profit_per_bottle,rev_per_ml,price_per_ml,quarter,month,county_y,profit_per_ml,profit_%per_ml
0,2015-11-04,3717,SUMNER,50674,9,Bremer,1051100.0,APRICOT BRANDIES,55,54436,...,2.38,2015,27.0,9.0,0.75,Q4-2015,11-2015,Bremer,3.0,0.333333
1,2016-03-02,2614,DAVENPORT,52807,82,Scott,1011100.0,BLENDED WHISKIES,395,27605,...,0.4,2016,13.76,27.506667,13.753333,Q1-2016,03-2016,Scott,9.173333,0.333495
2,2016-02-11,2106,CEDAR FALLS,50613,7,Black Hawk,1011200.0,STRAIGHT BOURBON WHISKIES,65,19067,...,6.34,2016,151.2,18.89,0.787083,Q1-2016,02-2016,Black Hawk,6.3,0.33351
3,2016-02-03,2501,AMES,50010,85,Story,1071100.0,AMERICAN COCKTAILS,395,59154,...,2.77,2016,28.5,8.142857,1.357143,Q1-2016,02-2016,Story,2.714286,0.333333
4,2015-08-18,3654,BELMOND,50421,99,Wright,1031080.0,VODKA 80 PROOF,297,35918,...,5.55,2015,43.2,6.171429,0.514286,Q3-2015,08-2015,Wright,2.057143,0.333333


In [34]:
#Compute sales per store sum and mean
city_sales = Iowa.groupby(by=['store_number', 'city', 'year'], as_index=False)

store_sales = city_sales.agg({'sales':[np.sum, np.mean],
                             'volume_sold': [np.sum, np.mean],
                             'profit_per_bottle': [np.sum, np.mean]})
store_sales.head(3)

Unnamed: 0_level_0,store_number,city,year,volume_sold,volume_sold,profit_per_bottle,profit_per_bottle,sales,sales
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,sum,mean,sum,mean,sum,mean
0,2106,CEDAR FALLS,2015,9719.85,18.478802,48742.2,92.665779,146038.7,277.640114
1,2106,CEDAR FALLS,2016,2116.25,16.795635,10174.68,80.751429,30478.75,241.894841
2,2113,GOWRIE,2015,659.85,4.488776,3109.04,21.149932,9310.22,63.33483


In [33]:
# Create table of cities and stores
Iowa_cities = pd.pivot_table(Iowa, index=['city', 'year', 'store_number'], values=['profit_per_bottle'])
Iowa_cities.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,profit_per_bottle
city,year,store_number,Unnamed: 3_level_1
ACKLEY,2015,3591,25.517353
ACKLEY,2015,4415,26.83878
ACKLEY,2016,4415,17.511875


## Drop Pop-up stores

Because they are not open year round, the spike in sales will inappropriately skew the data

In [41]:
# Check for stores that were not open for the full year or closed during the year
dates_open = Iowa.groupby(['store_number'])['date'].agg([min, max])
dates_open.reset_index(inplace=True)
dates_open.tail(3)

Unnamed: 0,store_number,min,max
1374,9013,2015-06-04,2016-03-09
1375,9018,2015-10-27,2015-10-27
1376,9023,2016-03-08,2016-03-08


In [59]:
# Seperating pop-up stores
dates_open['open'] = dates_open['min'].apply(lambda x: 1 if x > pd.to_datetime('2015-03-31') else 0)
dates_open['closed'] = dates_open['max'].apply(lambda x: 1 if x <= pd.to_datetime('2015-12-31') else 0)
dates_open['partial'] = dates_open['closed'] + dates_open['open']

In [60]:
dates_open.tail()

Unnamed: 0,store_number,min,max,closed,open,partial
1372,9002,2015-01-15,2016-03-10,0,0,0
1373,9010,2015-01-20,2016-02-01,0,0,0
1374,9013,2015-06-04,2016-03-09,0,1,1
1375,9018,2015-10-27,2015-10-27,1,1,2
1376,9023,2016-03-08,2016-03-08,0,1,1


In [61]:
# List of 201 pop-up stores
partial_stores = list(dates_open[dates_open["partial"]!=0]["store_number"].values)
print 'Number of stores not open for full year 2015: ' + str(len(partial_stores))

open_stores = list(dates_open[dates_open['partial'] == 0]['store_number'].values)
print 'Total stores: '+ str(len(partial_stores) + len(open_stores))
print 'Total stores open all of 2015: ' + str(len(open_stores))


Number of stores not open for full year 2015: 201
Total stores: 1377
Total stores open all of 2015: 1176


In [62]:
# Amount of revenue generated by stores only partially open in 2015
partial_rev = sum(Iowa[Iowa['store_number'].isin(partial_stores)]['sales']) / sum(Iowa['sales']) 
print 'Only ' + str(round(partial_rev*100,2)) + '% of sales from stores partially open in 2015'
print 'Sales from these 201 stores will be dropped from analysis'

Only 3.79% of sales from stores partially open in 2015
Sales from these 201 stores will be dropped from analysis


In [64]:
# New df without pop-ups
full_iowa = Iowa[Iowa['store_number'].isin(open_stores)]
full_iowa.head(3)

Unnamed: 0,date,store_number,city,zip_code,county_number,county_x,category,category_name,vendor_number,item_number,...,volume_sold_gallons,year,profit_per_bottle,rev_per_ml,price_per_ml,quarter,month,county_y,profit_per_ml,profit_%per_ml
0,2015-11-04,3717,SUMNER,50674,9,Bremer,1051100.0,APRICOT BRANDIES,55,54436,...,2.38,2015,27.0,9.0,0.75,Q4-2015,11-2015,Bremer,3.0,0.333333
1,2016-03-02,2614,DAVENPORT,52807,82,Scott,1011100.0,BLENDED WHISKIES,395,27605,...,0.4,2016,13.76,27.506667,13.753333,Q1-2016,03-2016,Scott,9.173333,0.333495
2,2016-02-11,2106,CEDAR FALLS,50613,7,Black Hawk,1011200.0,STRAIGHT BOURBON WHISKIES,65,19067,...,6.34,2016,151.2,18.89,0.787083,Q1-2016,02-2016,Black Hawk,6.3,0.33351


### Pass through 2015 metrics on new df

In [None]:
sum_metrics = ['sales', 'profit']
iowa_sum_2015 = idf[idf['year'] == 2015].groupby(['store'])[sum_metrics].agg(np.sum)
iowa_sum_2015.columns = ['2015_total_revenue','2015_profit']
iowa_sum_2015.reset_index(inplace=True)
print len(iowa_sum_2015)
iowa_sum_2015.describe(include='all')

In [74]:
#Compute sales per store sum and mean. My partner showed me a better way to do this than previously
iowa_2015 = full_iowa[full_iowa['year'] == 2015]
sum_metrics = ['sales', 'profit_per_bottle']
iowa_2015 = full_iowa.groupby(by=['store_number'])[sum_metrics].agg(np.sum)
iowa_2015.columns = ['2015_revenue', '2015_profit']
iowa_2015.reset_index(inplace=True)
print len(iowa_2015)
iowa_2015.describe(include='all')



1176


Unnamed: 0,store_number,2015_revenue,2015_profit
count,1176.0,1176.0,1176.0
mean,4167.448129,29483.56,9857.503036
std,813.040577,69408.49,23177.662511
min,2106.0,641.01,214.14
25%,3807.5,4812.282,1610.025
50%,4379.5,10431.84,3499.735
75%,4742.25,27032.19,9055.7625
max,9010.0,1215399.0,405404.53


In [76]:
iowa_2015.dtypes

store_number      int64
2015_revenue    float64
2015_profit     float64
dtype: object

In [87]:
# Change types to int
iowa_2015['2015_revenue'] = iowa_2015['2015_revenue'].astype(int)
iowa_2015['2015_profit'] = iowa_2015['2015_profit'].astype(int)

In [89]:
# Create a df summarizing mean metrics for 2015 at store-level
mean_metrics = ['sales', 'profit_per_bottle', 'price_per_ml', 'profit_per_ml', 'profit_per_bottle']
iowa_mean_2015 = full_iowa[full_iowa['year'] == 2015].groupby(['store_number'])[mean_metrics].agg(np.mean)
iowa_mean_2015.columns = ['2015_avg_rev','2015_avg_profit','2015_avg_price_per_ml', \
                         '2015_avg_profit_per_ml', '2015_avg_profit_per_bottle']
iowa_mean_2015.reset_index(inplace=True)
print len(iowa_mean_2015)
iowa_mean_2015.describe(include='all')

1176


Unnamed: 0,store_number,2015_avg_rev,2015_avg_profit,2015_avg_price_per_ml,2015_avg_profit_per_ml,2015_avg_profit_per_bottle
count,1176.0,1176.0,1176.0,1176.0,1176.0,1176.0
mean,4167.448129,124.013872,41.4877,4.707516,5.68664,41.4877
std,813.040577,117.894809,39.433422,2.915591,1.251868,39.433422
min,2106.0,24.757826,8.268261,0.543021,3.173605,8.268261
25%,3807.5,76.549265,25.575864,2.188248,4.944523,25.575864
50%,4379.5,106.129977,35.552276,4.269049,5.536215,35.552276
75%,4742.25,143.523493,48.12428,6.395731,6.23182,48.12428
max,9010.0,2061.811833,691.704,19.266372,27.428178,691.704


In [90]:
# Create a df summarizing total sales and revenues in 2015 at store-level
sum_metrics = ['sales', 'profit_per_bottle']
iowa_sum_2015Q1 = full_iowa[full_iowa['quarter'] == 'Q1-2015'].groupby(['store_number'])[sum_metrics].agg(np.sum)
iowa_sum_2015Q1.columns = ['Q1-15_total_revenue','Q1-15_profit']
iowa_sum_2015Q1.reset_index(inplace=True)
print len(iowa_sum_2015Q1)

# Create a df summarizing mean metrics
mean_metrics = ['sales', 'profit_per_bottle', 'price_per_ml', 'profit_per_ml']
iowa_mean_2015Q1 = full_iowa[full_iowa['quarter'] == 'Q1-2015'].groupby(['store'])[mean_metrics].agg(np.mean)
iowa_mean_2015Q1.columns = ['Q1-15_avg_rev','Q1-15_avg_profit','Q1-15_avg_price_per_ml', \
                         'Q1-15_avg_profit_per_ml', 'Q1-15_avg_profit_per_bottle']
iowa_mean_2015Q1.reset_index(inplace=True)
print len(iowa_mean_2015Q1)

NameError: name 'iowa_full' is not defined

# Build your models

Using scikit-learn or statsmodels, build the necessary models for your scenario. Evaluate model fit.

In [None]:
from sklearn import linear_model


## Plot your results

Again make sure that you record any valuable information. For example, in the tax scenario, did you find the sales from the first three months of the year to be a good predictor of the total sales for the year? Plot the predictions versus the true values and discuss the successes and limitations of your models

# Present the Results

Present your conclusions and results. If you have more than one interesting model feel free to include more than one along with a discussion. Use your work in this notebook to prepare your write-up.