# Where Wild Harvested Produce is More Easily Found

### This notebook contains an analysis to predict which region, state and city within the U.S. that is most likely to sell wild harvested produce. 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

%matplotlib inline

import statsmodels.api as sm

In [2]:
#import the file and check the first and last rows

markets = pd.read_csv("../GA-DAT-LA-07-Project/Export-5.csv")
markets.head()


Unnamed: 0,FMID,MarketName,Website,Facebook,Twitter,Youtube,OtherMedia,street,city,County,...,Coffee,Beans,Fruits,Grains,Juices,Mushrooms,PetFood,Tofu,WildHarvested,updateTime
0,1000618,100-Mile Market,http://www.peoplesfoodco-op.org/,,,,,507 Harrison Street,Kalamazoo,Kalamazoo,...,Y,N,Y,N,N,Y,N,N,Y,4/24/2014 18:22
1,1009364,106 S. Main Street Farmers Market,http://thetownofsixmile.wordpress.com/,,,,,106 S. Main Street,Six Mile,,...,,,,,,,,,,2013
2,1010691,10th Steet Community Farmers Market,,,,,http://agrimissouri.com/mo-grown/grodetail.php...,10th Street and Poplar,Lamar,Barton,...,N,N,Y,N,N,N,N,N,N,10/28/2014 9:49
3,1002454,112st Madison Avenue,,,,,,112th Madison Avenue,New York,New York,...,N,N,N,N,N,N,N,N,N,3/1/2012 10:38
4,1011100,12 South Farmers Market,http://www.12southfarmersmarket.com,12_South_Farmers_Market,@12southfrmsmkt,,@12southfrmsmkt,3000 Granny White Pike,Nashville,Davidson,...,Y,N,Y,N,Y,Y,Y,N,N,5/1/2015 10:40


In [3]:
# Confirm number of records loaded into python notebook.  
# Note data is from 7/31/2015 download.  http://search.ams.usda.gov/farmersmarkets/

markets.describe()

Unnamed: 0,FMID,x,y
count,8476.0,8446.0,8446.0
mean,1008934.303563,-91.128384,39.248774
std,57157.734676,17.426453,5.082776
min,1000001.0,-166.54,17.7099
25%,1002781.75,-97.330475,36.410976
50%,1005575.0,-86.468321,40.062825
75%,1008562.25,-77.671578,42.44967
max,2000036.0,-64.7043,64.86275


In [4]:
# What data do I have to analyze?

markets.columns

Index(['FMID', 'MarketName', 'Website', 'Facebook', 'Twitter', 'Youtube',
       'OtherMedia', 'street', 'city', 'County', 'State', 'zip', 'region',
       'Season1Date', 'Season1Time', 'Season2Date', 'Season2Time',
       'Season3Date', 'Season3Time', 'Season4Date', 'Season4Time', 'x', 'y',
       'Location', 'Credit', 'WIC', 'WICcash', 'SFMNP', 'SNAP', 'Organic',
       'Bakedgoods', 'Cheese', 'Crafts', 'Flowers', 'Eggs', 'Seafood', 'Herbs',
       'Vegetables', 'Honey', 'Jams', 'Maple', 'Meat', 'Nursery', 'Nuts',
       'Plants', 'Poultry', 'Prepared', 'Soap', 'Trees', 'Wine', 'Coffee',
       'Beans', 'Fruits', 'Grains', 'Juices', 'Mushrooms', 'PetFood', 'Tofu',
       'WildHarvested', 'updateTime'],
      dtype='object')

In [5]:
# Replace all nan's with blank text

markets = markets.replace(np.nan,' ',regex=True) 
markets.head()

Unnamed: 0,FMID,MarketName,Website,Facebook,Twitter,Youtube,OtherMedia,street,city,County,...,Coffee,Beans,Fruits,Grains,Juices,Mushrooms,PetFood,Tofu,WildHarvested,updateTime
0,1000618,100-Mile Market,http://www.peoplesfoodco-op.org/,,,,,507 Harrison Street,Kalamazoo,Kalamazoo,...,Y,N,Y,N,N,Y,N,N,Y,4/24/2014 18:22
1,1009364,106 S. Main Street Farmers Market,http://thetownofsixmile.wordpress.com/,,,,,106 S. Main Street,Six Mile,,...,,,,,,,,,,2013
2,1010691,10th Steet Community Farmers Market,,,,,http://agrimissouri.com/mo-grown/grodetail.php...,10th Street and Poplar,Lamar,Barton,...,N,N,Y,N,N,N,N,N,N,10/28/2014 9:49
3,1002454,112st Madison Avenue,,,,,,112th Madison Avenue,New York,New York,...,N,N,N,N,N,N,N,N,N,3/1/2012 10:38
4,1011100,12 South Farmers Market,http://www.12southfarmersmarket.com,12_South_Farmers_Market,@12southfrmsmkt,,@12southfrmsmkt,3000 Granny White Pike,Nashville,Davidson,...,Y,N,Y,N,Y,Y,Y,N,N,5/1/2015 10:40


In [6]:
# Create a full address column.  May be use as "string of text" and then can apply 
# the text analysis learned in Class 15 - 17

markets['address'] = markets['street'] + markets['city'] + markets['State'] + markets['zip']
markets.columns

Index(['FMID', 'MarketName', 'Website', 'Facebook', 'Twitter', 'Youtube',
       'OtherMedia', 'street', 'city', 'County', 'State', 'zip', 'region',
       'Season1Date', 'Season1Time', 'Season2Date', 'Season2Time',
       'Season3Date', 'Season3Time', 'Season4Date', 'Season4Time', 'x', 'y',
       'Location', 'Credit', 'WIC', 'WICcash', 'SFMNP', 'SNAP', 'Organic',
       'Bakedgoods', 'Cheese', 'Crafts', 'Flowers', 'Eggs', 'Seafood', 'Herbs',
       'Vegetables', 'Honey', 'Jams', 'Maple', 'Meat', 'Nursery', 'Nuts',
       'Plants', 'Poultry', 'Prepared', 'Soap', 'Trees', 'Wine', 'Coffee',
       'Beans', 'Fruits', 'Grains', 'Juices', 'Mushrooms', 'PetFood', 'Tofu',
       'WildHarvested', 'updateTime', 'address'],
      dtype='object')

In [7]:
# Replace blank with 0, N with 0, and Y with 1 for all product columns.

markets['cleanOrganic'] = markets['Organic'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanBakedgoods'] = markets['Bakedgoods'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanCheese'] = markets['Cheese'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanCrafts'] = markets['Crafts'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanFlowers'] = markets['Flowers'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanEggs'] = markets['Eggs'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanSeafood'] = markets['Seafood'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanHerbs'] = markets['Herbs'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanVegetables'] = markets['Vegetables'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanHoney'] = markets['Honey'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanJams'] = markets['Jams'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanMaple'] = markets['Maple'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanMeat'] = markets['Meat'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanNursery'] = markets['Nursery'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanPlants'] = markets['Plants'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanPoultry'] = markets['Poultry'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanPrepared'] = markets['Prepared'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanSoap'] = markets['Soap'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanTrees'] = markets['Trees'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanWine'] = markets['Wine'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanCoffee'] = markets['Coffee'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanBeans'] = markets['Beans'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanFruits'] = markets['Fruits'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanGrains'] = markets['Grains'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanJuices'] = markets['Juices'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanMushrooms'] = markets['Mushrooms'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanPetFood'] = markets['PetFood'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanTofu'] = markets['Tofu'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)
markets['cleanWildHarvested'] = markets['WildHarvested'].replace({'N':0},regex=True).replace({'Y':1},regex=True).replace({' ':0},regex=True)


markets.head()


Unnamed: 0,FMID,MarketName,Website,Facebook,Twitter,Youtube,OtherMedia,street,city,County,...,cleanWine,cleanCoffee,cleanBeans,cleanFruits,cleanGrains,cleanJuices,cleanMushrooms,cleanPetFood,cleanTofu,cleanWildHarvested
0,1000618,100-Mile Market,http://www.peoplesfoodco-op.org/,,,,,507 Harrison Street,Kalamazoo,Kalamazoo,...,0,1,0,1,0,0,1,0,0,1
1,1009364,106 S. Main Street Farmers Market,http://thetownofsixmile.wordpress.com/,,,,,106 S. Main Street,Six Mile,,...,0,0,0,0,0,0,0,0,0,0
2,1010691,10th Steet Community Farmers Market,,,,,http://agrimissouri.com/mo-grown/grodetail.php...,10th Street and Poplar,Lamar,Barton,...,0,0,0,1,0,0,0,0,0,0
3,1002454,112st Madison Avenue,,,,,,112th Madison Avenue,New York,New York,...,0,0,0,0,0,0,0,0,0,0
4,1011100,12 South Farmers Market,http://www.12southfarmersmarket.com,12_South_Farmers_Market,@12southfrmsmkt,,@12southfrmsmkt,3000 Granny White Pike,Nashville,Davidson,...,0,1,0,1,0,1,1,1,0,0


In [8]:
#drop old product columns
markets2 = markets.drop('Organic',1).drop('Bakedgoods',1).drop('Cheese',1).drop('Crafts',1).drop('Flowers',1).drop('Eggs',1).drop('Seafood',1).drop('Herbs',1).drop('Vegetables',1).drop('Honey',1).drop('Jams',1).drop('Maple',1).drop('Meat',1).drop('Nursery',1).drop('Nuts',1).drop('Plants',1).drop('Poultry',1).drop('Prepared',1).drop('Soap',1).drop('Trees',1).drop('Wine',1).drop('Coffee',1).drop('Beans',1).drop('Fruits',1).drop('Grains',1).drop('Juices',1).drop('Mushrooms',1).drop('PetFood',1).drop('Tofu',1).drop('WildHarvested',1)

#print columns for new temp table
markets2.columns

Index(['FMID', 'MarketName', 'Website', 'Facebook', 'Twitter', 'Youtube',
       'OtherMedia', 'street', 'city', 'County', 'State', 'zip', 'region',
       'Season1Date', 'Season1Time', 'Season2Date', 'Season2Time',
       'Season3Date', 'Season3Time', 'Season4Date', 'Season4Time', 'x', 'y',
       'Location', 'Credit', 'WIC', 'WICcash', 'SFMNP', 'SNAP', 'updateTime',
       'address', 'cleanOrganic', 'cleanBakedgoods', 'cleanCheese',
       'cleanCrafts', 'cleanFlowers', 'cleanEggs', 'cleanSeafood',
       'cleanHerbs', 'cleanVegetables', 'cleanHoney', 'cleanJams',
       'cleanMaple', 'cleanMeat', 'cleanNursery', 'cleanPlants',
       'cleanPoultry', 'cleanPrepared', 'cleanSoap', 'cleanTrees', 'cleanWine',
       'cleanCoffee', 'cleanBeans', 'cleanFruits', 'cleanGrains',
       'cleanJuices', 'cleanMushrooms', 'cleanPetFood', 'cleanTofu',
       'cleanWildHarvested'],
      dtype='object')

### Region Where Wild Harvested Produce is Most Easily Found

In [9]:
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split

# Create the train and test sets
X = markets2[['cleanWildHarvested']].values
y = markets2.region.ravel()

# create 80%-20% train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [10]:
from sklearn.neighbors import KNeighborsClassifier

print("Shape of Training Sets")
print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print()

neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(X, y) 
train_score = neigh.score(X_train,y_train)
cv_score = neigh.score(X_test,y_test)

print("Most Likely Region without Wild Harvested Produce: ", neigh.predict([[0]]))
print("Most Likely Region with Wild Harvested Produce: ", neigh.predict([[1]]))
print()
print("Predicted Probabilities: ",neigh.predict_proba([[3000]]))  
print()
print("Train Score: ",train_score)  
print("CV Score: ",cv_score)  

Shape of Training Sets
X_train:  (6780, 1)
y_train:  (6780,)

Most Likely Region without Wild Harvested Produce:  ['Northeast']
Most Likely Region with Wild Harvested Produce:  ['Midnorth']

Predicted Probabilities:  [[ 0.  1.  0.  0.  0.  0.  0.  0.]]

Train Score:  0.23790560472
CV Score:  0.231721698113


### State Where Wild Harvested Produce is Most Easily Found

In [11]:
# Create the train and test sets
X = markets2[['cleanWildHarvested']].values
y = markets2.State.ravel()

# create 80%-20% train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print("Shape of Training Sets")
print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print()

neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(X, y) 
train_score = neigh.score(X_train,y_train)
cv_score = neigh.score(X_test,y_test)

print("Most Likely State without Wild Harvested Produce: ", neigh.predict([[-1]]))
print("Most Likely State with Wild Harvested Produce: ", neigh.predict([[3000]]))
print()
print("Predicted Probabilities: ",neigh.predict_proba([[3000]]))  
print()
print("Train Score: ",train_score)  
print("CV Score: ",cv_score) 

Shape of Training Sets
X_train:  (6780, 1)
y_train:  (6780,)

Most Likely State without Wild Harvested Produce:  ['Pennsylvania']
Most Likely State with Wild Harvested Produce:  ['Iowa']

Predicted Probabilities:  [[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]

Train Score:  0.0370206489676
CV Score:  0.0306603773585


### City Where Wild Harvested Produce is Most Easily Found

In [12]:
# Create the train and test sets
X = markets2[['cleanWildHarvested']].values
y = markets2.city.ravel()

# create 80%-20% train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print("Shape of Training Sets")
print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print()

neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(X, y) 
train_score = neigh.score(X_train,y_train)
cv_score = neigh.score(X_test,y_test)

print("Most Likely City without Wild Harvested Produce: ", neigh.predict([[-1]]))
print("Most Likely City with Wild Harvested Produce: ", neigh.predict([[2000]]))
print()
print("Predicted Probabilities: ",neigh.predict_proba([[900]]))  
print()
print("Train Score: ",train_score)  
print("CV Score: ",cv_score) 

Shape of Training Sets
X_train:  (6780, 1)
y_train:  (6780,)

Most Likely City without Wild Harvested Produce:  ['Mount Bethe']
Most Likely City with Wild Harvested Produce:  ['Mt. Pleasant']

Predicted Probabilities:  [[ 0.  0.  0. ...,  0.  0.  0.]]

Train Score:  0.000147492625369
CV Score:  0.000589622641509
