## This is our workspace for statistical analysis!

Add cells, do exploratory analysis, go ham.

In [69]:
# feel free to add more

import sqlite3
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from statsmodels.tools import eval_measures

In [70]:
# Create connection to database
conn = sqlite3.connect('../data/db/avocado_project.db')
c = conn.cursor()

In [71]:
list_tables_cmd = "SELECT name FROM sqlite_master WHERE type='table';"
c.execute(list_tables_cmd)
conn.commit()

for row in c:
  print(row)

('demographic',)
('hab2016',)
('hab2017',)
('hab2018',)


In [72]:
# Load all tables into dataframe
query1 = "SELECT * FROM demographic"
demo_df = pd.read_sql(query1, conn)

query2 = "SELECT * FROM hab2016"
hab2016_df = pd.read_sql(query2, conn)
hab2016_df.rename(columns={'units_current_year': '2016_units_current_year',
                          'dollars_current_year': '2016_dollars_current_year',
                          'asp_current_year': '2016_asp_current_year'}, inplace=True)

query3 = "SELECT * FROM hab2017"
hab2017_df = pd.read_sql(query3, conn)
hab2017_df.rename(columns={'units_current_year': '2017_units_current_year',
                          'dollars_current_year': '2017_dollars_current_year',
                          'asp_current_year': '2017_asp_current_year'}, inplace=True)

query4 = "SELECT * FROM hab2018"
hab2018_df = pd.read_sql(query4, conn)
hab2018_df.rename(columns={'units_current_year': '2018_units_current_year',
                          'dollars_current_year': '2018_dollars_current_year',
                          'asp_current_year': '2018_asp_current_year'}, inplace=True)

In [73]:
mean_pop = []
median_pop = []
stddev_pop = []

for index, row in demo_df.iterrows():
    pop_list = [row['pop2016'], row['pop2017'], row['pop2018']]
    means = np.mean(pop_list)
    median = np.median(pop_list)
    std_dev = np.std(pop_list)
    mean_pop.append(means)
    median_pop.append(median)
    stddev_pop.append(std_dev)
    
demo_df["Mean Population"] = mean_pop
demo_df["Median Population"] = median_pop
demo_df["Standard Deviation Population"] = stddev_pop
demo_df

#we might want to remove the rows with age 999 as its just an aggregation of all the previous rows of that region


Unnamed: 0,region,age,sex,pop2016,pop2017,pop2018,Mean Population,Median Population,Standard Deviation Population
0,California,0,F,239546,235400,233156,2.360340e+05,235400.0,2646.946921
1,California,0,M,251535,246740,244164,2.474797e+05,246740.0,3054.312834
2,California,1,F,243956,239361,234983,2.394333e+05,239361.0,3663.568964
3,California,1,M,256635,250616,245824,2.510250e+05,250616.0,4423.037493
4,California,2,F,243382,243308,238803,2.418310e+05,243308.0,2141.332451
...,...,...,...,...,...,...,...,...,...
1387,West,84,M,43082,44018,44212,4.377067e+04,44018.0,493.359459
1388,West,85,F,379224,384587,389779,3.845300e+05,384587.0,4309.249200
1389,West,85,M,230056,236742,243372,2.367233e+05,236742.0,5436.250260
1390,West,999,F,17599328,17856296,18114533,1.785672e+07,17856296.0,210331.773163


In [88]:
region = []
current_units_mean = []
current_units_median = []
current_units_stddev = []

current_dollars_mean = []
current_dollars_median = []
current_dollars_stddev = []

current_asp_mean = []
current_asp_median = []
current_asp_stddev = []

for index, row in hab2016_df.iterrows(): 
    region.append(row['region'])
    for index1, row1 in hab2017_df.iterrows():
        for index2, row2 in hab2018_df.iterrows():
            current_units_list = [row['2016_units_current_year'], row1['2017_units_current_year'], row2['2018_units_current_year']]
            current_dollars_list = [row['2016_dollars_current_year'], row1['2017_dollars_current_year'], row2['2018_dollars_current_year']]
            current_asp_list = [row['2016_asp_current_year'], row1['2017_asp_current_year'], row2['2018_asp_current_year']]
            
    units_mean = np.mean(current_units_list)
    units_median = np.median(current_units_list)
    units_std_dev = np.std(current_units_list)
    current_units_mean.append(units_mean)
    current_units_median.append(units_median)
    current_units_stddev.append(units_std_dev)
            
    dollars_mean = np.mean(current_dollars_list)
    dollars_median = np.median(current_dollars_list)
    dollars_std_dev = np.std(current_dollars_list)
    current_dollars_mean.append(dollars_mean)
    current_dollars_median.append(dollars_median)
    current_dollars_stddev.append(dollars_std_dev)
            
    asp_mean = np.mean(current_asp_list)  
    asp_median = np.median(current_asp_list)
    asp_std_dev = np.std(current_asp_list)  
    current_asp_mean.append(asp_mean)
    current_asp_median.append(asp_median)
    current_asp_stddev.append(asp_std_dev)
            
combined_avo_data = {'region' : region,
                     'units_current_year_mean' : current_units_mean, 
                     'units_current_year_median' : current_units_median,
                    'units_current_year_std_dev' : current_units_stddev,
                    'dollars_current_year_mean' : current_dollars_mean, 
                     'dollars_current_year_median' : current_dollars_median,
                    'dollars_current_year_std_dev' : current_dollars_stddev,
                    'asp_current_year_mean' : current_asp_mean, 
                     'asp_current_year_median' : current_asp_median,
                    'asp_current_year_std_dev' : current_asp_stddev}

combined_avo_df = pd.DataFrame(combined_avo_data)
combined_avo_df

Unnamed: 0,region,units_current_year_mean,units_current_year_median,units_current_year_std_dev,dollars_current_year_mean,dollars_current_year_median,dollars_current_year_std_dev,asp_current_year_mean,asp_current_year_median,asp_current_year_std_dev
0,California,326786200.0,330751700.0,41338360.0,368458500.0,389151498.2,37291480.0,14.959181,14.193095,1.232576
1,Great Lakes,288201400.0,330751700.0,93474350.0,327098200.0,389151498.2,95618740.0,15.182477,14.862985,1.129972
2,Midsouth,284356600.0,330751700.0,98813420.0,321417300.0,389151498.2,103644600.0,15.074071,14.537764,1.170304
3,Northeast,304616400.0,330751700.0,70848310.0,351590300.0,389151498.2,61041250.0,15.462296,15.702439,1.120104
4,Plains,264707300.0,330751700.0,126230500.0,296382600.0,389151498.2,139024400.0,14.890975,13.988476,1.277909
5,South Central,328647400.0,330751700.0,38992080.0,346279200.0,389151498.2,68534320.0,13.852551,13.986238,2.379927
6,Southeast,293874900.0,330751700.0,85618560.0,328704000.0,389151498.2,93350410.0,14.85306,13.986238,1.305512
7,West,329417700.0,330751700.0,38032010.0,362754900.0,389151498.2,45309500.0,14.58299,13.986238,1.542295
