In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
from config import db_password

In [3]:
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

# Run sqlalchemy to import data and process for machine learning

In [4]:
db_string = f"postgresql://postgres:{db_password}@database-3.csjzcnuh5mqe.us-east-2.rds.amazonaws.com:5432/Final Project"

In [5]:
engine = create_engine(db_string)
conn = engine.connect()

In [6]:
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(engine, reflect=True)

In [7]:
# Create session (link) from Python to the DB
session = Session(engine)

In [8]:
#import from SQL
final_sql = 'SELECT * from Final'

In [9]:
final_sql_df= pd.read_sql(final_sql, conn)

In [10]:
final_sql_df.head(10)

Unnamed: 0,city,population,violentcrime,murder,rape,robbery,aggassault,propertycrime,burglary,theft,...,mar,apr,may,jun,jul,aug,sep,oct,nov,dec
0,Hanford,57232,257,0,18,40,199,1242,131,900,...,208928.5,208596.0,209062.5,209226.0,209146.0,208906.5,209243.5,209646.0,210756.5,213125.5
1,Redlands,71941,257,1,40,70,146,2108,330,1534,...,428394.0,428854.5,430418.5,430988.5,431227.0,433337.5,435738.0,438494.0,439358.5,440008.5
2,Glendale,202601,231,5,16,93,117,3305,480,2562,...,849620.5,848568.5,849746.0,852006.0,848913.0,848468.0,850573.0,857070.0,860846.0,865203.0
3,San Bernardino,216715,2858,46,140,906,1766,9081,2029,4974,...,269785.0,271124.0,272944.0,273867.0,274452.0,275822.0,277041.0,278795.0,279903.0,281112.0
4,Grass Valley,12919,73,3,0,6,64,463,76,326,...,400544.5,405139.0,406816.5,405919.0,402539.5,400859.5,399217.0,398306.5,398893.5,401141.5
5,San Clemente,65018,87,1,3,29,54,918,160,700,...,1024563.5,1026832.0,1030890.0,1031889.0,1035573.5,1038347.5,1044498.0,1046669.5,1043087.5,1039332.0
6,San Luis Obispo,47735,192,0,44,34,114,1738,277,1387,...,736393.0,739308.0,743372.0,741471.0,734373.0,733805.0,735289.0,740057.0,739555.0,739474.0
7,Fremont,240887,400,1,36,159,204,4523,547,3408,...,1103707.5,1098956.5,1090527.0,1080254.0,1073585.5,1065582.5,1060474.0,1054337.0,1049268.0,1049423.0
8,Modesto,216542,1758,13,94,399,1252,7183,1149,4849,...,286789.0,289319.0,290254.0,291741.0,290716.5,291209.5,291389.0,293734.5,295003.5,296914.0
9,Santa Monica,91621,664,3,40,247,374,3964,577,3143,...,1410872.0,1404153.0,1407314.0,1412127.0,1389853.0,1376021.0,1370802.0,1392114.0,1400247.0,1408692.0


In [11]:
final_sql_df.columns

Index(['city', 'population', 'violentcrime', 'murder', 'rape', 'robbery',
       'aggassault', 'propertycrime', 'burglary', 'theft', 'vehicletheft',
       'arson', 'firecount', 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
       'aug', 'sep', 'oct', 'nov', 'dec'],
      dtype='object')

In [12]:
#find and create column for yearly median house price
yearly_median = final_sql_df.groupby('city')[['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
       'aug', 'sep', 'oct', 'nov', 'dec']].apply(np.median)

In [13]:
yearly_median

city
Adelanto        237652.5
Anaheim         595720.0
Anderson        242715.0
Antioch         538109.5
Apple Valley    265392.5
                  ...   
Winters         458396.0
Woodlake        190431.5
Woodland        387797.5
Yuba City       313604.0
Yucca Valley    223926.5
Length: 229, dtype: float64

In [14]:
yearly_median_df = pd.DataFrame(yearly_median)
yearly_median_df.columns = ['yearly median']

yearly_median_df = pd.DataFrame.reset_index(yearly_median_df)

yearly_median_df.head(10)

Unnamed: 0,city,yearly median
0,Adelanto,237652.5
1,Anaheim,595720.0
2,Anderson,242715.0
3,Antioch,538109.5
4,Apple Valley,265392.5
5,Arvin,180601.5
6,Atascadero,536914.0
7,Atwater,269412.5
8,Auburn,477907.5
9,Avenal,136938.0


In [15]:
# Merge yearly_median_df into df with the final_sql_df
final_analysis_df = pd.merge(final_sql_df, yearly_median_df, on='city')

final_analysis_df.head()

Unnamed: 0,city,population,violentcrime,murder,rape,robbery,aggassault,propertycrime,burglary,theft,...,apr,may,jun,jul,aug,sep,oct,nov,dec,yearly median
0,Hanford,57232,257,0,18,40,199,1242,131,900,...,208596.0,209062.5,209226.0,209146.0,208906.5,209243.5,209646.0,210756.5,213125.5,209104.25
1,Redlands,71941,257,1,40,70,146,2108,330,1534,...,428854.5,430418.5,430988.5,431227.0,433337.5,435738.0,438494.0,439358.5,440008.5,431107.75
2,Glendale,202601,231,5,16,93,117,3305,480,2562,...,848568.5,849746.0,852006.0,848913.0,848468.0,850573.0,857070.0,860846.0,865203.0,851289.5
3,San Bernardino,216715,2858,46,140,906,1766,9081,2029,4974,...,271124.0,272944.0,273867.0,274452.0,275822.0,277041.0,278795.0,279903.0,281112.0,274159.5
4,Grass Valley,12919,73,3,0,6,64,463,76,326,...,405139.0,406816.5,405919.0,402539.5,400859.5,399217.0,398306.5,398893.5,401141.5,400702.0


# Shortcut for personal use offline

In [None]:
# final_analysis_df.to_csv('final_analysis_df.csv')

In [None]:
# final_analysis_df= pd.read_csv('../Resources/final_analysis_df.csv')
# final_analysis_df.head()

# Machine learning setup

## Split the Data into Training and Testing

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDRegressor

In [17]:
final_analysis_df= final_analysis_df.drop(columns=['city'])
y = final_analysis_df['yearly median']
X = final_analysis_df.drop(columns=['yearly median'])

# scale the input using a pipeline.
reg = make_pipeline(StandardScaler(),SGDRegressor(max_iter=1000, tol=1e-3))
reg.fit(X, y)


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdregressor', SGDRegressor())])

### Test 1

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=1)
X_train.shape

(171, 24)

In [41]:
y_pred = reg.predict(X_test)

In [42]:
reg.score(X_test, y_test)

0.6944145451269192

In [43]:
reg.predict(X_test)

array([ 835996.63702865,  307700.46743549,  271206.30939876,
        408472.119039  ,  461099.14666645,  409640.03946745,
        729474.55875069,  381191.6567043 ,  723436.99160187,
        331232.06266549,  523410.14870439,  267458.69764862,
        421253.9320221 ,  415042.27719479, 1413699.6601205 ,
        335764.33589218,  585721.99400215, 1080371.1428578 ,
       1038943.19836534,  430591.87205025,  581706.70288642,
        332369.56535248,  705797.46325724,  495885.51909518,
        708021.31349166,  703951.94465921,  551455.36758892,
        769035.20740307,  381440.07546251,  655737.9071536 ,
        289875.03753522,  528211.83048332, -524607.560168  ,
       1127244.46447574,  346349.58095225,  643969.62712454,
        287962.08893781,  428209.98740492,  442582.59527799,
        440285.60476312,  732085.28786707,  294411.04895107,
        102709.08067325,  676184.84757276,  439654.68622893,
        938777.06263725,  389019.26793943,  574412.96265613,
        720666.42301692,

### Test 2

In [44]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X,y, random_state=2)
X_train2.shape

(171, 24)

In [45]:
y_pred2 = reg.predict(X_test2)

In [46]:
reg.score(X_test2, y_test2)

0.3917172560539638

In [47]:
reg.predict(X_test2)

array([ 191501.68649052,  495885.51909518,  628983.25775822,
        330834.26426006,  337069.60824764, 1019739.88810631,
        460455.57609143,  271206.30939876, -187363.27207927,
        635419.34400878,  331232.06266549,  484855.11091922,
        670304.05746085,  591744.14784041,  377182.47966922,
        487218.78967136,  561726.55984548,  498289.25617841,
        938777.06263725, -600483.81092385,  951283.55346   ,
        365671.12399881,  155795.75153848,  442582.59527799,
        555687.96119183,  294000.22743121,  -17473.90780695,
       1312385.49890921,  320073.70373057,  603429.42985416,
        339946.60272094,  418737.19445594,  377066.49668014,
        306601.6664653 ,  302931.80334909,  260260.56938924,
        565379.81877087,  708021.31349166,  773480.55051931,
        389019.26793943,  420378.97299652,  732085.28786707,
        102709.08067325,  381440.07546251,  310795.97372436,
       1095126.21565131,  605800.31030703,  794240.95359626,
        522649.67797087,

### Test 3

In [48]:
X_train3, X_test3, y_train3, y_test3= train_test_split(X,y, random_state=3)
X_train3.shape

(171, 24)

In [49]:
y_pred3 = reg.predict(X_test3)

In [50]:
reg.score(X_test3, y_test3)

0.5154955054952604

In [51]:
reg.predict(X_test3)

array([ 1080371.1428578 ,   290805.13974209,   332837.90885722,
         332050.09926081,  -314651.62030464,   834773.22870883,
         585721.99400215,   409640.03946745,   451905.51297199,
        1197098.66623286,   555687.96119183,   569251.94350719,
         471422.32279268,   551455.36758892,   546024.10787684,
         453471.14732763,  1038531.61782837,   531334.4797297 ,
         280488.02351216,   378327.18583338,   331986.04142445,
         260260.56938924,  1038943.19836534,   389925.36447208,
         445365.74864254,   386929.63570002,   571286.26640471,
         438768.55206993,   498289.25617841,   731723.43231529,
        2136990.20176681,   827852.44857743,   942269.02141765,
         523794.08693844,   487218.78967136,   753762.25090059,
         523410.14870439,   729474.55875069,   711307.3757382 ,
         599344.56380243,   332369.56535248,   377066.49668014,
         557116.40528707,   374634.61400631,   330834.26426006,
         559058.34439175,   670304.05746

### Test 4

In [53]:
X_train4, X_test4, y_train4, y_test4 = train_test_split(X,y,test_size=.7, train_size=.3, random_state=1)
X_train4.shape

(68, 24)

In [54]:
y_pred4 = reg.predict(X_test4)

In [55]:
reg.score(X_test4, y_test4)

-2.2358013588800905

In [56]:
reg.predict(X_test4)

array([  835996.63702865,   307700.46743549,   271206.30939876,
         408472.119039  ,   461099.14666645,   409640.03946745,
         729474.55875069,   381191.6567043 ,   723436.99160187,
         331232.06266549,   523410.14870439,   267458.69764862,
         421253.9320221 ,   415042.27719479,  1413699.6601205 ,
         335764.33589218,   585721.99400215,  1080371.1428578 ,
        1038943.19836534,   430591.87205025,   581706.70288642,
         332369.56535248,   705797.46325724,   495885.51909518,
         708021.31349166,   703951.94465921,   551455.36758892,
         769035.20740307,   381440.07546251,   655737.9071536 ,
         289875.03753522,   528211.83048332,  -524607.560168  ,
        1127244.46447574,   346349.58095225,   643969.62712454,
         287962.08893781,   428209.98740492,   442582.59527799,
         440285.60476312,   732085.28786707,   294411.04895107,
         102709.08067325,   676184.84757276,   439654.68622893,
         938777.06263725,   389019.26793