In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from config import db_password

In [2]:
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

In [3]:
# Connection target: the project's PostgreSQL database hosted on AWS RDS.
DB_USER = "postgres"
DB_HOST = "database-3.csjzcnuh5mqe.us-east-2.rds.amazonaws.com"
DB_PORT = 5432
DB_NAME = "Final Project"

# db_password is imported from the local config module and interpolated as-is.
# NOTE(review): a password containing '@', ':' or '/' would need URL-escaping
# before being placed in the URL — confirm the configured password has none.
db_string = f"postgresql://{DB_USER}:{db_password}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

In [4]:
# Create the SQLAlchemy engine from the connection URL and open one connection
# on it; `conn` is the handle pd.read_sql uses further down.
# NOTE(review): nothing in the visible cells closes `conn` or disposes the engine.
engine = create_engine(db_string)
conn = engine.connect()

In [5]:
# Reflect an existing database into a new automap model.
Base = automap_base()
# Reflect the tables through the engine.
# NOTE(review): `reflect=True` is the legacy spelling; newer SQLAlchemy releases
# document `Base.prepare(autoload_with=engine)` instead — confirm against the
# installed version. `Base` is not referenced again in the cells visible here.
Base.prepare(engine, reflect=True)

In [6]:
# Create session (link) from Python to the DB.
# NOTE(review): `session` is not used by any later cell visible here; the data
# load goes through `conn` instead.
session = Session(engine)

In [7]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [8]:
# Query that selects every column of the Final table.
final_sql = "SELECT * from Final"

In [9]:
# Run the query over the open connection and load the result into a DataFrame.
final_sql_df = pd.read_sql(sql=final_sql, con=conn)

In [10]:
# Preview the first 10 rows of the loaded table.
final_sql_df.head(10)

Unnamed: 0,city,population,violentcrime,murder,rape,robbery,aggassault,propertycrime,burglary,theft,...,mar,apr,may,jun,jul,aug,sep,oct,nov,dec
0,Hanford,57232,257,0,18,40,199,1242,131,900,...,208928.5,208596.0,209062.5,209226.0,209146.0,208906.5,209243.5,209646.0,210756.5,213125.5
1,Redlands,71941,257,1,40,70,146,2108,330,1534,...,428394.0,428854.5,430418.5,430988.5,431227.0,433337.5,435738.0,438494.0,439358.5,440008.5
2,Glendale,202601,231,5,16,93,117,3305,480,2562,...,849620.5,848568.5,849746.0,852006.0,848913.0,848468.0,850573.0,857070.0,860846.0,865203.0
3,San Bernardino,216715,2858,46,140,906,1766,9081,2029,4974,...,269785.0,271124.0,272944.0,273867.0,274452.0,275822.0,277041.0,278795.0,279903.0,281112.0
4,Grass Valley,12919,73,3,0,6,64,463,76,326,...,400544.5,405139.0,406816.5,405919.0,402539.5,400859.5,399217.0,398306.5,398893.5,401141.5
5,San Clemente,65018,87,1,3,29,54,918,160,700,...,1024563.5,1026832.0,1030890.0,1031889.0,1035573.5,1038347.5,1044498.0,1046669.5,1043087.5,1039332.0
6,San Luis Obispo,47735,192,0,44,34,114,1738,277,1387,...,736393.0,739308.0,743372.0,741471.0,734373.0,733805.0,735289.0,740057.0,739555.0,739474.0
7,Fremont,240887,400,1,36,159,204,4523,547,3408,...,1103707.5,1098956.5,1090527.0,1080254.0,1073585.5,1065582.5,1060474.0,1054337.0,1049268.0,1049423.0
8,Modesto,216542,1758,13,94,399,1252,7183,1149,4849,...,286789.0,289319.0,290254.0,291741.0,290716.5,291209.5,291389.0,293734.5,295003.5,296914.0
9,Santa Monica,91621,664,3,40,247,374,3964,577,3143,...,1410872.0,1404153.0,1407314.0,1412127.0,1389853.0,1376021.0,1370802.0,1392114.0,1400247.0,1408692.0


In [11]:
# List the column names available in the loaded table.
final_sql_df.columns

Index(['city', 'population', 'violentcrime', 'murder', 'rape', 'robbery',
       'aggassault', 'propertycrime', 'burglary', 'theft', 'vehicletheft',
       'arson', 'firecount', 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
       'aug', 'sep', 'oct', 'nov', 'dec'],
      dtype='object')

In [12]:
# Monthly house-price columns that feed the yearly figure.
month_columns = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
]

# Per city, take the median over all monthly price values in that city's rows.
yearly_median = (
    final_sql_df
    .groupby('city')[month_columns]
    .apply(np.median)
)

In [19]:
# Display the per-city yearly median (a Series indexed by city).
yearly_median

city
Adelanto        237652.5
Anaheim         595720.0
Anderson        242715.0
Antioch         538109.5
Apple Valley    265392.5
                  ...   
Winters         458396.0
Woodlake        190431.5
Woodland        387797.5
Yuba City       313604.0
Yucca Valley    223926.5
Length: 229, dtype: float64

In [15]:
# Turn the city-indexed Series into a two-column frame:
# 'city' (from the index) and 'yearly median' (the values).
yearly_median_df = yearly_median.to_frame(name='yearly median').reset_index()

yearly_median_df.head(10)

Unnamed: 0,city,yearly median
0,Adelanto,237652.5
1,Anaheim,595720.0
2,Anderson,242715.0
3,Antioch,538109.5
4,Apple Valley,265392.5
5,Arvin,180601.5
6,Atascadero,536914.0
7,Atwater,269412.5
8,Auburn,477907.5
9,Avenal,136938.0


In [16]:
# Attach each city's yearly median to its row(s) of the loaded table
# (default inner join on the shared 'city' column).
final_analysis_df = final_sql_df.merge(yearly_median_df, on='city')

final_analysis_df.head()

Unnamed: 0,city,population,violentcrime,murder,rape,robbery,aggassault,propertycrime,burglary,theft,...,apr,may,jun,jul,aug,sep,oct,nov,dec,yearly median
0,Hanford,57232,257,0,18,40,199,1242,131,900,...,208596.0,209062.5,209226.0,209146.0,208906.5,209243.5,209646.0,210756.5,213125.5,209104.25
1,Redlands,71941,257,1,40,70,146,2108,330,1534,...,428854.5,430418.5,430988.5,431227.0,433337.5,435738.0,438494.0,439358.5,440008.5,431107.75
2,Glendale,202601,231,5,16,93,117,3305,480,2562,...,848568.5,849746.0,852006.0,848913.0,848468.0,850573.0,857070.0,860846.0,865203.0,851289.5
3,San Bernardino,216715,2858,46,140,906,1766,9081,2029,4974,...,271124.0,272944.0,273867.0,274452.0,275822.0,277041.0,278795.0,279903.0,281112.0,274159.5
4,Grass Valley,12919,73,3,0,6,64,463,76,326,...,405139.0,406816.5,405919.0,402539.5,400859.5,399217.0,398306.5,398893.5,401141.5,400702.0


In [None]:
# Split the Data into Training and Testing

In [None]:
# Create our features: every column except the target, with non-numeric
# columns (e.g. 'city') one-hot encoded by get_dummies.
# The frame is `final_analysis_df` from the merge cell; the previous `df` was
# never defined in this notebook and raised NameError on a fresh kernel.
# NOTE(review): the monthly price columns (jan..dec) that the target is derived
# from are still part of X — confirm this leakage is intended.
X = pd.get_dummies(final_analysis_df.drop(columns='yearly median'))

# Create our target: the per-city yearly median house price.
y = final_analysis_df['yearly median']

In [None]:
# Summary statistics for the feature columns.
X.describe()

In [None]:
# Check the balance of our target values (number of rows per distinct value of y).
y.value_counts()

In [None]:
# Prepare and Perform Random Forest Model

In [None]:
from sklearn.model_selection import train_test_split

# Stratified splitting needs every distinct target value to occur at least
# twice. 'yearly median' is a continuous price whose values are typically
# unique, in which case train_test_split(..., stratify=y) raises ValueError.
# Stratify only when the target supports it; otherwise use a plain random split.
stratify_on = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=stratify_on
)
X_train.shape

In [None]:
# Fit a balanced random forest (100 trees, fixed seed) on the training split.
# NOTE(review): y_train comes from 'yearly median', a continuous float column;
# a classifier expects discrete class labels — confirm the target is binned
# into classes before this cell is run.
from imblearn.ensemble import BalancedRandomForestClassifier

random_forest = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
)
random_forest.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out split and report the balanced accuracy score.
from sklearn.metrics import balanced_accuracy_score

y_pred = random_forest.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix of true vs. predicted labels on the test split.
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Print imbalanced-learn's per-class classification report for the test split.
from imblearn.metrics import classification_report_imbalanced

report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
# Rank the features from most to least important according to the fitted
# forest, labelling each importance with its feature column name.
important_features = pd.Series(
    data=random_forest.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)
print(important_features)