In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from config import db_password

In [2]:
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

# Run sqlalchemy to import data and process for machine learning

In [3]:
db_string = f"postgresql://postgres:{db_password}@database-3.csjzcnuh5mqe.us-east-2.rds.amazonaws.com:5432/Final Project"

In [None]:
engine = create_engine(db_string)
conn = engine.connect()

In [None]:
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(engine, reflect=True)

In [None]:
# Create session (link) from Python to the DB
session = Session(engine)

In [None]:
#import from SQL
final_sql = 'SELECT * from Final'

In [None]:
final_sql_df= pd.read_sql(final_sql, conn)

In [None]:
final_sql_df.head(10)

In [None]:
final_sql_df.columns

In [None]:
#find and create column for yearly median house price
yearly_median = final_sql_df.groupby('city')[['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
       'aug', 'sep', 'oct', 'nov', 'dec']].apply(np.median)

In [None]:
yearly_median

In [None]:
yearly_median_df = pd.DataFrame(yearly_median)
yearly_median_df.columns = ['yearly median']

yearly_median_df = pd.DataFrame.reset_index(yearly_median_df)

yearly_median_df.head(10)

In [None]:
# Merge yearly_median_df into df with the final_sql_df
final_analysis_df = pd.merge(final_sql_df, yearly_median_df, on='city')

final_analysis_df.head()

In [None]:
# final_analysis_df.to_csv('final_analysis_df.csv')

# Shortcut for personal use offline

In [8]:
final_analysis_df= pd.read_csv('../Resources/final_analysis_df.csv')
final_analysis_df.head()

Unnamed: 0.1,Unnamed: 0,city,population,violentcrime,murder,rape,robbery,aggassault,propertycrime,burglary,...,apr,may,jun,jul,aug,sep,oct,nov,dec,yearly median
0,0,Hanford,57232,257,0,18,40,199,1242,131,...,208596.0,209062.5,209226.0,209146.0,208906.5,209243.5,209646.0,210756.5,213125.5,209104.25
1,1,Redlands,71941,257,1,40,70,146,2108,330,...,428854.5,430418.5,430988.5,431227.0,433337.5,435738.0,438494.0,439358.5,440008.5,431107.75
2,2,Glendale,202601,231,5,16,93,117,3305,480,...,848568.5,849746.0,852006.0,848913.0,848468.0,850573.0,857070.0,860846.0,865203.0,851289.5
3,3,San Bernardino,216715,2858,46,140,906,1766,9081,2029,...,271124.0,272944.0,273867.0,274452.0,275822.0,277041.0,278795.0,279903.0,281112.0,274159.5
4,4,Grass Valley,12919,73,3,0,6,64,463,76,...,405139.0,406816.5,405919.0,402539.5,400859.5,399217.0,398306.5,398893.5,401141.5,400702.0


# Machine learning setup
## Split the Data into Training and Testing

In [None]:
# Create our features
X = pd.get_dummies(final_analysis_df.drop(columns=''))

# Create our target
y = y = final_analysis_df['']

In [None]:
X.describe()

In [None]:
# Check the balance of our target values
y.value_counts()

## Prepare and Perform Random Forest Model

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

In [None]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
random_forest = BalancedRandomForestClassifier(n_estimators =100, random_state=1)
random_forest.fit(X_train, y_train)

In [None]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = random_forest.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance
important_features = pd.Series(data=random_forest.feature_importances_,index=X.columns)
important_features.sort_values(ascending=False,inplace=True)
print(important_features)

## Easy Ensemble AdaBoost Classifier

In [None]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier 
ensemble_model = EasyEnsembleClassifier(n_estimators =100, random_state=1)
ensemble_model.fit(X_train, y_train)

In [None]:
# Calculated the balanced accuracy score
y_pred = ensemble_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))