In [1]:
# Import Dependencies
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Load the AQI dataset; the path is relative to the kernel's working directory
csv_path = "full_df.csv"
full_df = pd.read_csv(csv_path)

FileNotFoundError: [Errno 2] No such file or directory: 'full_df.csv'

In [None]:
# Preview the first ten rows of the raw frame
full_df.iloc[:10]

In [None]:
# Show the dtype pandas assigned to each column of full_df
full_df.dtypes

In [None]:
# List the column labels of full_df
full_df.columns

In [None]:
# Row count of the raw frame (expected: 5680 rows)
full_df.shape[0]

In [None]:
# Mean "Median AQI" per (Year, CBSA) pair, ranked from highest to lowest.
# Each entry is a city-year combination, so one city can appear more than once.
Worst_AQI = (
    full_df
    .groupby(["Year", "CBSA"])["Median AQI"]
    .mean()
    .sort_values(ascending=False)
)
Worst_AQI.iloc[:5]

In [None]:
# Mean "Median AQI" per (Year, CBSA) pair, ranked from lowest to highest.
# Each entry is a city-year combination, so one city can appear more than once.
Best_AQI = (
    full_df
    .groupby(["Year", "CBSA"])["Median AQI"]
    .mean()
    .sort_values(ascending=True)
)
Best_AQI.iloc[:5]

In [None]:
# Turn the ranked Series into a one-column DataFrame (still indexed by Year, CBSA)
combined_AQI1 = Worst_AQI.to_frame()
combined_AQI1.iloc[:10]

In [None]:
# Move the (Year, CBSA) index levels back into ordinary columns
combined_AQI = combined_AQI1.reset_index(drop=False)
combined_AQI.head(5)

In [None]:
# Number of rows per city, smallest first, to spot cities with very few records
(
    combined_AQI
    .groupby("CBSA")
    .size()
    .sort_values(ascending=True)
    .head(20)
)


In [None]:
# Remove every row that belongs to a city flagged as an outlier (too few records
# in the per-city row counts). The city names live in one list so the set is easy
# to audit or extend, and a single boolean mask replaces ten separate in-place
# drops. Re-binding the name (instead of inplace=True) keeps the cell idempotent.
OUTLIER_CITIES = [
    "Laurel, MS",
    "Russellville, AR",
    "Elmira, NY",
    "Breckenridge, CO",
    "Shawnee, OK",
    "Helena-West Helena, AR",
    "Kingston, NY",
    "Vermillion, SD",
    "Cleveland, TN",
    "Ada, OK",
]
outlier_rows = combined_AQI.index[combined_AQI["CBSA"].isin(OUTLIER_CITIES)]
combined_AQI = combined_AQI.drop(index=outlier_rows)

In [None]:
# Keep the city labels aside so they can be attached to the results later,
# after the CBSA column has been one-hot encoded
combined_AQI_CBSA = combined_AQI.loc[:, "CBSA"]

In [None]:
# One-hot encode the city column: CBSA is replaced by one "city_<name>" column per city
combined_AQI = pd.get_dummies(
    data=combined_AQI,
    prefix="city",
    columns=["CBSA"],
)

In [None]:
# Summary statistics of the encoded frame, used to confirm the cities were broken
# out into their own columns.
# NOTE(review): describe() only summarizes numeric columns by default, so if the
# dummy columns are boolean in this pandas version they will not show up here —
# confirm, or rely on the .info() cell instead.
combined_AQI.describe()

In [None]:
# Print column names, non-null counts and dtypes of the encoded frame
combined_AQI.info()

In [None]:
# Separate the regression target (Median AQI) from the feature columns
y = combined_AQI['Median AQI']
X = combined_AQI.drop(columns=["Median AQI"])

In [None]:
# List the feature column labels that will be fed to the model
X.columns

In [None]:
# Count how often each distinct Median AQI value occurs in the target.
# NOTE(review): the target is used for regression below, so this is a look at the
# value distribution rather than a class-balance check.
y.value_counts()

In [None]:
# Split features/target into train and test sets. No test_size is passed, so
# scikit-learn's default split is used (an earlier variant used test_size=0.3);
# random_state pins the shuffle so the split is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)
X_train.shape

In [None]:
# Fit an ordinary least-squares model on the training split only
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

In [None]:
# List the feature column labels again, for reference when reading the coefficients
X.columns

In [None]:
# Show the first feature row with its column labels.
# NOTE(review): the intent appears to be matching these labels, in order, against
# the coefficient array printed in the next cell — confirm.
X.iloc[0]

In [None]:
# Print the fitted coefficient array of the linear model
print('Coefficients: \n', lin_reg.coef_)

In [None]:
# Predict the target for the held-out test features
predictions = lin_reg.predict(X_test)

In [None]:
# Show the shape of the held-out target, i.e. how many test samples there are
y_test.shape

In [None]:
# Scatter the held-out target values against the model's predictions.
# Uses an explicit figure/axes pair instead of the pyplot state machine, adds a
# title so the figure stands alone, and ends with ';' to suppress the Text repr.
fig, ax = plt.subplots()
ax.scatter(y_test, predictions)
ax.set_xlabel('Y_test')
ax.set_ylabel('Predicted Y')
ax.set_title('Actual vs. predicted Median AQI (test set)');

In [None]:
# Report error metrics on the test split; RMSE is derived from the MSE value
from sklearn import metrics

mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)

In [None]:
# Histogram of the residuals (actual minus predicted) on the test split
residuals = y_test - predictions
plt.hist(residuals, bins=50);

In [None]:
# Refit a fresh model on the complete dataset and predict every row of it.
# This replaces both the train-only model and the test-set predictions held
# under the same names.
lin_reg = LinearRegression().fit(X, y)
predictions = lin_reg.predict(X)
predictions

In [None]:
# Attach the model output and the saved city labels to the encoded frame.
# NOTE(review): `predictions` must hold exactly one value per row of combined_AQI
# for the first assignment to succeed, so the full-data prediction cell has to be
# the last one that assigned it. The CBSA labels are presumably aligned on the
# row index — confirm.
combined_AQI["Predicted AQI"]=predictions
combined_AQI["CBSA"]=combined_AQI_CBSA

In [None]:
# List the feature column labels of X
X.columns

In [None]:
# Keep only the columns needed for reporting, in display order
combined_AQI = combined_AQI.loc[:, ["Year", "CBSA", "Median AQI", "Predicted AQI"]]

In [None]:
# Show actual vs. predicted AQI for the rows labelled Bakersfield, CA
combined_AQI[combined_AQI["CBSA"].eq('Bakersfield, CA')]

In [None]:
# Sanity check: predict a single sample (the first row of X) with the refit model.
# X.iloc[[0]] keeps the row as a one-row DataFrame, so the column names the model
# was fitted with are preserved; wrapping a Series in a list would drop them and
# make scikit-learn warn about missing feature names.
predictions = lin_reg.predict(X.iloc[[0]])
predictions

In [None]:
# Take a feature row to use as the template for a what-if prediction.
# .copy() makes the row independent of X, so editing it in the next cell can
# neither write through to X nor raise SettingWithCopyWarning.
# NOTE(review): the name suggests an Arizona city, but this is simply the first
# row of X, and every commented-out placeholder below also points at row 0 —
# confirm the intended rows before enabling them.
input_AZ = X.iloc[0].copy()
#input_Riverside = X.iloc[0]
# input_Bakersfield = X.iloc[0]
# input_LA = X.iloc[0]
# input_Porterville = X.iloc[0]
# input_VA = X.iloc[0]
# input_MI = X.iloc[0]
# input_NM = X.iloc[0]
# input_ND = X.iloc[0]
# input_ID = X.iloc[0]

In [None]:
# Overwrite the Year entry of the selected row so the model is asked about 2023;
# the other entries of the row are left as they were.
# NOTE(review): depending on the pandas version, input_AZ may be a view or a copy
# of X's first row — confirm that X itself is not modified by this assignment.
input_AZ['Year']=2023
#input_Riverside['Year']=2023
# input_Bakersfield['Year']=2023
# input_LA['Year']=2023
# input_Porterville['Year']=2023
# input_VA['Year']=2023
# input_MI['Year']=2023
# input_NM['Year']=2023
# input_ND['Year']=2023
# input_ID['Year']=2023

In [None]:
# Predict the AQI for the edited row (Year set to 2023).
# The Series is cast to float and turned into a one-row DataFrame so the model
# receives a 2-D input that still carries the feature names it was fitted with;
# a bare [Series] loses those names and triggers scikit-learn's feature-name warning.
# Remaining candidates to run through the same steps: input_Bakersfield, input_LA,
# input_Porterville, input_VA, input_MI, input_NM, input_ND, input_ID
predictions = lin_reg.predict(input_AZ.astype(float).to_frame().T)
predictions

In [None]:
#show results
# plt.scatter(y_test, predictions)
# plt.xlabel('Y_test')
# plt.ylabel('Predicted Y')

In [None]:
#X_axis = [input_AZ= ,input_Riverside=83.92821184, ]

In [None]:
#review the coefficients
# coeffecients = pd.DataFrame(lin_reg.coef_,X)
# coeffecients.columns = ['Coeffecient']
# coeffecients