In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Load the cleaned, merged per-state CSV and keep only the rows that have a
# value in every column (rows with any missing entry are discarded).
mdf = pd.read_csv('../../data/CLEANED_merged_state_pop_msfhs.csv').dropna()
# Preview the first rows as the cell's output
mdf.head()

In [None]:
# Gather the 'State' column, in row order, as a plain Python list
state_list = mdf.loc[:, 'State'].tolist()
# Show the states and then how many there are, one item per line
print(state_list, len(state_list), sep="\n")

In [None]:
# Ordinary least-squares estimator; this single instance is re-fitted for
# every state by the forecasting cell.
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)  # True is the library default, spelled out

In [None]:

# Per-state forecast of median home value for 2020-2023.
# For every state, fit `model` on (year index, percent population change) ->
# median home value, then predict four future years while holding the
# population change at that state's historical average.

# Calendar years that have data columns. 2008 is not among the columns used
# here, so the year index is derived from the real calendar year: a plain
# 1..23 counter would place 2009 directly after 2007 and shift every later
# observation (and all forecast years) one step too early on the time axis.
DATA_YEARS = [year for year in range(1996, 2020) if year != 2008]
FORECAST_YEARS = [2020, 2021, 2022, 2023]
BASE_YEAR = DATA_YEARS[0] - 1  # so that 1996 -> Index 1


def pop_change_column(year):
    """Return the percent-population-change column name for a calendar year.

    Columns before 2000 carry a two-digit suffix (e.g. ``PercentPopChg_96``);
    columns from 2000 on carry the full year (e.g. ``PercentPopChg_2000``).
    """
    suffix = f"{year % 100:02d}" if year < 2000 else str(year)
    return f"PercentPopChg_{suffix}"


# Column names and regressor values all come from the same year lists, so the
# three sequences cannot drift out of step with each other.
home_value_cols = [str(year) for year in DATA_YEARS]
pop_change_cols = [pop_change_column(year) for year in DATA_YEARS]
index_list = [year - BASE_YEAR for year in DATA_YEARS]
prediction_year = [year - BASE_YEAR for year in FORECAST_YEARS]

# One [state, 2020, 2021, 2022, 2023, r2] record per state
list_prediction_lists = []

for state in state_list:
    # Rows for this state; only the first matching row is used below
    st_df = mdf.loc[mdf['State'] == state]
    # Percent population change per data year, and its mean over those years
    pop_change_clean = st_df[pop_change_cols].iloc[0].tolist()
    avg_pop_change = np.mean(pop_change_clean)
    # Median home value per data year
    mh_list = st_df[home_value_cols].iloc[0].tolist()

    # Training table for this state: one row per data year
    model_df = pd.DataFrame({"Index": index_list,
                             "Median_home": mh_list,
                             "Pop_Change": pop_change_clean})

    # Fit once per state: the fit does not depend on which year is being
    # forecast, so it does not belong inside a per-year loop.
    x = model_df[["Index", "Pop_Change"]]
    y = model_df["Median_home"].values.reshape(-1, 1)
    model.fit(x, y)
    # R^2 measured on the same rows the model was fitted on (in-sample)
    r_squared = round(model.score(x, y), 3)

    # Predict all forecast years in one call. A DataFrame with the same column
    # names as the training data keeps the feature order explicit and avoids
    # scikit-learn's "X does not have valid feature names" warning.
    future_x = pd.DataFrame({"Index": prediction_year,
                             "Pop_Change": avg_pop_change})
    predictions = model.predict(future_x)  # shape (4, 1) because y is 2-D

    # Assemble the record: state name, rounded predictions, then R^2
    prediction_list = [state]
    prediction_list.extend(int(round(value)) for value in predictions.ravel())
    prediction_list.append(r_squared)
    list_prediction_lists.append(prediction_list)

# Turn the list of per-state records into a DataFrame, one row per record
## Life saver ==> https://stackoverflow.com/questions/42202872/how-to-convert-list-to-row-dataframe-with-pandas
predictions_df = pd.DataFrame(
    data=list_prediction_lists,
    columns=['State'] + [f"{year}_prediction" for year in FORECAST_YEARS] + ['R_Squared'],
)

predictions_df.head(50)


In [None]:
# Persist the forecast table as CSV; the DataFrame index is written as the
# first column (index=True is the pandas default, spelled out here).
predictions_df.to_csv(
    path_or_buf="../../data/MLR_HousingValues_PopChange_output.csv",
    index=True,
)

In [None]:
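# Testing code (disabled scratch work: it uses `X` and `month`, which none of the
# cells shown here define, so it will not run as-is if uncommented)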

# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# # from sklearn.linear_model import LinearRegression

# x = model_df[["Index", "Pop_Change"]]
# y = model_df["Median_home"].values.reshape(-1, 1)
# print(X.shape, y.shape)

# # model = LinearRegression()
# # model.fit(x, y)
# # score = model.score(x, y)
# # predictions = model.predict([[month]])

# # print(f"Score: {score}")

# Plot the Residuals for the Training and Testing data


# plt.scatter(model.predict(X_train), model.predict(X_train) - y_train, c="blue", label="Training Data")
# plt.scatter(model.predict(X_test), model.predict(X_test) - y_test, c="orange", label="Testing Data")
# plt.legend()
# plt.hlines(y=0, xmin=y.min(), xmax=y.max())
# plt.title("Residual Plot")
