# Data Analysis: See DataCollection.ipynb First

### Set up Imports and Import Data
Uses:
- rpy2: to run R from Python for the analysis and model fitting
- pandas: to handle DataFrame operations
- datetime: to get the current year and perform date calculations

In [None]:
import datetime
import os

import pandas as pd
import rpy2
%load_ext rpy2.ipython

# Things still to add (TODO)
## Check p-values and group land size by an additional unit (10, 100?)
## Create a predicted column based on the given fields, calculate deltas, and visualize data/errors
## See what % is fairly valued vs. not

### Grab the file and import Data

In [None]:
# Prompt for the CSV to analyse so the notebook can be pointed at a different
# year/purpose without editing the code.
file_name = input(
    'Enter name of CSV in current working directory (include .csv): '
)
df = pd.read_csv(file_name)

# Current calendar year; used further down to turn the 'year' column into "years old".
year = datetime.date.today().year
print(year)

In [None]:
# Split the parcels into two frames: lots with only land (improved_value == 0)
# and lots with a house (everything else). .copy() makes each frame independent
# of df, so the column assignments below modify df_houses itself rather than a
# slice of df (which is what triggers pandas' SettingWithCopyWarning).
df_land = df.loc[df['improved_value'] == 0.0].copy()
df_houses = df.loc[df['improved_value'] != 0.0].copy()

# Coerce the 'year' column to an integer; values that cannot be parsed become 0.
# NOTE(review): a year of 0 makes years_old equal to the current year, so those
# rows look like extreme outliers -- confirm they are filtered before modelling.
df_houses['year'] = pd.to_numeric(df_houses['year'], errors='coerce').fillna(0).astype(int)
df_houses['years_old'] = year - df_houses['year']

# Export both frames so the R cells below can read them. The first four
# characters of the input file name are used as the output file prefix.
os.makedirs('Data', exist_ok=True)  # to_csv does not create a missing directory
output_prefix = file_name[:4]
df_houses.to_csv(f'Data/{output_prefix}_Houses.csv')
df_land.to_csv(f'Data/{output_prefix}_Land.csv')

### Now using R to import files and look at the data

In [None]:
%%R

# imports for R
# Attach the R packages used by the R cells that follow.
library(ggplot2)
library(tidyverse)
library(moderndive)

# A large scipen value biases R towards fixed notation instead of scientific
# notation when printing numbers.
options(scipen = 999) # sets the output formatting for numbers



In [None]:
%%R -i file_name
# read in files
# file_name is passed in from Python so the CSVs are read back with the same
# four-character prefix they were exported under, instead of a hard-coded year.
file_prefix = substr(file_name, 1, 4)
houses = read.csv(paste0('Data/', file_prefix, '_Houses.csv'))
land = read.csv(paste0('Data/', file_prefix, '_Land.csv'))

# keep only houses whose year is after 1984 to drop outliers or mistyped data
houses_1984 = houses %>%
    filter(year > 1984)

In [None]:
%%R

# The county identifies houses by building class; encode the BH14 and BH12
# classes as 0/1 indicator columns.
houses_1984 <- mutate(
    houses_1984,
    isBH14 = as.numeric(bldg_class == 'BH14'),
    isBH12 = as.numeric(bldg_class == 'BH12')
)

head(houses_1984)


In [None]:
%%R
# take a peek at the first rows of the full houses table
head(houses, n = 6)

### Create Models

In [None]:
%%R

# handle outlier data
# Age cut-off in years derived from the 1984 year boundary.
# NOTE: the cut-off is not applied by the fits below; restrict the data with
# subset(houses, years_old < threshold_year) to exclude older rows from lm_houses.
threshold_year = 2023 - 1984

# start running regression on these houses to see if any trends emerge
# Baseline model on every house: appraised ~ land_size + imp_sqft + years_old.
lm_houses = lm(appraised ~ land_size + imp_sqft + years_old, data = houses)

# Same predictors plus bldg_class, fit on the year > 1984 subset. Later cells call
# coef() and get_regression_table() on this object, so it has to be defined here.
lm_houses_1984 = lm(appraised ~ land_size + imp_sqft + years_old + bldg_class, data = houses_1984)

In [None]:
%%R

# look at the coefs of my models
# Only the value of the last expression in a %%R cell is displayed automatically,
# so each model's coefficients are printed explicitly.
print(coef(lm_houses))
print(coef(lm_houses_1984))

In [None]:
%%R
# Attach the model's prediction to every row, plus the gap between that
# prediction and the recorded appraisal (the per-entry error).
houses <- mutate(
    houses,
    predicted_app_value = predict(lm_houses, newdata = houses),
    delta = predicted_app_value - appraised
)

# show the fitted coefficients
coef(lm_houses)
    

In [None]:
%%R

# Inspect the significance of each coefficient to judge whether it should stay
# in the model.
options(scipen = 999)
lm_houses_1984 %>% get_regression_table()