# Solar Analysis

### Import, read, display:

In [26]:
# solar data
# Goal: fill in (impute) the 0 values of sq_ft_available_for_solar_panels and
# hours_of_sunlight_per_year, since both are important for judging whether solar is a viable option
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [27]:
# Load the three raw input tables from CSV files in the current working directory.
# Each read is a separate statement so that a failure or interrupt part-way
# through still leaves the already-loaded frames available in the kernel.
solar_data = pd.read_csv(filepath_or_buffer='solar_data.csv')
census_data = pd.read_csv(filepath_or_buffer='census_data.csv')
gas_data = pd.read_csv(filepath_or_buffer='gas_meter_data.csv')

KeyboardInterrupt: 

In [None]:
# Preview the first five rows (the default for head()) to sanity-check the load
solar_head = solar_data.head()
print('HEAD', solar_head)
# TODO: other exploratory checks

HEAD    house_id  device_id  gas_meter_id        lat       long  \
0  b1eb5eb2   83771758           NaN  42.191697 -72.613522   
1  b1eb5eb2   69525019           NaN  42.191697 -72.613522   
2  a90ea0b3    5827516    54363558.0  42.211167 -72.621543   
3  049bf03c   83770216    54459969.0  42.211157 -72.621317   
4  b2dc9fbe   56976532           NaN  42.210828 -72.621242   

   hours_of_sunlight_per_year  sq_ft_available_for_solar_panels  
0                        1404                              3207  
1                           0                                 0  
2                        1488                              1180  
3                        1504                              1163  
4                        1383                               846  


In [None]:
# DataFrame.info() writes its report directly to stdout and returns None, so
# wrapping it in print() appended a stray "INFO None" line after the report.
# Print the label first, then let info() emit the report itself.
print('INFO')
solar_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17052 entries, 0 to 17051
Data columns (total 7 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   house_id                          17052 non-null  object 
 1   device_id                         17052 non-null  int64  
 2   gas_meter_id                      10162 non-null  float64
 3   lat                               17052 non-null  float64
 4   long                              17052 non-null  float64
 5   hours_of_sunlight_per_year        17052 non-null  int64  
 6   sq_ft_available_for_solar_panels  17052 non-null  int64  
dtypes: float64(3), int64(3), object(1)
memory usage: 932.7+ KB
INFO None


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
solar_summary = solar_data.describe()
print('DESC', solar_summary)

DESC           device_id  gas_meter_id           lat          long  \
count  1.705200e+04  1.016200e+04  17052.000000  17052.000000   
mean   4.486838e+07  5.648602e+07     42.202733    -72.623131   
std    4.449656e+07  1.977618e+07      0.069791      0.545868   
min    5.819832e+06  2.029568e+07     40.771841    -73.501843   
25%    2.039324e+07  4.626168e+07     42.194981    -72.635908   
50%    3.105131e+07  4.976170e+07     42.203596    -72.624741   
75%    5.676430e+07  5.635645e+07     42.210083    -72.615631   
max    3.383795e+08  1.007938e+08     50.929574     -1.391057   

       hours_of_sunlight_per_year  sq_ft_available_for_solar_panels  
count                17052.000000                      17052.000000  
mean                   730.968156                        885.132301  
std                    685.065756                       3563.142914  
min                      0.000000                          0.000000  
25%                      0.000000                          

### KNN Imputation:

#### 1. Data processing:

In [None]:
# Work on a copy of the solar table without the two ID columns
# (device_id, gas_meter_id); solar_data itself is left unchanged.
solar_data_dv = solar_data.drop(columns=['device_id', 'gas_meter_id'])

# Zeros are treated as missing values here, so count the zeros in each column
print('missing:')
print((solar_data_dv == 0).sum(), '\n')

# Complementary count: entries in each column that are not zero
print('not missing:')
print((solar_data_dv != 0).sum())

missing:
house_id                               0
lat                                    0
long                                   0
hours_of_sunlight_per_year          7910
sq_ft_available_for_solar_panels    9124
dtype: int64 

not missing:
house_id                            17052
lat                                 17052
long                                17052
hours_of_sunlight_per_year           9142
sq_ft_available_for_solar_panels     7928
dtype: int64


In [None]:
# TODO: add new columns - aggregate time meter data for comparison, assuming house size/roof space is roughly correlated with energy usage


In [None]:
# Restrict KNN imputation to the houses we care about (those with a gas meter)
# to save time: collect the house_id of every row that has a gas_meter_id and
# at least one value treated as missing (0) in the two solar columns.
#
# Boolean masks replace the former row-by-row .loc loop: the result is the same
# list in the same row order, but it is computed in one vectorized pass and no
# longer assumes the frame has a default 0..n-1 index.
has_gas_meter = solar_data['gas_meter_id'].notna()
has_missing_value = (
    solar_data['hours_of_sunlight_per_year'].eq(0)
    | solar_data['sq_ft_available_for_solar_panels'].eq(0)
)

# One entry per matching row, so a house_id can appear more than once when
# several rows share it; duplicates are intentionally kept as before.
knn_gas_house_ids = solar_data.loc[has_gas_meter & has_missing_value, 'house_id'].tolist()

print(knn_gas_house_ids)

['ac2f7c77', '7407e44b', '8c46fac9', 'efd90c76', '4ca0c4ec', '6e369e62', 'cdaaab25', '314fdb11', '50ee1f40', 'd5916302', 'e48445a5', '50d1ca84', '3d870d2d', 'c296901c', '1bc7dca4', 'bfeec959', 'c6ba88aa', 'e06f63d8', '9cad1cfb', '4e4e3620', 'f2a9bc79', 'c4f30eeb', '3df911e8', 'db3a3c07', '136cad8d', '54df8e88', 'f2edc9bb', 'fd612a67', '6680db42', '8c56d780', 'e35ae366', 'caf31495', '571af134', 'e912d342', 'e0038fe5', '47a5aa18', '47a5aa18', '9b41e5eb', '8d0114e9', '1321f878', 'ef660a48', '05e7052a', 'b37dab6a', 'b37dab6a', '32c70da1', 'fc8cb7e3', 'fc8cb7e3', 'fc8cb7e3', 'ad17ec75', '9b79274a', '8aa67b90', '14f1467f', '5e15e87d', '1a96a7fd', '77e5c68b', '7edbe29c', '67741535', '67741535', 'e016da5f', 'd5827bf8', '1e39428e', '13e70e11', '44919fb4', '4403e20e', '45e4f977', '845c3aa6', '73c49b90', '111137ef', '7de0873b', '82e577e5', '08fb36f2', '5c40e192', '4c491877', '80ba20f1', '3710296d', 'f3fa4371', 'd666408b', '3bd293e6', 'f28656c7', 'e14301b8', 'a6d4e122', '79ee847d', '6f24c9c4', 'bf

In [None]:
# TODO: add new columns - aggregate time meter data for comparison, assuming house size/roof space is roughly correlated with energy usage
