# Random Forest Classifier
## Solar panel data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import ipywidgets as widgets
url_src = "data/rfc_sample2.csv"
df = pd.read_csv(url_src, parse_dates=True)
url_src = "data/rfc_model2.csv"
model = pd.read_csv(url_src, parse_dates=True)
manufacturer = df['module_manufacturer'].value_counts().reset_index()
manufacturer2 = model['module_manufacturer'].value_counts().reset_index()
mod = df['module_model'].value_counts().reset_index()
mod2 = model['module_model'].value_counts().reset_index()
manufacturer.columns = ['manufacturer', 'amount']
manufacturer2.columns = ['number', 'amount']
manufacturer['number'] = manufacturer2['number']
mod.columns = ['model', 'amount']
mod2.columns = ['number', 'amount']
mod['number'] = mod2['number']
unit_name = ['module_manufacturer','module_model']
print(manufacturer.shape)
print(mod.shape)
df = df.drop(['Unnamed: 0'], axis=1)
model = model.drop(['Unnamed: 0'], axis=1)

(167, 3)
(2649, 3)


In [2]:
pred = model.loc[model['year'] >= 2019]
pred = pred.drop(['stars'], axis=1)
len(pred)

56750

### This example RFC I chooce to use size, manufacturer and model.

In [3]:
model = model.loc[model['year'] < 2019]
ucol = ['total_installed_price','system_size_DC', 'module_manufacturer']
X = model[ucol]
y = model['stars']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf = RandomForestClassifier(n_estimators = 100) 
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
ar_unique, i = np.unique(y_pred, return_counts=True)
# display the returned array
print("Unique values:", ar_unique)
# display the counts
print("Counts:", i)

Unique values: [1 2 3]
Counts: [ 9258 13261  6467]


In [4]:
model_df = pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})
model_df.value_counts()

Actual  Predicted
2       2            11225
1       1             7918
3       3             5927
1       2             1558
2       1             1333
        3              516
3       2              478
1       3               24
3       1                7
dtype: int64

### If the total installed price is leaved out accuracy drop under 90%

In [5]:
# Calculate the absolute errors
errors = abs(y_pred - y_test)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 5), '%.')

Mean Absolute Error: 0.14 degrees.
Accuracy: 90.70413 %.


### Test prediction in 2019 set

In [6]:
# pred = pred[['module_manufacturer_1','module_model_1','system_size_DC']]
predictions = clf.predict(pred[ucol])
ar_unique, i = np.unique(predictions, return_counts=True)
# display the returned array
print("Unique values:", ar_unique)
# display the counts
print("Counts:", i)

Unique values: [1 2 3]
Counts: [ 5881 41585  9284]


### Extra, link data to actual names

In [7]:
# Assing stars
pred['stars'] = predictions
# Link stars to module
best2 = pred['module_model'].loc[pred.stars == pred.stars.max()].unique()
b2_df = pd.DataFrame(columns=['model','amount','number'])
for b in best2:
    b2_df = pd.concat([b2_df, mod.loc[mod['number'] == b]], axis=0)
b2_df = b2_df.sort_values(by='amount',ascending=False).reset_index()
b2_df = b2_df.drop('index', axis=1)
link_models = df.loc[(df.year == 2019)&(df.module_model != 'Unknown')]
# Create filter list to models
filter_list = b2_df['model'].unique()
# Fecth data by filter list
best = link_models.loc[(link_models['module_model'].isin(filter_list))&(link_models.stars == 3)]
best = best.sort_values(by='p_s')
best[['module_manufacturer','module_model']].value_counts().head(20)

module_manufacturer                       module_model           
SANYO ELECTRIC CO LTD OF PANASONIC GROUP  VBHN330SA17                3079
LG Electronics Inc.                       LG335N1C-A5                1275
SunPower                                  SPR-X22-360-D-AC           1173
Hanwha Q CELLS                            Q.PEAK DUO-G5 325          1008
LG Electronics Inc.                       LG360Q1C-A5                 769
                                          LG335N1C-V5                 754
                                          LG340N1C-V5                 753
                                          LG330N1C-A5                 718
SANYO ELECTRIC CO LTD OF PANASONIC GROUP  VBHN335SA17                 677
SunPower                                  SPR-X22-360-E-AC            607
                                          SPR-A400-G-AC               573
                                          SPR-E20-327-E-AC            410
Solaria Corporation                       Sola

### Some thoughts
* Could it be usefull to buid some function?

[Link to data preparation](https://temppase.github.io/rfc_preparation/)

### Additional features
* posible function or functions to find other features
* Further development still under consideration

In [8]:
s = 'small'
l = 'large'
cols = ['module_manufacturer','module_model','built_in_meter_inverter','efficiency_module','p_s','system_size_DC','RES']
vcountcols = ['module_manufacturer','module_model','built_in_meter_inverter',
                     'efficiency_module','p_s','system_size_DC','RES','count']
mean = best.system_size_DC.mean()
def best_size_class(x,psfloor,psceiling,):
    if x == 'small':
        return best.loc[(best.p_s > int(psfloor))&(best.p_s < int(psceiling))&(best.system_size_DC < mean)]
    if x == 'large':
        return best.loc[(best.p_s > int(psfloor))&(best.p_s < int(psceiling))&(best.system_size_DC > mean)]

### Extra small

In [9]:
# Small
small = best_size_class(s,0,500)
small = small[cols].value_counts().to_frame().reset_index()
small.columns = vcountcols
extra_small = small.loc[small.system_size_DC < small.system_size_DC.mean()]
small = small.loc[small.system_size_DC > small.system_size_DC.mean()]
extra_small

Unnamed: 0,module_manufacturer,module_model,built_in_meter_inverter,efficiency_module,p_s,system_size_DC,RES,count
4,Hanwha Q CELLS,Q.PEAK DUO-G5 325,1.0,0.198655,280.77,5.2,1.0,1
5,Hanwha Q CELLS,Q.PEAK DUO-G5 325,1.0,0.198655,326.92,5.2,1.0,1
6,Hanwha Q CELLS,Q.PEAK DUO-G5 325,1.0,0.198655,354.03,5.525,1.0,1
10,SANYO ELECTRIC CO LTD OF PANASONIC GROUP,VBHN325KA03,1.0,0.208869,439.56,4.55,1.0,1
11,SANYO ELECTRIC CO LTD OF PANASONIC GROUP,VBHN330SA17,1.0,0.20625,337.88,3.3,1.0,1
12,SANYO ELECTRIC CO LTD OF PANASONIC GROUP,VBHN330SA17,1.0,0.20625,413.22,3.63,1.0,1
13,SANYO ELECTRIC CO LTD OF PANASONIC GROUP,VBHN335SA16,1.0,0.200087,316.07,5.695,1.0,1


### Small

In [10]:
small

Unnamed: 0,module_manufacturer,module_model,built_in_meter_inverter,efficiency_module,p_s,system_size_DC,RES,count
0,SANYO ELECTRIC CO LTD OF PANASONIC GROUP,VBHN325KA03,1.0,0.208869,275.56,7.258041,1.0,2
1,SANYO ELECTRIC CO LTD OF PANASONIC GROUP,VBHN325KA03,1.0,0.208869,279.72,7.15,1.0,2
2,Hanwha Q CELLS,Q.PEAK DUO-G5 325,0.0,0.198655,307.69,6.5,1.0,1
3,Hanwha Q CELLS,Q.PEAK DUO-G5 325,0.0,0.198655,307.69,6.825,1.0,1
7,LG Electronics Inc.,LG365Q1C-A5,1.0,0.218134,308.22,5.84,1.0,1
8,SANYO ELECTRIC CO LTD OF PANASONIC GROUP,VBHN325KA03,1.0,0.208869,307.69,6.5,1.0,1
9,SANYO ELECTRIC CO LTD OF PANASONIC GROUP,VBHN325KA03,1.0,0.208869,336.76,5.938898,1.0,1
14,Solaria Corporation,Solaria PowerXT-360R-PD,0.0,0.198895,365.5,6.84,1.0,1
15,SunPower,SPR-X22-360-E-AC,0.0,0.220859,439.2,6.48,1.0,1


### Large

In [11]:
# Large
large = best_size_class(l,0,500)
large = large[cols].value_counts().to_frame().reset_index()
large.columns = vcountcols
extra_large = large.loc[large.system_size_DC > large.system_size_DC.mean()]
large = large.loc[large.system_size_DC < large.system_size_DC.mean()]
large

Unnamed: 0,module_manufacturer,module_model,built_in_meter_inverter,efficiency_module,p_s,system_size_DC,RES,count
1,Hanwha Q CELLS,Q.PEAK DUO-G5 325,0.0,0.198655,461.54,9.75,1.0,1
2,Hanwha Q CELLS,Q.PEAK DUO-G5 325,1.0,0.198655,236.69,8.45,1.0,1
3,LG Electronics Inc.,LG335N1C-A5,1.0,0.20319,348.26,8.04,1.0,1
6,SANYO ELECTRIC CO LTD OF PANASONIC GROUP,VBHN325KA03,1.0,0.208869,202.07,9.897429,1.0,1
7,SANYO ELECTRIC CO LTD OF PANASONIC GROUP,VBHN325KA03,1.0,0.208869,205.13,9.75,1.0,1
8,SANYO ELECTRIC CO LTD OF PANASONIC GROUP,VBHN325KA03,1.0,0.208869,252.58,7.918163,1.0,1
9,SANYO ELECTRIC CO LTD OF PANASONIC GROUP,VBHN330SA17,1.0,0.20625,263.5,7.59,1.0,1
11,SunPower,SPR-X22-360-D-AC,0.0,0.220762,483.09,8.28,1.0,1
12,SunPower,SPR-X22-370-D-AC,0.0,0.226994,343.98,8.14,1.0,1


### Extra large

In [12]:
extra_large

Unnamed: 0,module_manufacturer,module_model,built_in_meter_inverter,efficiency_module,p_s,system_size_DC,RES,count
0,Hanwha Q CELLS,Q.PEAK DUO-G5 325,0.0,0.198655,307.69,11.7,1.0,1
4,LG Electronics Inc.,LG340N1C-V5,0.0,0.206223,474.38,21.08,1.0,1
5,SANYO ELECTRIC CO LTD OF PANASONIC GROUP,VBHN325KA03,1.0,0.208869,173.2,11.547184,1.0,1
10,SunPower,SPR-X22-360-D-AC,0.0,0.220762,471.09,12.736286,1.0,1


### Notifications
* Larger the system size -> smaller the price/kW also high efficieny rise price/kW. Some exeptions will be found.