# Supervised Learning Project

In [1]:
import warnings
warnings.simplefilter('ignore')

In [2]:
import matplotlib.pyplot as plt 
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline        

import numpy as np
import pandas as pd
import zipfile

from sklearn.datasets import make_circles, load_boston
from sklearn.model_selection import train_test_split as tts

from sklearn.linear_model import LinearRegression as LinReg
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet


from sklearn.svm import SVR
from sklearn.tree import ExtraTreeRegressor as ETR

from sklearn.linear_model import SGDRegressor as SGDR
from sklearn.neighbors import KNeighborsRegressor as KNNR
from sklearn.ensemble import GradientBoostingRegressor as GBR

In [3]:
#Importando datos
zf = zipfile.ZipFile('../solar-energy-prediction-datamex0320.zip')
traindf = pd.read_csv(zf.open('solar_train.csv'))
traindf.head()

Unnamed: 0,UNIXTime,Data,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet
0,1475175023,9/29/2016 12:00:00 AM,08:50:23,634.99,61,30.46,41,14.96,6.75,06:13:00,18:13:00
1,1481799902,12/15/2016 12:00:00 AM,01:05:02,1.27,37,30.26,70,207.43,5.62,06:50:00,17:46:00
2,1478339417,11/4/2016 12:00:00 AM,23:50:17,1.21,47,30.49,33,168.2,5.62,06:25:00,17:47:00
3,1472887208,9/2/2016 12:00:00 AM,21:20:08,1.67,54,30.46,101,152.6,3.37,06:07:00,18:37:00
4,1478724901,11/9/2016 12:00:00 AM,10:55:01,839.78,62,30.47,36,291.95,7.87,06:28:00,17:45:00


In [4]:
testdf = pd.read_csv(zf.open('solar_test.csv'))


In [39]:
test = pd.read_csv(zf.open('solar_test.csv'))

In [5]:
traindf["Data"] = pd.to_datetime(traindf["Data"]) 

In [6]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24514 entries, 0 to 24513
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   UNIXTime                24514 non-null  int64         
 1   Data                    24514 non-null  datetime64[ns]
 2   Time                    24514 non-null  object        
 3   Radiation               24514 non-null  float64       
 4   Temperature             24514 non-null  int64         
 5   Pressure                24514 non-null  float64       
 6   Humidity                24514 non-null  int64         
 7   WindDirection(Degrees)  24514 non-null  float64       
 8   Speed                   24514 non-null  float64       
 9   TimeSunRise             24514 non-null  object        
 10  TimeSunSet              24514 non-null  object        
dtypes: datetime64[ns](1), float64(4), int64(3), object(3)
memory usage: 2.1+ MB


In [7]:
traindf["TimeSunRise"] = pd.to_datetime(traindf["TimeSunRise"]) 

In [8]:
traindf["TimeSunSet"] = pd.to_datetime(traindf["TimeSunSet"]) 

In [9]:
traindf['H2'] = traindf.TimeSunSet.dt.hour

In [10]:
traindf = traindf.drop(columns= ['Data'])
traindf = traindf.drop(columns= ['TimeSunRise'])
traindf = traindf.drop(columns= ['TimeSunSet'])
traindf = traindf.drop(columns= ['Time'])

In [11]:
traindf.head()

Unnamed: 0,UNIXTime,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,H2
0,1475175023,634.99,61,30.46,41,14.96,6.75,18
1,1481799902,1.27,37,30.26,70,207.43,5.62,17
2,1478339417,1.21,47,30.49,33,168.2,5.62,17
3,1472887208,1.67,54,30.46,101,152.6,3.37,18
4,1478724901,839.78,62,30.47,36,291.95,7.87,17


In [12]:
traindf["UNIXTime"] = traindf["UNIXTime"].astype(float)
traindf["Temperature"] = traindf["Temperature"].astype(float)
traindf["Humidity"] = traindf["Humidity"].astype(float)
traindf["H2"] = traindf["H2"].astype(float)

In [13]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24514 entries, 0 to 24513
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   UNIXTime                24514 non-null  float64
 1   Radiation               24514 non-null  float64
 2   Temperature             24514 non-null  float64
 3   Pressure                24514 non-null  float64
 4   Humidity                24514 non-null  float64
 5   WindDirection(Degrees)  24514 non-null  float64
 6   Speed                   24514 non-null  float64
 7   H2                      24514 non-null  float64
dtypes: float64(8)
memory usage: 1.5 MB


In [14]:
traindf.corr()

Unnamed: 0,UNIXTime,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,H2
UNIXTime,1.0,-0.08191,-0.369919,-0.331841,-0.061548,0.156535,0.17396,-0.820021
Radiation,-0.08191,1.0,0.736029,0.116763,-0.226432,-0.238388,0.07458,0.052021
Temperature,-0.369919,0.736029,1.0,0.310439,-0.281192,-0.27007,-0.032578,0.305401
Pressure,-0.331841,0.116763,0.310439,1.0,-0.22248,-0.229222,-0.082508,0.153105
Humidity,-0.061548,-0.226432,-0.281192,-0.22248,1.0,0.001631,-0.211095,0.145714
WindDirection(Degrees),0.156535,-0.238388,-0.27007,-0.229222,0.001631,1.0,0.071282,-0.084521
Speed,0.17396,0.07458,-0.032578,-0.082508,-0.211095,0.071282,1.0,-0.15947
H2,-0.820021,0.052021,0.305401,0.153105,0.145714,-0.084521,-0.15947,1.0


In [15]:
traindf.isnull().sum()

UNIXTime                  0
Radiation                 0
Temperature               0
Pressure                  0
Humidity                  0
WindDirection(Degrees)    0
Speed                     0
H2                        0
dtype: int64

In [17]:
testdf["TimeSunSet"] = pd.to_datetime(testdf["TimeSunSet"]) 
testdf['H2'] = testdf.TimeSunSet.dt.hour

In [18]:
testdf = testdf.drop(columns= ['Data'])
testdf = testdf.drop(columns= ['TimeSunRise'])
testdf = testdf.drop(columns= ['TimeSunSet'])
testdf = testdf.drop(columns= ['Time'])

In [19]:
testdf.head()

Unnamed: 0,id,UNIXTime,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,H2
0,0,1478720107,59,30.47,44,312.67,3.37,17
1,1,1474063503,59,30.48,83,38.01,6.75,18
2,2,1476109221,47,30.39,78,213.62,5.62,18
3,3,1481475056,45,30.4,98,176.63,4.5,17
4,4,1477493117,45,30.4,34,175.89,6.75,17


In [47]:
testdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8172 entries, 0 to 8171
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   UNIXTime                8172 non-null   int64  
 1   Temperature             8172 non-null   int64  
 2   Pressure                8172 non-null   float64
 3   Humidity                8172 non-null   int64  
 4   WindDirection(Degrees)  8172 non-null   float64
 5   Speed                   8172 non-null   float64
 6   H2                      8172 non-null   int64  
dtypes: float64(3), int64(4)
memory usage: 447.0 KB


In [31]:
testdf = testdf.drop(columns= ['id'])

In [20]:
X=traindf.drop('Radiation', axis=1)
y=traindf.Radiation


In [21]:

X_train, X_test, y_train, y_test = tts(X, y)

In [22]:
linreg=LinReg(normalize=True)

linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [23]:
train_score=linreg.score(X_train, y_train)   # R2
test_score=linreg.score(X_test, y_test)

print (train_score, test_score)

0.5950715052571554 0.6104175055735168


In [24]:
knnr=KNNR(n_neighbors=7)
knnr.fit(X_train, y_train)

train_score=knnr.score(X_train, y_train) #R2
test_score=knnr.score(X_test, y_test)

print ('train R2:',train_score, '-- test R2:', test_score)

train R2: 0.9549811733095281 -- test R2: 0.9468674174757591


In [32]:
y_pred=knnr.predict(testdf)

In [34]:
y_pred.shape

(8172,)

In [27]:
from sklearn.metrics import mean_squared_error as mse

#mse(X_test, y_pred)

In [28]:
gbr=GBR()
gbr.fit(X_train, y_train)

train_score=gbr.score(X_train, y_train) #R2
test_score=gbr.score(X_test, y_test)
gbr=GBR()
gbr.fit(X_train, y_train)

train_score=gbr.score(X_train, y_train) #R2
test_score=gbr.score(X_test, y_test)

print ('train R2:',train_score, '-
print ('train R2:',train_score, '-- test R2:', test_score)

train R2: 0.7705325058997287 -- test R2: 0.7716080534424863


In [33]:
y_pred1=gbr.predict(testdf)

In [35]:
y_pred1.shape

(8172,)

In [36]:
Radiation = pd.DataFrame(y_pred)

In [37]:
Radiation.rename(columns={list(Radiation)[0]:'Radiation'}, inplace=True)
Radiation.columns

Index(['Radiation'], dtype='object')

In [41]:
df = pd.DataFrame(columns=[test["id"], Radiation['Radiation']])

In [42]:
df = df.T

In [43]:
df.reset_index(inplace=True) 
df 


Unnamed: 0,id,Radiation
0,0,691.767143
1,1,1042.315714
2,2,1.234286
3,3,17.177143
4,4,1.215714
...,...,...
8167,8167,478.340000
8168,8168,1.237143
8169,8169,12.105714
8170,8170,1.217143


In [44]:
df.to_csv('predicts1.csv', index=False)

In [46]:
df1= pd.read_csv('predicts1.csv')
df1

Unnamed: 0,id,Radiation
0,0,691.767143
1,1,1042.315714
2,2,1.234286
3,3,17.177143
4,4,1.215714
...,...,...
8167,8167,478.340000
8168,8168,1.237143
8169,8169,12.105714
8170,8170,1.217143
