# Supervised Learning Project

In [1]:
import warnings
warnings.simplefilter('ignore')

In [2]:
import matplotlib.pyplot as plt 
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline        

import numpy as np
import pandas as pd
import zipfile

from sklearn.datasets import make_circles, load_boston
from sklearn.model_selection import train_test_split as tts

from sklearn.linear_model import LinearRegression as LinReg
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet


from sklearn.svm import SVR
from sklearn.tree import ExtraTreeRegressor as ETR

from sklearn.linear_model import SGDRegressor as SGDR
from sklearn.neighbors import KNeighborsRegressor as KNNR
from sklearn.ensemble import GradientBoostingRegressor as GBR

In [3]:
#Importando datos
zf = zipfile.ZipFile('../solar-energy-prediction-datamex0320.zip')
traindf = pd.read_csv(zf.open('solar_train.csv'))
traindf.head()

Unnamed: 0,UNIXTime,Data,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet
0,1475175023,9/29/2016 12:00:00 AM,08:50:23,634.99,61,30.46,41,14.96,6.75,06:13:00,18:13:00
1,1481799902,12/15/2016 12:00:00 AM,01:05:02,1.27,37,30.26,70,207.43,5.62,06:50:00,17:46:00
2,1478339417,11/4/2016 12:00:00 AM,23:50:17,1.21,47,30.49,33,168.2,5.62,06:25:00,17:47:00
3,1472887208,9/2/2016 12:00:00 AM,21:20:08,1.67,54,30.46,101,152.6,3.37,06:07:00,18:37:00
4,1478724901,11/9/2016 12:00:00 AM,10:55:01,839.78,62,30.47,36,291.95,7.87,06:28:00,17:45:00


In [4]:
testdf = pd.read_csv(zf.open('solar_test.csv'))


In [5]:
test = pd.read_csv(zf.open('solar_test.csv'))

In [6]:
traindf["Data"] = pd.to_datetime(traindf["Data"]) 

In [7]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24514 entries, 0 to 24513
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   UNIXTime                24514 non-null  int64         
 1   Data                    24514 non-null  datetime64[ns]
 2   Time                    24514 non-null  object        
 3   Radiation               24514 non-null  float64       
 4   Temperature             24514 non-null  int64         
 5   Pressure                24514 non-null  float64       
 6   Humidity                24514 non-null  int64         
 7   WindDirection(Degrees)  24514 non-null  float64       
 8   Speed                   24514 non-null  float64       
 9   TimeSunRise             24514 non-null  object        
 10  TimeSunSet              24514 non-null  object        
dtypes: datetime64[ns](1), float64(4), int64(3), object(3)
memory usage: 2.1+ MB


In [8]:
traindf["TimeSunRise"] = pd.to_datetime(traindf["TimeSunRise"]) 

In [9]:
traindf['H1'] = traindf.TimeSunRise.dt.hour

In [10]:
traindf["TimeSunSet"] = pd.to_datetime(traindf["TimeSunSet"]) 

In [11]:
traindf['H2'] = traindf.TimeSunSet.dt.hour

In [12]:
traindf = traindf.drop(columns= ['Data'])
traindf = traindf.drop(columns= ['TimeSunRise'])
traindf = traindf.drop(columns= ['TimeSunSet'])
traindf = traindf.drop(columns= ['Time'])


In [13]:
traindf.head()

Unnamed: 0,UNIXTime,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,H1,H2
0,1475175023,634.99,61,30.46,41,14.96,6.75,6,18
1,1481799902,1.27,37,30.26,70,207.43,5.62,6,17
2,1478339417,1.21,47,30.49,33,168.2,5.62,6,17
3,1472887208,1.67,54,30.46,101,152.6,3.37,6,18
4,1478724901,839.78,62,30.47,36,291.95,7.87,6,17


In [14]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24514 entries, 0 to 24513
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   UNIXTime                24514 non-null  int64  
 1   Radiation               24514 non-null  float64
 2   Temperature             24514 non-null  int64  
 3   Pressure                24514 non-null  float64
 4   Humidity                24514 non-null  int64  
 5   WindDirection(Degrees)  24514 non-null  float64
 6   Speed                   24514 non-null  float64
 7   H1                      24514 non-null  int64  
 8   H2                      24514 non-null  int64  
dtypes: float64(4), int64(5)
memory usage: 1.7 MB


In [15]:
traindf['Sun_Hours'] = traindf['H2'] - traindf['H1'] 

In [16]:
traindf = traindf.drop(columns= ['H2'])
traindf = traindf.drop(columns= ['H1'])

In [17]:
traindf["UNIXTime"] = traindf["UNIXTime"].astype(float)
traindf["Temperature"] = traindf["Temperature"].astype(float)
traindf["Humidity"] = traindf["Humidity"].astype(float)
traindf["Sun_Hours"] = traindf["Sun_Hours"].astype(float)



In [18]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24514 entries, 0 to 24513
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   UNIXTime                24514 non-null  float64
 1   Radiation               24514 non-null  float64
 2   Temperature             24514 non-null  float64
 3   Pressure                24514 non-null  float64
 4   Humidity                24514 non-null  float64
 5   WindDirection(Degrees)  24514 non-null  float64
 6   Speed                   24514 non-null  float64
 7   Sun_Hours               24514 non-null  float64
dtypes: float64(8)
memory usage: 1.5 MB


In [19]:
traindf.corr()

Unnamed: 0,UNIXTime,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,Sun_Hours
UNIXTime,1.0,-0.08191,-0.369919,-0.331841,-0.061548,0.156535,0.17396,-0.820021
Radiation,-0.08191,1.0,0.736029,0.116763,-0.226432,-0.238388,0.07458,0.052021
Temperature,-0.369919,0.736029,1.0,0.310439,-0.281192,-0.27007,-0.032578,0.305401
Pressure,-0.331841,0.116763,0.310439,1.0,-0.22248,-0.229222,-0.082508,0.153105
Humidity,-0.061548,-0.226432,-0.281192,-0.22248,1.0,0.001631,-0.211095,0.145714
WindDirection(Degrees),0.156535,-0.238388,-0.27007,-0.229222,0.001631,1.0,0.071282,-0.084521
Speed,0.17396,0.07458,-0.032578,-0.082508,-0.211095,0.071282,1.0,-0.15947
Sun_Hours,-0.820021,0.052021,0.305401,0.153105,0.145714,-0.084521,-0.15947,1.0


In [20]:
traindf.isnull().sum()

UNIXTime                  0
Radiation                 0
Temperature               0
Pressure                  0
Humidity                  0
WindDirection(Degrees)    0
Speed                     0
Sun_Hours                 0
dtype: int64

In [21]:
testdf["TimeSunRise"] = pd.to_datetime(testdf["TimeSunRise"]) 


In [22]:
testdf['H1'] = testdf.TimeSunRise.dt.hour

In [23]:
testdf["TimeSunSet"] = pd.to_datetime(testdf["TimeSunSet"]) 
testdf['H2'] = testdf.TimeSunSet.dt.hour

In [24]:
testdf['Sun_Hours'] = testdf['H2'] - testdf['H1'] 

In [25]:
testdf.head()

Unnamed: 0,id,UNIXTime,Data,Time,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet,H1,H2,Sun_Hours
0,0,1478720107,11/9/2016 12:00:00 AM,09:35:07,59,30.47,44,312.67,3.37,2020-05-08 06:28:00,2020-05-08 17:45:00,6,17,11
1,1,1474063503,9/16/2016 12:00:00 AM,12:05:03,59,30.48,83,38.01,6.75,2020-05-08 06:10:00,2020-05-08 18:25:00,6,18,12
2,2,1476109221,10/10/2016 12:00:00 AM,04:20:21,47,30.39,78,213.62,5.62,2020-05-08 06:16:00,2020-05-08 18:03:00,6,18,12
3,3,1481475056,12/11/2016 12:00:00 AM,06:50:56,45,30.4,98,176.63,4.5,2020-05-08 06:47:00,2020-05-08 17:44:00,6,17,11
4,4,1477493117,10/26/2016 12:00:00 AM,04:45:17,45,30.4,34,175.89,6.75,2020-05-08 06:21:00,2020-05-08 17:52:00,6,17,11


In [26]:
testdf = testdf.drop(columns= ['Data'])
testdf = testdf.drop(columns= ['TimeSunRise'])
testdf = testdf.drop(columns= ['TimeSunSet'])
testdf = testdf.drop(columns= ['Time'])
testdf = testdf.drop(columns= ['H1'])
testdf = testdf.drop(columns= ['H2'])

In [27]:
testdf.head()

Unnamed: 0,id,UNIXTime,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,Sun_Hours
0,0,1478720107,59,30.47,44,312.67,3.37,11
1,1,1474063503,59,30.48,83,38.01,6.75,12
2,2,1476109221,47,30.39,78,213.62,5.62,12
3,3,1481475056,45,30.4,98,176.63,4.5,11
4,4,1477493117,45,30.4,34,175.89,6.75,11


In [28]:
testdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8172 entries, 0 to 8171
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      8172 non-null   int64  
 1   UNIXTime                8172 non-null   int64  
 2   Temperature             8172 non-null   int64  
 3   Pressure                8172 non-null   float64
 4   Humidity                8172 non-null   int64  
 5   WindDirection(Degrees)  8172 non-null   float64
 6   Speed                   8172 non-null   float64
 7   Sun_Hours               8172 non-null   int64  
dtypes: float64(3), int64(5)
memory usage: 510.9 KB


In [29]:
testdf = testdf.drop(columns= ['id'])

In [30]:
X=traindf.drop('Radiation', axis=1)
y=traindf.Radiation


In [31]:

X_train, X_test, y_train, y_test = tts(X, y)

In [32]:
linreg=LinReg(normalize=True)

linreg.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [33]:
train_score=linreg.score(X, y)   # R2
test_score=linreg.score(X, y)

print (train_score, test_score)

0.5989143266057924 0.5989143266057924


In [34]:
knnr=KNNR(n_neighbors=3)
knnr.fit(X, y)

train_score=knnr.score(X, y) #R2
test_score=knnr.score(X, y)

print ('train R2:',train_score, '-- test R2:', test_score)

train R2: 0.9749846950487536 -- test R2: 0.9749846950487536


In [35]:
y_pred=knnr.predict(testdf)

In [36]:
y_pred.shape

(8172,)

In [37]:
from sklearn.metrics import mean_squared_error as mse

#mse(X_test, y_pred)

In [38]:
gbr=GBR()
gbr.fit(X_train, y_train)

train_score=gbr.score(X_train, y_train) #R2
test_score=gbr.score(X_test, y_test)
gbr=GBR()
gbr.fit(X_train, y_train)

train_score=gbr.score(X_train, y_train) #R2
test_score=gbr.score(X_test, y_test)

print ('train R2:',train_score, '-- test R2:', test_score)

train R2: 0.7714288589052148 -- test R2: 0.7628563552205285


In [39]:
y_pred1=gbr.predict(testdf)

In [40]:
y_pred1.shape

(8172,)

In [41]:
Radiation = pd.DataFrame(y_pred)

In [42]:
Radiation.rename(columns={list(Radiation)[0]:'Radiation'}, inplace=True)
Radiation.columns

Index(['Radiation'], dtype='object')

In [43]:
df = pd.DataFrame(columns=[test["id"], Radiation['Radiation']])

In [44]:
df = df.T

In [45]:
df.reset_index(inplace=True) 
df 


Unnamed: 0,id,Radiation
0,0,685.640000
1,1,1136.193333
2,2,1.230000
3,3,4.133333
4,4,1.216667
...,...,...
8167,8167,514.436667
8168,8168,1.233333
8169,8169,13.346667
8170,8170,1.223333


In [46]:
df.to_csv('predicts3.csv', index=False)

In [47]:
df1= pd.read_csv('predicts3.csv')
df1

Unnamed: 0,id,Radiation
0,0,685.640000
1,1,1136.193333
2,2,1.230000
3,3,4.133333
4,4,1.216667
...,...,...
8167,8167,514.436667
8168,8168,1.233333
8169,8169,13.346667
8170,8170,1.223333
