In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing data-sets
# df1: Plant 2 generation data, df2: Plant 2 weather-sensor data.
# Both are read from the Kaggle input directory, relative to the working directory.
df1 = pd.read_csv('../input/solar-power-generation-data/Plant_2_Generation_Data.csv')
df2 = pd.read_csv('../input/solar-power-generation-data/Plant_2_Weather_Sensor_Data.csv')

In [None]:
#changing date-time into proper format
# Parse the DATE_TIME strings once, then derive the helper columns with the
# vectorized .dt accessor instead of a per-row Python lambda:
#   DATE - the calendar day as a midnight timestamp (datetime64)
#   TIME - the time of day as datetime.time objects
# DATE is added before TIME so the column order of df1 is unchanged.
df1['DATE_TIME'] = pd.to_datetime(df1['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')
df1['DATE'] = df1['DATE_TIME'].dt.normalize()
df1['TIME'] = df1['DATE_TIME'].dt.time

In [None]:
# Same date-time preparation for the weather-sensor data.
# Parse DATE_TIME once and derive DATE (midnight timestamp) and TIME
# (datetime.time objects) with the vectorized .dt accessor rather than a
# per-row lambda. DATE is added before TIME to keep the column order unchanged.
df2['DATE_TIME'] = pd.to_datetime(df2['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')
df2['DATE'] = df2['DATE_TIME'].dt.normalize()
df2['TIME'] = df2['DATE_TIME'].dt.time

In [None]:
#generation data of plant 2
# Summary of column names, dtypes and non-null counts after the date-time conversion.
df1.info()

In [None]:
#weather sensor data of plant 2
# Summary of column names, dtypes and non-null counts after the date-time conversion.
df2.info()

In [None]:
# Attach the weather readings to every generation row that shares its timestamp.
# A left join keeps all generation rows; columns present in both frames
# (e.g. DATE, TIME) receive pandas' default _x / _y suffixes.
df3 = df1.merge(df2, how='left', on='DATE_TIME')

In [None]:
#merged data-set
# Summary of the joined frame: column names, dtypes and non-null counts.
df3.info()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Module temperature against irradiation, drawn as markers only (no connecting line).
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(
    df3['IRRADIATION'],
    df3['MODULE_TEMPERATURE'],
    linestyle='',
    marker='o',
    alpha=0.5,
    c='r',
)
ax.set_xlabel('irradiation')
ax.set_ylabel('module temp')
plt.show()

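Here, module temperature rises with irradiation: the scatter shows a positive, roughly linear relationship across the dataset (not an inverse one), which the positive regression slope fitted below confirms.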

In [None]:
# DC power against AC power, drawn as markers only (no connecting line).
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(
    df3['AC_POWER'],
    df3['DC_POWER'],
    linestyle='',
    marker='o',
    alpha=0.5,
    c='k',
)
ax.set_xlabel('ac power')
ax.set_ylabel('dc power')
plt.show()

Here, AC power and DC power are also linearly related to each other in the given dataset.

In [None]:
# Re-display the merged schema: the next cell selects columns by integer position,
# so the column order shown here matters.
df3.info()

In [None]:
#linear regression for irradiation and module temperature
#extracting two columns
# Select by column name rather than integer position so the cell keeps working
# if the column order of the merged frame ever changes.
X = df3[['IRRADIATION']]       # feature: 2-D DataFrame with a single column
y = df3['MODULE_TEMPERATURE']  # target: 1-D Series

In [None]:
# X was taken as a column range, so it stays a 2-D DataFrame.
X.ndim

In [None]:
# y was taken as a single column, so it is a 1-D Series.
y.ndim

In [None]:
# (rows, columns) of the feature frame.
X.shape

In [None]:
# Number of target values; should equal the row count of X.
y.shape

In [None]:
# Quick visual check of the feature/target relationship before fitting.
plt.scatter(X,y)

The scatter plot above suggests an approximately linear relationship between X (irradiation) and y (module temperature).

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle so the split is reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [None]:
# Size of the training features (80% of the rows).
X_train.shape

In [None]:
# Size of the held-out test features (20% of the rows).
X_test.shape

In [None]:
# Size of the training targets; matches X_train's row count.
y_train.shape

In [None]:
# Size of the test targets; matches X_test's row count.
y_test.shape

In [None]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares line to the training split.
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)

In [None]:
# Predictions for the held-out rows; displayed for comparison with y_test in the next cell.
y_pred = lin_reg.predict(X_test)
y_pred

In [None]:
# Actual target values of the held-out rows, for comparison with y_pred.
y_test

In [None]:
# Overlay the actual test targets (red) and the model's predictions (black).
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, color='r', label='actual')
ax.scatter(X_test, y_pred, color='k', label='predicted')
ax.legend()
plt.show()

The graph compares the actual test values (red) with the values predicted by the model (black).

In [None]:
#slope (m in y = m*x + c); an array with one coefficient per feature
lin_reg.coef_

In [None]:
#y-intercept (c in y = m*x + c)
lin_reg.intercept_

In [None]:
# Predict the target for a single feature value of 0.6.
# The model was fitted on a DataFrame, so the query is wrapped in a DataFrame
# carrying the same column name; a bare list makes scikit-learn warn that the
# input has no feature names.
lin_reg.predict(pd.DataFrame([[0.6]], columns=X_train.columns))

In [None]:
#using actual formulae (y=mx+c)
# Read the slope and intercept from the fitted model instead of pasting numbers
# from a previous run, so this cross-check stays correct whenever the data or
# the train/test split changes.
lin_reg.coef_[0] * 0.6 + lin_reg.intercept_

In [None]:
#choosing a random source key and plotting the graphs for date - time vs total yield and daily yield to find out at what point of time is the yield(efficiency of the plant) decreasing
# Take the hour and minute straight from the parsed DATE_TIME column. Re-parsing
# the TIME column (datetime.time objects) with pd.to_datetime relies on an
# implicit string round-trip and is slower for the same result.
df1['HOUR'] = df1['DATE_TIME'].dt.hour
df1['MINUTES'] = df1['DATE_TIME'].dt.minute


In [None]:
# Confirm that the HOUR and MINUTES columns were added.
df1.info()

In [None]:
# Display the generation frame (the notebook shows a truncated preview).
df1

In [None]:
# Count missing values per column. The raw boolean frame from isnull() is as
# large as the data itself and cannot be checked by eye.
df1.isnull().sum()

In [None]:
# Keep only the rows recorded for one inverter (source key).
inverter_mask = df1['SOURCE_KEY'].eq('Et9kgGMDl729KT4')
gg = df1[inverter_mask]

In [None]:
# Rows of the selected inverter.
gg

In [None]:
# Column dtypes and non-null counts for the selected inverter's rows.
gg.info()

In [None]:
# Number of records per calendar day for this inverter (value_counts sorts by count, descending).
gg['DATE'].value_counts()

The date 2020-05-20 has a very low record count.

In [None]:
# Narrow the inverter's rows down to the single day under investigation.
on_target_day = gg['DATE'] == '2020-05-20'
gg1 = gg.loc[on_target_day]

In [None]:
# Rows of the selected inverter on 2020-05-20.
gg1

In [None]:
# Daily yield of the selected inverter over the hours of the chosen day.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(gg1['HOUR'], gg1['DAILY_YIELD'], label='yield per hour')
ax.set_xlabel('time')
ax.set_ylabel('daily yield')
ax.legend()
plt.show()

In [None]:
# Zoom in on the records taken during the 15:00 hour (3pm).
at_3pm = gg1['HOUR'].eq(15)
gg2 = gg1.loc[at_3pm]

In [None]:
# Rows recorded during the 15:00 hour of the chosen day.
gg2

In [None]:
# Daily yield against the minute of the hour, for the 15:00 records only.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(gg2['MINUTES'], gg2['DAILY_YIELD'], label='yield per minute at 3pm')
ax.set_xlabel('time')
ax.set_ylabel('daily yield')
ax.legend()
plt.show()

The graph is approximately linear, which means the daily yield increases steadily as time goes by.

In [None]:
#for total yield
# Total yield of the selected inverter over the hours of the chosen day.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(gg1['HOUR'], gg1['TOTAL_YIELD'], label=' total yield per hour')
ax.set_xlabel('time')
ax.set_ylabel('total yield')
ax.legend()
plt.show()

In [None]:
# Total yield against the minute of the hour, for the 15:00 records only.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(gg2['MINUTES'], gg2['TOTAL_YIELD'], label=' total yield per minute at 3pm')
ax.set_xlabel('time')
ax.set_ylabel('total yield')
ax.legend()
plt.show()

In [None]:
#linear regression for Et9kgGMDl729KT4 on 2020-05-20
# Show the schema first: the next cell picks the feature and target by column position.
gg1.info()

In [None]:
# Feature and target for the single-day regression, selected by column name
# rather than integer position so the cell does not silently pick the wrong
# columns if the column order of df1 changes.
X = gg1[['HOUR']]        # feature: hour of day, 2-D DataFrame with a single column
y = gg1['DAILY_YIELD']   # target: daily yield, 1-D Series

In [None]:
# X was taken as a column range, so it stays a 2-D DataFrame.
X.ndim

In [None]:
# y was taken as a single column, so it is a 1-D Series.
y.ndim

In [None]:
# (rows, columns) of the single-day feature frame.
X.shape

In [None]:
# Number of target values; should equal the row count of X.
y.shape

In [None]:
# Quick visual check of the feature/target relationship before fitting.
plt.scatter(X,y)

In [None]:
# Hold out 20% of the single-day rows for testing; random_state=0 keeps the split reproducible.
# Note: this rebinds X_train / X_test / y_train / y_test from the earlier regression.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [None]:
# Size of the training features for the single-day model.
X_train.shape

In [None]:
# Size of the held-out test features for the single-day model.
X_test.shape

In [None]:
# Size of the training targets; matches X_train's row count.
y_train.shape

In [None]:
# Size of the test targets; matches X_test's row count.
y_test.shape

In [None]:
# LinearRegression was already imported in an earlier cell; the import is repeated here.
from sklearn.linear_model import LinearRegression
# Rebinds lin_reg: from here on it refers to the single-day model, not the
# irradiation/module-temperature model fitted earlier.
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)

In [None]:
# Predictions for the held-out rows; displayed for comparison with y_test in the next cell.
y_pred = lin_reg.predict(X_test)
y_pred

In [None]:
# Actual target values of the held-out rows, for comparison with y_pred.
y_test

In [None]:
# Actual vs. predicted values on the test split.
# X_test is the time feature and y_test / y_pred are daily-yield values, so time
# belongs on the x-axis and daily yield on the y-axis (the labels used to be swapped).
# train_test_split shuffles the rows, so they are ordered by x before drawing;
# otherwise the line plot jumps back and forth between unordered points.
order = np.argsort(X_test.iloc[:, 0].to_numpy(), kind='stable')
x_sorted = X_test.iloc[order, 0]
plt.figure(figsize=(12,8))
plt.plot(x_sorted, y_test.iloc[order], color='k', label='actual')
plt.plot(x_sorted, y_pred[order], color='orange', label='predicted')
plt.xlabel('time')
plt.ylabel('daily yield')
plt.legend()
plt.show()

A scatter plot is not very informative here because the dataset is too small to split into training and test sets,
so a line plot is the better option.

In [None]:
# Slope of the single-day model (m in y = m*x + c); an array with one coefficient per feature.
lin_reg.coef_

In [None]:
# Intercept of the single-day model (c in y = m*x + c).
lin_reg.intercept_

In [None]:
# Predict the target for a single feature value of 16.1.
# The model was fitted on a DataFrame, so the query is wrapped in a DataFrame
# carrying the same column name; a bare list makes scikit-learn warn that the
# input has no feature names.
lin_reg.predict(pd.DataFrame([[16.1]], columns=X_train.columns))

In [None]:
# Cross-check the prediction by hand with y = m*x + c.
# The slope and intercept are read from the fitted model instead of being pasted
# from a previous run, so the check stays valid if the data or split changes.
lin_reg.coef_[0] * 16.1 + lin_reg.intercept_

In [None]:
!pip install flask-ngrok

In [None]:
#creating a website for predicting value of y
from flask_ngrok import run_with_ngrok
from flask import Flask

app = Flask(__name__)
# NOTE(review): run_with_ngrok presumably opens an ngrok tunnel when app.run() starts - confirm against the flask-ngrok docs.
run_with_ngrok(app)

@app.route('/')
def home():
    """Landing page: return a static greeting."""
    return "welcome to ml world"

# The `float` URL converter only matches values written with a decimal point,
# so an `int` rule is registered as well; without it a URL such as /16 is a 404.
@app.route('/<int:x>')
@app.route('/<float:x>')
def ml(x):
    """Return the model's prediction for the feature value `x` taken from the URL.

    `lin_reg` is whichever model was fitted most recently in the notebook.
    The response is the string form of the array returned by `predict`.
    """
    return(str(lin_reg.predict([[x]])))
# Starts the development server; this call blocks the cell until the server is stopped.
app.run()


