In [103]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')

In [104]:
data = pd.read_excel("test.xlsx")
data.head()

Unnamed: 0,hour,date_miladi,date_shamsi,code,unit_no,fuel_type,mvar,temp,moisture,power
0,1,2020-01-13,1398/10/23,SO,1,A,11,3,94,119
1,2,2020-01-13,1398/10/23,SO,1,A,11,3,96,119
2,3,2020-01-13,1398/10/23,SO,1,A,10,2,95,120
3,4,2020-01-13,1398/10/23,SO,1,A,11,2,95,120
4,5,2020-01-13,1398/10/23,SO,1,A,11,2,95,121


In [105]:
data=data[data["power"]>100]
data.shape

(756, 10)

In [106]:
# Change object to integer:
data["fuel_type"][data["fuel_type"]=="A"] = 1; 
data["fuel_type"][data["fuel_type"]=="B"] = 2;
data["fuel_type"][data["fuel_type"]=="C"] = 3;

In [107]:
data['fuel_type'].value_counts()

1    508
Name: fuel_type, dtype: int64

In [108]:
data["fuel_type"].fillna(method='bfill', inplace=True)

In [109]:
data.isnull().sum()

hour           0
date_miladi    0
date_shamsi    0
code           0
unit_no        0
fuel_type      0
mvar           0
temp           0
moisture       0
power          0
dtype: int64

In [110]:
X = data[["moisture","unit_no","temp","fuel_type"]]
y = data[["power"]]
y

Unnamed: 0,power
0,119
1,119
2,120
3,120
4,121
...,...
994,110
995,127
996,127
997,124


In [111]:
X

Unnamed: 0,moisture,unit_no,temp,fuel_type
0,94,1,3,1
1,96,1,3,1
2,95,1,2,1
3,95,1,2,1
4,95,1,2,1
...,...,...,...,...
994,6,6,15,1
995,8,6,16,1
996,21,6,18,1
997,11,6,19,1


In [112]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [113]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
y_train = np.array(y_train)

In [114]:
# Apply PCA to reduce the number of features to 3
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)

# Linear regression without using pca

In [115]:
model_without_pca = LinearRegression()
model_without_pca.fit(X_train, y_train)

LinearRegression()

In [116]:
# assume row is the new data point to predict with 4 features
row = np.array([45, 5, 30 ,1])
X_test_scaled = scaler.transform([row])
y_pred = model_without_pca.predict(X_test_scaled)
print(y_pred)

[[117.66512561]]


# Linear regression with using pca

In [117]:
model_pca = LinearRegression()
model_pca.fit(X_train_pca, y_train)

LinearRegression()

In [118]:
# assume row is the new data point to predict with 3 features
row = np.array([94, 1, 30 ,1])

X_test_scaled = scaler.transform([row])
X_test_pca = pca.transform(X_test_scaled)
y_pred = model_pca.predict(X_test_pca)


print(y_pred)

[[116.01208927]]


# Decision Tree Regressor

In [119]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [120]:
# Training the Decision Tree Regression model on the whole dataset
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X_train, y_train)
y_pred=regressor.predict([[94, 1, 30 ,1]])
y_pred[0]

111.0

# Lasso Regression Model

In [121]:
from sklearn.linear_model import Lasso  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
X_train_scaled = scaler.fit_transform(X_train)
y_train = np.array(y_train)

In [122]:
model = Lasso(alpha=0.1 , normalize=True)
model.fit(X_train_scaled , y_train)


row = np.array([45, 5, 30 ,1])
X_test_scaled = scaler.transform([row])
y_pred = ridge_regression.predict(X_test_scaled)
print(y_pred[0][0])

117.25914031981426


# Ridge Regression

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
X_train_scaled = scaler.fit_transform(X_train)
y_train = np.array(y_train)


In [124]:
# Ridge Regression
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
ridge = Ridge()

parameters = {"alpha":[1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
ridge_regression = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error', cv=5)
ridge_regression.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1,
                                   5, 10, 20]},
             scoring='neg_mean_squared_error')

In [125]:
print(ridge_regression.best_params_)
print(ridge_regression.best_score_)

{'alpha': 20}
-114.6299444593025


In [126]:
row = np.array([45, 5, 30 ,1])
X_test_scaled = scaler.transform([row])
y_pred = ridge_regression.predict(X_test_scaled)
print(y_pred[0][0])

117.25914031981426


# Gradient boosting regression

In [127]:
from sklearn.ensemble import GradientBoostingRegressor

In [128]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
y_train = np.array(y_train)

In [129]:
gbr=GradientBoostingRegressor(n_estimators=100,learning_rate=0.1,max_depth=3,random_state=42)

In [130]:
gbr.fit(X_train_scaled,y_train)
y_pred=gbr.predict(X_test_scaled)

In [131]:
row = np.array([45, 1,30 ,1])
X_test_scaled = scaler.transform([row])
y_pred = gbr.predict(X_test_scaled)
print(y_pred[0])

110.82935059686598


# SVR

In [132]:
from sklearn.svm import SVR

In [133]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
y_train = np.array(y_train)

In [134]:
svr=SVR(kernel='linear')
svr.fit(X_train_scaled,y_train)
y_pred=svr.predict(X_test_scaled)

In [135]:
row = np.array([45,5,30 ,1])
X_test_scaled = scaler.transform([row])
y_pred = svr.predict(X_test_scaled)
print(y_pred[0])

123.7408770276624


# Rondom Forest Regressor

In [136]:
from sklearn.ensemble import RandomForestRegressor

In [137]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
y_train = np.array(y_train)

In [138]:
rf=RandomForestRegressor(n_estimators=100,random_state=42)
rf.fit(X_train_scaled,y_train)
y_pred=rf.predict(X_test_scaled)

In [141]:
row = np.array([45,1,30 ,1])
X_test_scaled = scaler.transform([row])
y_pred = rf.predict(X_test_scaled)
print(y_pred[0])

111.01
