In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

## Load DataSet from the path

In [2]:
dataSet = pd.read_json('/home/sampras/MachineLearning/YieldImprovement/YI_OneHotEncoding/stageOneData.json')
dataSet.head()

Unnamed: 0,Quality,R_RICE_IN_KG,R_WATER_IN_LIT,Temp 1,Temp Manintain time Min,Temp2,Temp Maintain time,Temp3,R_PRESSURE,Steam inlet temp Deg,R_CHEMICAL_LITRE
0,GOOD,100,130,50,10,70,15,110,1.0,130,1.0
1,GOOD,200,260,50,10,70,15,110,2.0,130,2.0
2,GOOD,200,330,50,12,70,17,110,1.8,130,2.4
3,GOOD,300,390,50,12,70,17,110,3.0,130,3.0
4,GOOD,400,520,50,14,70,19,110,4.0,130,4.0


## Separation of Numerical and Categorical Columns

In [3]:
dataSet_dict = dict(dataSet.dtypes)

numeric_columns=[]
categorial_columns=[]
for i in dataSet_dict:
    if dataSet_dict[i] in ['float64', 'int64', 'float32', 'int32']:
        numeric_columns.append(i)
    else:
        categorial_columns.append(i)

In [4]:
dataSetCopy = dataSet.copy()

In [5]:
cat = dataSetCopy[categorial_columns]

In [6]:
cat.head()

Unnamed: 0,Quality
0,GOOD
1,GOOD
2,GOOD
3,GOOD
4,GOOD


## Use OneHot  Encoder to encode the categorical column

In [7]:
encode = pd.get_dummies(cat)
encode.head()

Unnamed: 0,Quality_AVERAGE,Quality_BAD,Quality_GOOD
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


## Concatenate the encoded data to the original data

In [8]:
dataSetCopyConcat = pd.concat([dataSetCopy,encode], axis=1)
dataSetCopyConcat.head()

Unnamed: 0,Quality,R_RICE_IN_KG,R_WATER_IN_LIT,Temp 1,Temp Manintain time Min,Temp2,Temp Maintain time,Temp3,R_PRESSURE,Steam inlet temp Deg,R_CHEMICAL_LITRE,Quality_AVERAGE,Quality_BAD,Quality_GOOD
0,GOOD,100,130,50,10,70,15,110,1.0,130,1.0,0,0,1
1,GOOD,200,260,50,10,70,15,110,2.0,130,2.0,0,0,1
2,GOOD,200,330,50,12,70,17,110,1.8,130,2.4,0,0,1
3,GOOD,300,390,50,12,70,17,110,3.0,130,3.0,0,0,1
4,GOOD,400,520,50,14,70,19,110,4.0,130,4.0,0,0,1


## Drop the Categorical column from the Concatenated DataFrame

In [9]:
dataSetCopyDrop = dataSetCopyConcat.drop('Quality',axis=1)
dataSetCopyDrop.head()

Unnamed: 0,R_RICE_IN_KG,R_WATER_IN_LIT,Temp 1,Temp Manintain time Min,Temp2,Temp Maintain time,Temp3,R_PRESSURE,Steam inlet temp Deg,R_CHEMICAL_LITRE,Quality_AVERAGE,Quality_BAD,Quality_GOOD
0,100,130,50,10,70,15,110,1.0,130,1.0,0,0,1
1,200,260,50,10,70,15,110,2.0,130,2.0,0,0,1
2,200,330,50,12,70,17,110,1.8,130,2.4,0,0,1
3,300,390,50,12,70,17,110,3.0,130,3.0,0,0,1
4,400,520,50,14,70,19,110,4.0,130,4.0,0,0,1


## Separate independent variable (x) & dependent variable (y)

In [10]:
xArray = ['R_CHEMICAL_LITRE','R_PRESSURE','R_RICE_IN_KG','R_WATER_IN_LIT','Quality_AVERAGE','Quality_BAD','Quality_GOOD']
x = dataSetCopyDrop[xArray]
y = dataSetCopyDrop.iloc[:,[2,3,4,5,6,8]].values

## Split Train(80%) and Test(20%) Data

In [11]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=42)

## Build the Linear Regression model

In [12]:
reg=LinearRegression()
reg.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Predict value for the 20% test data

In [13]:
y_pred=reg.predict(x_test)

## Check for RMSE and R^2
- R^2 value ranges from 0 to 1

In [14]:
from sklearn.metrics import r2_score,mean_squared_error

rmse=np.sqrt(mean_squared_error(y_test,y_pred))
r2=r2_score(y_test,y_pred)
print('RMSE=',rmse)
print('R2 Score',r2)

RMSE= 6.07939079126866
R2 Score 0.21872478740287263


# Test model with external test data

## Load the test data

In [15]:
testData = pd.read_json('/home/sampras/MachineLearning/YieldImprovement/YI_OneHotEncoding/test_data.json')
testData

Unnamed: 0,Quality,R_RICE_IN_KG,R_WATER_IN_LIT,R_PRESSURE,R_CHEMICAL_LITRE
0,GOOD,300,330,1,1
1,GOOD,200,230,2,2
2,GOOD,100,130,1,1
3,AVERAGE,3000,3030,1,1
4,BAD,30,30,6,6


## Separate the Numeric and Categorical Columns

In [16]:
testdataSet_dict = dict(testData.dtypes)

test_numeric_columns=[]
test_categorial_columns=[]
for i in testdataSet_dict:
    if testdataSet_dict[i] in ['float64', 'int64', 'float32', 'int32']:
        test_numeric_columns.append(i)
    else:
        test_categorial_columns.append(i)

In [17]:
testDataCopy = testData.copy()

In [18]:
catTest = testDataCopy[test_categorial_columns]
catTest

Unnamed: 0,Quality
0,GOOD
1,GOOD
2,GOOD
3,AVERAGE
4,BAD


## Use OneHot  Encoder to encode the test categorical column

In [19]:
encodeTest = pd.get_dummies(catTest)
encodeTest

Unnamed: 0,Quality_AVERAGE,Quality_BAD,Quality_GOOD
0,0,0,1
1,0,0,1
2,0,0,1
3,1,0,0
4,0,1,0


## Concatenate the encoded test data to the original data

In [20]:
testDataConcat = pd.concat([testDataCopy,encodeTest], axis=1)
testDataConcat

Unnamed: 0,Quality,R_RICE_IN_KG,R_WATER_IN_LIT,R_PRESSURE,R_CHEMICAL_LITRE,Quality_AVERAGE,Quality_BAD,Quality_GOOD
0,GOOD,300,330,1,1,0,0,1
1,GOOD,200,230,2,2,0,0,1
2,GOOD,100,130,1,1,0,0,1
3,AVERAGE,3000,3030,1,1,1,0,0
4,BAD,30,30,6,6,0,1,0


## Drop the Categorical column from the Concatenated DataFrame

In [21]:
testDataDrop = testDataConcat.drop(['Quality'],axis=1)
testDataDrop

Unnamed: 0,R_RICE_IN_KG,R_WATER_IN_LIT,R_PRESSURE,R_CHEMICAL_LITRE,Quality_AVERAGE,Quality_BAD,Quality_GOOD
0,300,330,1,1,0,0,1
1,200,230,2,2,0,0,1
2,100,130,1,1,0,0,1
3,3000,3030,1,1,1,0,0
4,30,30,6,6,0,1,0


## Predict value for the test data

In [22]:
y_pred = reg.predict(testDataDrop)

In [23]:
y_pred

array([[  171.82690655,   351.23519282,  -127.14357942,   183.0080435 ,
          112.80153355,   130.        ],
       [  133.55549106,   245.16385421,   -65.48673338,   131.12072503,
          111.89192034,   130.        ],
       [   95.26740765,   139.08647449,    -3.83767139,    79.23601493,
          110.97581858,   130.        ],
       [ 1206.06327803,  3214.25358529, -1790.63482688,  1583.06229537,
          137.8066738 ,   130.        ],
       [   65.66381198,    45.4998807 ,    53.73626273,    32.34780702,
          111.66691687,   130.        ]])

## Convert the Predicted array to dataframe

In [24]:
df = pd.DataFrame(data=y_pred, columns=["Temp 1", "Temp Manintain time Min", "Temp2","Temp Maintain time","Temp3","Steam inlet temp Deg"])

In [25]:
df

Unnamed: 0,Temp 1,Temp Manintain time Min,Temp2,Temp Maintain time,Temp3,Steam inlet temp Deg
0,171.826907,351.235193,-127.143579,183.008043,112.801534,130.0
1,133.555491,245.163854,-65.486733,131.120725,111.89192,130.0
2,95.267408,139.086474,-3.837671,79.236015,110.975819,130.0
3,1206.063278,3214.253585,-1790.634827,1583.062295,137.806674,130.0
4,65.663812,45.499881,53.736263,32.347807,111.666917,130.0
