# Basic Imports

In [1]:
# macOS-specific workaround; Windows/Linux users don't need this.
# Without it, using xgboost raises errors and the kernel dies.
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas/sklearn,
# so API removals only surface as hard errors after an upgrade.
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import seaborn as sns
import pandas as pd
import math as mt

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error

# Variable Transformation

In [10]:
# Seed NumPy's global RNG so the synthetic features (and the noise drawn from
# the same generator in later cells) are identical on every Restart & Run All.
np.random.seed(42)

# 4000 samples of five independent features, each uniform on [0, 1).
x_data = pd.DataFrame(
    np.random.random(size=(4000, 5)),
    columns=['x1', 'x2', 'x3', 'x4', 'x5'],
)
x_data.shape

(4000, 5)

In [11]:
# Preview the first five rows of the feature frame.
x_data.head(5)

Unnamed: 0,x1,x2,x3,x4,x5
0,0.220938,0.843987,0.498426,0.894416,0.726749
1,0.054855,0.5234,0.903358,0.204263,0.184858
2,0.46237,0.893711,0.025692,0.361849,0.280721
3,0.964271,0.572357,0.775846,0.883013,0.674869
4,0.752567,0.127214,0.800428,0.241559,0.967697


In [12]:
# Every target has the form: 1.5 + (sum of per-feature terms) + uniform noise.
feature_cols = ['x1', 'x2', 'x3', 'x4', 'x5']


def _noisy_sum(terms):
    """Add `terms` left-to-right onto the 1.5 offset, then add fresh noise."""
    total = 1.5
    for term in terms:
        total = total + term
    return total + np.random.random(size=4000)


# y1: linear in each feature
x_data['y1'] = _noisy_sum([x_data[col] for col in feature_cols])

# y2: quadratic in each feature
x_data['y2'] = _noisy_sum([x_data[col] ** 2 for col in feature_cols])

# y3: natural log of each feature
x_data['y3'] = _noisy_sum([np.log(x_data[col]) for col in feature_cols])

# y4: reciprocal of each feature
x_data['y4'] = _noisy_sum([1 / x_data[col] for col in feature_cols])

# y5: ratios of consecutive features (x2/x1, x3/x2, x4/x3, x5/x4)
x_data['y5'] = _noisy_sum([
    x_data[upper] / x_data[lower]
    for lower, upper in zip(feature_cols, feature_cols[1:])
])

x_data.head(5)

Unnamed: 0,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5
0,0.220938,0.843987,0.498426,0.894416,0.726749,5.313187,4.470377,-0.771405,11.760815,9.254034
1,0.054855,0.5234,0.903358,0.204263,0.184858,3.894253,3.255108,-5.403951,33.792295,14.229336
2,0.46237,0.893711,0.025692,0.361849,0.280721,4.515149,3.568327,-5.109311,50.983507,19.320714
3,0.964271,0.572357,0.775846,0.883013,0.674869,6.198952,5.114783,0.580162,8.818041,6.163644
4,0.752567,0.127214,0.800428,0.241559,0.967697,4.980558,3.870724,-2.242343,17.985796,12.313194


In [13]:
# Hold out 20% of the rows; random_state pins the shuffle so the split is stable.
train, test = train_test_split(x_data, test_size=0.2, random_state=2)

# Columns that are prediction targets rather than model inputs.
target_cols = ['y1', 'y2', 'y3', 'y4', 'y5']

# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
x_train = train.drop(columns=target_cols)
y_train = train[target_cols]

x_test = test.drop(columns=target_cols)
y_test = test[target_cols]

In [14]:
# Empty score table; each evaluation helper adds one row to it.
result_columns = ["Algo", "Relation", "MAE"]
results = pd.DataFrame(columns=result_columns)

In [15]:
def Random_Forest(x,y,relation):
    """Fit a random forest on one target and record its test MAE in `results`.

    Parameters
    ----------
    x : training features passed straight to ``fit``.
    y : str
        Name of the target column, looked up in the notebook-level
        ``y_train`` and ``y_test`` frames.
    relation : label stored in the 'Relation' column of the new row.

    Notes
    -----
    Reads the notebook globals ``x_test``, ``y_train`` and ``y_test`` and
    rebinds the global ``results`` with one extra row. Returns None.
    """
    global results

    rf = RandomForestRegressor(n_estimators=200)
    rf.fit(x, y_train[y])
    y_pred = rf.predict(x_test)
    mae = np.round(mean_absolute_error(y_test[y], y_pred), 4)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to add a row and works on older pandas versions as well.
    row = pd.DataFrame([{'Algo': "Random Forest", 'Relation': relation, 'MAE': mae}])
    results = pd.concat([results, row], ignore_index=True)
    

def SV_M (x,y, relation):
    """Fit a default-parameter SVR on one target and record its test MAE.

    Parameters
    ----------
    x : training features passed straight to ``fit``.
    y : str
        Name of the target column, looked up in the notebook-level
        ``y_train`` and ``y_test`` frames.
    relation : label stored in the 'Relation' column of the new row.

    Notes
    -----
    Reads the notebook globals ``x_test``, ``y_train`` and ``y_test`` and
    rebinds the global ``results`` with one extra row. Returns None.
    """
    global results

    svr = SVR()
    svr.fit(x, y_train[y])
    y_pred = svr.predict(x_test)
    mae = np.round(mean_absolute_error(y_test[y], y_pred), 4)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to add a row and works on older pandas versions as well.
    row = pd.DataFrame([{'Algo': "SVM", 'Relation': relation, 'MAE': mae}])
    results = pd.concat([results, row], ignore_index=True)


def XG_B(x,y, relation):
    """Fit an XGBoost regressor on one target and record its test MAE.

    Parameters
    ----------
    x : training features passed straight to ``fit``.
    y : str
        Name of the target column, looked up in the notebook-level
        ``y_train`` and ``y_test`` frames.
    relation : label stored in the 'Relation' column of the new row.

    Notes
    -----
    Reads the notebook globals ``x_test``, ``y_train`` and ``y_test`` and
    rebinds the global ``results`` with one extra row. Returns None.
    """
    global results

    # NOTE(review): a 0.01 learning rate with only 200 boosting rounds may
    # leave the model under-fitted relative to the others - confirm intended.
    xgb = XGBRegressor(n_estimators=200, learning_rate=.01, objective='reg:squarederror')
    xgb.fit(x, y_train[y])
    y_pred = xgb.predict(x_test)
    mae = np.round(mean_absolute_error(y_test[y], y_pred), 4)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to add a row and works on older pandas versions as well.
    row = pd.DataFrame([{'Algo': "XGB", 'Relation': relation, 'MAE': mae}])
    results = pd.concat([results, row], ignore_index=True)


In [16]:
# Sanity check: (rows, columns) of the training feature frame.
x_train.shape

(3200, 5)

In [17]:
# SET-1: target y1, relation label "x" - run all three evaluators in order.
for evaluate in (Random_Forest, SV_M, XG_B):
    evaluate(x_train, "y1", "x")

In [18]:
# SET-2: target y2, relation label "x^2" - run all three evaluators in order.
for evaluate in (Random_Forest, SV_M, XG_B):
    evaluate(x_train, "y2", "x^2")



In [19]:
# SET-3: target y3, relation label "log(x)" - run all three evaluators in order.
for evaluate in (Random_Forest, SV_M, XG_B):
    evaluate(x_train, "y3", "log(x)")

In [20]:
# SET-4: target y4, relation label "1/x" - run all three evaluators in order.
for evaluate in (Random_Forest, SV_M, XG_B):
    evaluate(x_train, "y4", "1/x")

In [21]:
# SET-5: target y5, relation label "Ratio" - run all three evaluators in order.
for evaluate in (Random_Forest, SV_M, XG_B):
    evaluate(x_train, "y5", "Ratio")


In [22]:
# Show the MAE recorded for each algorithm / relationship run.
results

Unnamed: 0,Algo,Relation,MAE
0,Random Forest,x,0.2716
1,SVM,x,0.2572
2,XGB,x,0.5448
3,Random Forest,x^2,0.2706
4,SVM,x^2,0.2644
5,XGB,x^2,0.4619
6,Random Forest,log(x),0.3885
7,SVM,log(x),0.4192
8,XGB,log(x),0.6337
9,Random Forest,1/x,7.3836


In [None]:
# Inverse transform of each feature:
#x1 -> x6 = 1/x1
#x2 -> x7 = 1/x2
#x3 -> x8 = 1/x3
#x4 -> x9 = 1/x4
#x5 -> x10 = 1/x5

# before transformation - x_train = [x1, x2, .. x5]
# after transformation - x_train_new = [x6, x7, .. x10]