# Example
use diamonds dataset to predict **price** in terms of other variables. For this, 
a. estimate the parameters
b. select the imporant input features 
c. write down the predictive model 

since price is practically continous therefore we need to apply linear regression:
to do the following tasks:
1.   parameter learning
2.   feature selection
3.   predictive model (use steps 1 and 2 and link function)
4.   prediction 
5.   Evaluate the performance of prediction (split the dataset into trainset and testset)

We can do Linear regression using 2 different ways:
1.  using statistical learning
2.  using sklearn

data modeling:
1. import the function of the model
2. create the model
3.  fit the model
4. prediction using the model

# Statistical way to recall GLM

In [16]:
#Import neccessary library
from statistics import mean
import statsmodels.api as sm
from statsmodels.api import GLM, add_constant
from pandas import read_csv, DataFrame, concat
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

# Data Preparation

In [17]:
data = read_csv('dataset/diamonds.csv')

X = data.drop(['price', 'Unnamed: 0'], axis=1)  # Input variables
y = data['price']  # Output variable

OE = OrdinalEncoder(categories=[['Fair', 'Good', 'Very Good', 'Ideal', 'Premium']])

data_cut_OE = OE.fit_transform(X[['cut']])
data_cut_DF = DataFrame(data_cut_OE)
data_cut_DF.columns = ['encoded cut']

OHE = OneHotEncoder(handle_unknown='ignore')

data_color_OHE = OHE.fit_transform(X[['color']])
data_color_DF = DataFrame(data_color_OHE.toarray())
data_color_DF.columns = OHE.get_feature_names_out()

data_clarity_OHE = OHE.fit_transform(X[['clarity']])
data_clarity_DF = DataFrame(data_clarity_OHE.toarray())
data_clarity_DF.columns = OHE.get_feature_names_out()

# Merging multiple DataFrames

In [18]:
X_binary = concat([data_color_DF, data_clarity_DF], axis=1) # Excluding data_cut_DF since it is ordinal
X_scalable = X[['carat', 'depth', 'table', 'length_mm', 'width_mm', 'depth_mm']]  # Orginal numeric columns
X_scalable = concat([X_scalable, data_cut_DF], axis=1)  # Including encoded ordinal columns

# Applying StandardScaler

In [19]:
X_scaled = StandardScaler().fit_transform(X_scalable)
X_scaled_DF = DataFrame(X_scaled)
X_scaled_DF.columns = X_scalable.columns

In [20]:
X_PREP = concat([X_scaled_DF, X_binary], axis=1)  # Prepared Data

# Data Modelling for feature selection / reduced model

In [21]:
X = add_constant(X_PREP)  # Ading constant column to handle bias; 
model_full = GLM(y, X, family=sm.families.Gaussian(sm.families.links.log()))
res = model_full.fit()  # Fitting the model
res.summary()

0,1,2,3
Dep. Variable:,price,No. Observations:,859.0
Model:,GLM,Df Residuals:,838.0
Model Family:,Gaussian,Df Model:,20.0
Link Function:,log,Scale:,50649.0
Method:,IRLS,Log-Likelihood:,-5861.0
Date:,"Wed, 08 Mar 2023",Deviance:,42444000.0
Time:,02:11:25,Pearson chi2:,42400000.0
No. Iterations:,22,Pseudo R-squ. (CS):,1.0
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,6.0650,0.005,1159.381,0.000,6.055,6.075
carat,-0.5882,0.039,-14.960,0.000,-0.665,-0.511
depth,-0.0430,0.045,-0.958,0.338,-0.131,0.045
table,0.0243,0.004,6.350,0.000,0.017,0.032
length_mm,0.0201,0.099,0.203,0.839,-0.174,0.215
width_mm,0.3744,0.092,4.087,0.000,0.195,0.554
depth_mm,0.7281,0.171,4.270,0.000,0.394,1.062
encoded cut,0.0122,0.003,3.517,0.000,0.005,0.019
color_D,0.9806,0.009,111.662,0.000,0.963,0.998


In [22]:
pvalues = dict(res.pvalues)  # Identifying the columns to drop
for x in pvalues:
    if pvalues[x] >= 0.05:
        print(x)

depth
length_mm


In [23]:
X_red = X.drop(['depth', 'length_mm'], axis=1)

Predictive Model:  yhat=5.455245+0.001909*ZN+0.022373*INDUS+0.028275*RAD-0.002238*MEDV    

In [24]:
rmse_full = []
rmse_red = []
for i in range(20):
    X = add_constant(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    #*************Full Data Modelling*************
    
    # Creating the Linear Regression Model
    model_full_train = GLM(y_train, X_train, family=sm.families.Gaussian(sm.families.links.log()))
    model_full = model_full_train.fit()  # Fitting the model
    model_full.summary()

    # Predicting using Test-set
    pred_full = model_full.predict(X_test)
    rmse1 = sqrt(mean_squared_error(y_test, pred_full))
    rmse_full.append(rmse1)
    
    #*************Reduced Data Modelling*************
    
    # Creating the Linear Regression Model
    X = add_constant(X_red)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model_reduced_train = GLM(y_train, X_train, family=sm.families.Gaussian(sm.families.links.log()))
    model_reduced = model_reduced_train.fit()
    
    # Predicting using Test-set
    pred_red = model_reduced.predict(X_test)
    rmse2 = sqrt(mean_squared_error(y_test, pred_red))
    rmse_red.append(rmse2)

[mean(rmse_full), mean(rmse_red)]

[228.95992067970442, 230.3011580919856]

# Deployment phase 

In [25]:
X = add_constant(X_red)
model_train = GLM(y, X, family=sm.families.Gaussian(sm.families.links.log()))  # create the linear reg model using
red_model = model_train.fit() # fit the model

# Predicting using testset

In [26]:
new = X.tail(1)
pred_red = red_model.predict(new)
print(pred_red)
y.tail(1)

858    2824.587072
dtype: float64


858    2871
Name: price, dtype: int64

# Statistical & sklearn way to recall GLM

In [27]:
# Import neccessary library
from statistics import mean
import statsmodels.api as sm
from statsmodels.api import GLM, add_constant
from pandas import read_csv, DataFrame, concat
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

# Data Preparation

In [28]:
data = read_csv('dataset/diamonds.csv')

X = data.drop(['price', 'Unnamed: 0'], axis=1)  # Input variables
y = data['price']  # Output variable

OE = OrdinalEncoder(categories=[['Fair', 'Good', 'Very Good', 'Ideal', 'Premium']])
data_cut_OE = OE.fit_transform(X[['cut']])
data_cut_DF = DataFrame(data_cut_OE)
data_cut_DF.columns = ['encoded cut']

OHE = OneHotEncoder(handle_unknown='ignore')

data_color_OHE = OHE.fit_transform(X[['color']])
data_color_DF = DataFrame(data_color_OHE.toarray())
data_color_DF.columns = OHE.get_feature_names_out()

data_clarity_OHE = OHE.fit_transform(X[['clarity']])
data_clarity_DF = DataFrame(data_clarity_OHE.toarray())
data_clarity_DF.columns = OHE.get_feature_names_out()

# Merging multiple DataFrames

In [29]:
X_binary = concat([data_color_DF, data_clarity_DF], axis=1) # Excluding data_cut_DF since it is ordinal
X_scalable = X[['carat', 'depth', 'table', 'length_mm', 'width_mm', 'depth_mm']]  # Orginal numeric columns
X_scalable = concat([X_scalable, data_cut_DF], axis=1) # Including encoded ordinal columns

# Applying standard scalar

In [30]:
X_scaled = StandardScaler().fit_transform(X_scalable)
X_scaled_DF = DataFrame(X_scaled)
X_scaled_DF.columns = X_scalable.columns

In [31]:
X_PREP = concat([X_scaled_DF, X_binary], axis=1)  # Prepared Data

# Data Modelling for feature selection / reduced model

In [32]:
X = add_constant(X_PREP)  # Ading constant column to handle bias
model_full = GLM(y, X, family=sm.families.Gaussian(sm.families.links.log()))
res = model_full.fit()  # Fitting the model
res.summary()

0,1,2,3
Dep. Variable:,price,No. Observations:,859.0
Model:,GLM,Df Residuals:,838.0
Model Family:,Gaussian,Df Model:,20.0
Link Function:,log,Scale:,50649.0
Method:,IRLS,Log-Likelihood:,-5861.0
Date:,"Wed, 08 Mar 2023",Deviance:,42444000.0
Time:,02:12:14,Pearson chi2:,42400000.0
No. Iterations:,22,Pseudo R-squ. (CS):,1.0
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,6.0650,0.005,1159.381,0.000,6.055,6.075
carat,-0.5882,0.039,-14.960,0.000,-0.665,-0.511
depth,-0.0430,0.045,-0.958,0.338,-0.131,0.045
table,0.0243,0.004,6.350,0.000,0.017,0.032
length_mm,0.0201,0.099,0.203,0.839,-0.174,0.215
width_mm,0.3744,0.092,4.087,0.000,0.195,0.554
depth_mm,0.7281,0.171,4.270,0.000,0.394,1.062
encoded cut,0.0122,0.003,3.517,0.000,0.005,0.019
color_D,0.9806,0.009,111.662,0.000,0.963,0.998


In [33]:
pvalues = dict(res.pvalues)  # Identifying the columns to drop
for x in pvalues:
    if pvalues[x] >= 0.05:
        print(x)

depth
length_mm


In [34]:
X_red = X.drop(['depth', 'length_mm'], axis=1)

Predictive Model:  yhat=5.455245+0.001909*ZN+0.022373*INDUS+0.028275*RAD-0.002238*MEDV

In [35]:
rmse_st_full = []
rmse_st_red = []
rmse_skl_full = []
rmse_skl_red = []

model_skl = LinearRegression()
for i in range(20):
    X = add_constant(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    #************************* Full Data Modelling  ***************************#

    # Full Data Modelling -statsmodels
    model_st_full_train = GLM(y_train, X_train, family=sm.families.Gaussian(sm.families.links.log()))
    model_st_full = model_st_full_train.fit()  # Fitting the model

    # Predicting using Test-set
    pred_st_full = model_st_full.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, pred_st_full))
    rmse_st_full.append(rmse)
    
    # Full Data Modelling -sklearn
    model_skl.fit(X_train, y_train)
    
    # Predicting using Test-set
    pred_skl_full = model_skl.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, pred_skl_full))
    rmse_skl_full.append(rmse)
    
    #************************* Reduced Data Modelling  ***************************#
    
    X = add_constant(X_red)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    # Reduced Data Modelling -statsmodels
    
    model_st_red_train = GLM(y_train, X_train, family=sm.families.Gaussian(sm.families.links.log()))  # create the linear reg model using
    model_st_red = model_st_red_train.fit()
    
    # Predicting using Test-set
    pred_red = model_reduced.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, pred_red))
    rmse_st_red.append(rmse)
    
    # Reduced Data Modelling -sklearn
    
    model_skl.fit(X_train, y_train)
    
    # Predicting using Test-set
    pred_skl_red = model_skl.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, pred_skl_red))
    rmse_skl_red.append(rmse)
    

[mean(rmse_st_full), mean(rmse_st_red), mean(rmse_skl_full), mean(rmse_skl_red)]

[228.75860053290018,
 222.24709028747301,
 162.68406746228032,
 160.65380859457005]

# Deployment Phase

In [36]:
X = add_constant(X_red)
model_skl_red_dep = model_skl.fit(X, y)

# Predicting using Test-set

In [39]:
pred_st_red_dep = model_skl_red_dep.predict(X.tail(1))
print(pred_st_red_dep)
y.tail(1)

[2809.03468253]


858    2871
Name: price, dtype: int64