# Example
use diamonds dataset to predict **price** in terms of other variables. For this, 
a. estimate the parameters
b. select the imporant input features 
c. write down the predictive model 

since price is practically continous therefore we need to apply linear regression:
to do the following tasks:
1.   parameter learning
2.   feature selection
3.   predictive model (use steps 1 and 2 and link function)
4.   prediction 
5.   Evaluate the performance of prediction (split the dataset into trainset and testset)

We can do Linear regression using 2 different ways:
1.  using statistical learning
2.  using sklearn

data modeling:
1. import the function of the model
2. create the model
3.  fit the model
4. prediction using the model

# Statistical way to recall GLM

In [69]:
#Import neccessary library
from statistics import mean
from statsmodels.api import GLM, add_constant, families
from pandas import read_csv, DataFrame, concat
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

# Data Preparation

In [70]:
data = read_csv('dataset/diamonds.csv')

X = data.drop(['price', 'Unnamed: 0'], axis=1)  # Input variables
y = data['price']  # Output variable

OE = OrdinalEncoder(categories=[['Fair', 'Good', 'Very Good', 'Ideal', 'Premium']])

data_cut_OE = OE.fit_transform(X[['cut']])
data_cut_DF = DataFrame(data_cut_OE)
data_cut_DF.columns = ['encoded cut']

OHE = OneHotEncoder(handle_unknown='ignore')

data_color_OHE = OHE.fit_transform(X[['color']])
data_color_DF = DataFrame(data_color_OHE.toarray())
data_color_DF.columns = OHE.get_feature_names_out()

data_clarity_OHE = OHE.fit_transform(X[['clarity']])
data_clarity_DF = DataFrame(data_clarity_OHE.toarray())
data_clarity_DF.columns = OHE.get_feature_names_out()

# Merging multiple DataFrames

In [71]:
X_binary = concat([data_color_DF, data_clarity_DF], axis=1) # Excluding data_cut_DF since it is ordinal
X_scalable = X[['carat', 'depth', 'table', 'length_mm', 'width_mm', 'depth_mm']]  # Orginal numeric columns
X_scalable = concat([X_scalable, data_cut_DF], axis=1)  # Including encoded ordinal columns

# Applying StandardScaler

In [72]:
X_scaled = StandardScaler().fit_transform(X_scalable)
X_scaled_DF = DataFrame(X_scaled)
X_scaled_DF.columns = X_scalable.columns

In [73]:
X_PREP = concat([X_scaled_DF, X_binary], axis=1)  # Prepared Data

# Data Modelling for feature selection / reduced model

In [74]:
X = add_constant(X_PREP)  # Ading constant column to handle bias; 
model_full = GLM(y, X, family=families.Gaussian(families.links.log())).fit()

In [75]:
pvalues = dict(model_full.pvalues)  # Identifying the columns to drop
drop_columns = []
for x in pvalues:
    if pvalues[x] >= 0.05:
        drop_columns.append(x)
# print(drop_columns)
X_red = X.drop(drop_columns, axis=1)

Predictive Model:  yhat=5.455245+0.001909*ZN+0.022373*INDUS+0.028275*RAD-0.002238*MEDV    

In [76]:
rmse_full = []
rmse_red = []
for i in range(20):
    X = add_constant(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    #*************Full Data Modelling*************
    
    # Creating the Linear Regression Model
    model_full = GLM(y_train, X_train, family=families.Gaussian(families.links.log())).fit()

    # Predicting using Test-set
    pred_full = model_full.predict(X_test)
    rmse_full.append(sqrt(mean_squared_error(y_test, pred_full)))
    
    #*************Reduced Data Modelling*************
    
    # Creating the Linear Regression Model
    X = add_constant(X_red)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model_reduced = GLM(y_train, X_train, family=families.Gaussian(families.links.log())).fit()
    
    # Predicting using Test-set
    pred_red = model_reduced.predict(X_test)
    rmse_red.append(sqrt(mean_squared_error(y_test, pred_red)))

[mean(rmse_full), mean(rmse_red)]

[228.54546392655357, 231.82159530363964]

# Deployment phase 

In [77]:
X = add_constant(X_red)
model_dep = GLM(y, X, family=families.Gaussian(families.links.log())).fit()  # create the linear reg model using

# Predicting using testset

In [78]:
new = X.tail(1)
pred_red = model_dep.predict(new)
print(pred_red)
y.tail(1)

858    2824.675874
dtype: float64


858    2871
Name: price, dtype: int64

# Statistical & sklearn way to recall GLM

In [88]:
# Import neccessary library
from statistics import mean
from statsmodels.api import GLM, add_constant, families
from pandas import read_csv, DataFrame, concat
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

# Data Preparation

In [89]:
data = read_csv('dataset/diamonds.csv')

X = data.drop(['price', 'Unnamed: 0'], axis=1)  # Input variables
y = data['price']  # Output variable

OE = OrdinalEncoder(categories=[['Fair', 'Good', 'Very Good', 'Ideal', 'Premium']])
data_cut_OE = OE.fit_transform(X[['cut']])
data_cut_DF = DataFrame(data_cut_OE)
data_cut_DF.columns = ['encoded cut']

OHE = OneHotEncoder(handle_unknown='ignore')

data_color_OHE = OHE.fit_transform(X[['color']])
data_color_DF = DataFrame(data_color_OHE.toarray())
data_color_DF.columns = OHE.get_feature_names_out()

data_clarity_OHE = OHE.fit_transform(X[['clarity']])
data_clarity_DF = DataFrame(data_clarity_OHE.toarray())
data_clarity_DF.columns = OHE.get_feature_names_out()

# Merging multiple DataFrames

In [90]:
X_binary = concat([data_color_DF, data_clarity_DF], axis=1) # Excluding data_cut_DF since it is ordinal
X_scalable = X[['carat', 'depth', 'table', 'length_mm', 'width_mm', 'depth_mm']]  # Orginal numeric columns
X_scalable = concat([X_scalable, data_cut_DF], axis=1) # Including encoded ordinal columns

# Applying standard scalar

In [91]:
X_scaled = StandardScaler().fit_transform(X_scalable)
X_scaled_DF = DataFrame(X_scaled)
X_scaled_DF.columns = X_scalable.columns

In [92]:
X_PREP = concat([X_scaled_DF, X_binary], axis=1)  # Prepared Data

# Data Modelling for feature selection / reduced model

In [93]:
X = add_constant(X_PREP)  # Ading constant column to handle bias
model_full = GLM(y, X, family=families.Gaussian(families.links.log())).fit()

In [94]:
pvalues = dict(model_full.pvalues)  # Identifying the columns to drop
drop_columns = []
for x in pvalues:
    if pvalues[x] >= 0.05:
        drop_columns.append(x)
# print(drop_columns)
X_red = X.drop(drop_columns, axis=1)

Predictive Model:  yhat=5.455245+0.001909*ZN+0.022373*INDUS+0.028275*RAD-0.002238*MEDV

In [95]:
rmse_st_full = []
rmse_st_red = []
rmse_skl_full = []
rmse_skl_red = []

model_skl = LinearRegression()
for i in range(20):
    #************************* Full Data Modelling  ***************************#
    X = add_constant(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    # statsmodels
    model_st_full = GLM(y_train, X_train, family=families.Gaussian(families.links.log())).fit()

    # Predicting using Test-set
    pred_st_full = model_st_full.predict(X_test)
    rmse_st_full.append(sqrt(mean_squared_error(y_test, pred_st_full)))
    
    # sklearn
    model_skl.fit(X_train, y_train)
    
    # Predicting using Test-set
    pred_skl_full = model_skl.predict(X_test)
    rmse_skl_full.append(sqrt(mean_squared_error(y_test, pred_skl_full)))
    
    #************************* Reduced Data Modelling  ***************************#
    X = add_constant(X_red)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    # statsmodels
    model_st_red = GLM(y_train, X_train, family=families.Gaussian(families.links.log())).fit()  # create the linear reg model using
    
    # Predicting using Test-set
    pred_red = model_reduced.predict(X_test)
    rmse_st_red.append(sqrt(mean_squared_error(y_test, pred_red)))
    
    # sklearn
    model_skl.fit(X_train, y_train)
    
    # Predicting using Test-set
    pred_skl_red = model_skl.predict(X_test)
    rmse_skl_red.append(sqrt(mean_squared_error(y_test, pred_skl_red)))
    

[mean(rmse_st_full), mean(rmse_st_red), mean(rmse_skl_full), mean(rmse_skl_red)]

[232.67484661804366, 221.39497367924946, 163.26019247975657, 162.2927343939966]

# Deployment Phase

In [96]:
X = add_constant(X_red)
model_skl_red_dep = model_skl.fit(X, y)

# Predicting using Test-set
# Predict the tax using the linear regression for the last row in the boston_house-prices dataset

In [97]:
prediction = model_skl_red_dep.predict(X.tail(1))
print(prediction)
y.tail(1)

[2809.03468253]


858    2871
Name: price, dtype: int64

# Logistic regression
Use boston_house price dataset, 
1.   Consider CHAS as the output variable 
2.   the remaining variables are inputs 
apply logistic regression to do the following tasks:
*  parameter estimation 
*   attribute selection 
*  predictive model
*   prediction

In [98]:
# Import neccessary library
from statistics import mean
from statsmodels.api import GLM, add_constant, families
from pandas import read_csv, DataFrame
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

# Data Preparation

In [99]:
data = read_csv('dataset/boston_house_prices.csv')
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [100]:
data['CHAS'].unique()

array([0, 1], dtype=int64)

In [101]:
X = data.drop('CHAS', axis=1)  # input
y = data['CHAS']   # output

# Data Modelling for feature selection / reduced model

# Logistic regression using statsmodels

In [104]:
X = add_constant(X)  # add intercept
model_st = GLM(y, X, family=families.Binomial()).fit()

In [105]:
pvalues = dict(model_st.pvalues)  # Identifying the columns to drop
drop_columns = []
for x in pvalues:
    if pvalues[x] >= 0.05:
        drop_columns.append(x)
# print("Columns to be romoved:", drop_columns)

In [106]:
X_red = X.drop(drop_columns, axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=3)  # testset is 10%

Predictive model based on above results:

that_i = 0.0952 * INDUS_i + 0.2191 * RAD_i - 0.0087 * TAX_i + 0.0693 * MEDV_i

1.   phat_i = 1 / (1 + exp(-that_i))
2.   yhat_i = 1 if phat_i >= 0.5 else yhat_i = 0

In [107]:
model_st_red = GLM(y_train, X_train, family=families.Binomial()).fit() # logistic regression

In [108]:
prediction_st_raw = model_st_red.predict(X_test)
prediction_st_raw

224    0.245663
137    0.045313
453    0.067303
303    0.067967
254    0.005386
37     0.029412
442    0.112732
417    0.000672
16     0.022943
209    0.024132
126    0.208278
157    0.383280
196    0.020893
266    0.161863
404    0.000053
399    0.013794
116    0.023814
127    0.037455
134    0.025365
201    0.007439
503    0.053354
161    0.534502
287    0.025959
73     0.026413
439    0.035295
325    0.039428
112    0.018129
310    0.017442
14     0.016644
230    0.053991
27     0.009174
291    0.053341
479    0.018896
102    0.007754
124    0.321740
376    0.010298
248    0.018770
237    0.111496
354    0.002151
334    0.037681
153    0.216423
392    0.014511
218    0.113605
458    0.048127
357    0.211703
101    0.023423
269    0.027910
211    0.023232
348    0.015260
103    0.012016
349    0.003932
dtype: float64

# Converting predictions to binary predicted values

In [109]:
prediction_st = [1 if prediction > 0.5 else 0 for prediction in prediction_st_raw]
df = DataFrame({"Prediction": prediction_st, "Actual": y_test})
df.head()


Unnamed: 0,Prediction,Actual
224,0,0
137,0,0
453,0,0
303,0,0
254,0,0


# Logistic regression using sklearn

In [110]:
model_skl = LogisticRegression(max_iter=10000).fit(X_train, y_train)
prediction_skl = model_skl.predict(X_test)

In [111]:
df = DataFrame({"Prediction_st": prediction_st, "Prediction_skl": prediction_skl, "Actual": y_test})
df.head()

Unnamed: 0,Prediction_st,Prediction_skl,Actual
224,0,0,0
137,0,0,0
453,0,0,0
303,0,0,0
254,0,0,0


# Compute recall and accuracy

In [112]:
from sklearn.metrics import accuracy_score, recall_score
# Prediction_st
acc_st = accuracy_score(y_test, prediction_st)
re_st = recall_score(y_test, prediction_st)
acc_skl = accuracy_score(y_test, prediction_skl)
re_skl = recall_score(y_test, prediction_skl)
print([acc_st, acc_skl])
[re_st, re_skl]


[0.8823529411764706, 0.8823529411764706]


[0.0, 0.0]