In [279]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

columns_to_use = ['KM_INT', 'FUEL','DRIVE','GEARBOX', 'AGE','Ford','Hyundai','Kia','Mahindra','Maruti Suzuki','Nissan','Tata','Toyota','Volkswagen','Black','Gray','Blue','Brown','Maroon','Red','Silver','Other Colors','Hatchback','Pickup','SUV','Sedan','1001-1500(cc)','1501-2000(cc)','2001-2500(cc)','2501-2900(cc)','2901-above(cc)']


df = pd.read_csv('Final-with-full-dummies_with_value_0and_1.csv');

In [280]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from statsmodels.stats.diagnostic import het_breuschpagan

pd.set_option('display.max_rows', None)

data=df.copy()
dependent_variable = 'PRICE_NEW'

data['AGE'] = data['AGE'].replace(0, 0.5);
df.head()
y = data[dependent_variable]
X = data[columns_to_use]


# print(model_linear.bse)


print('***********************Semi Log Summary*************************')
y_semi_log = np.log(y)
X_semi_log=X.copy()
X_semi_log = sm.add_constant(X_semi_log)  # Add a constant term for the intercept

model_semi_log = sm.OLS(y_semi_log, X_semi_log).fit()
# print(model_semi_log.summary())
# print(model_semi_log.bse)
residuals = model_semi_log.resid

# Perform Breusch-Pagan test
lm, lm_p_value, fvalue, f_p_value = het_breuschpagan(residuals, X_semi_log)

# Print the results
print("Breusch-Pagan test: Semi log")
print("LM Statistic:", lm)
print("LM Test p-value:", lm_p_value)
print("F-Statistic:", fvalue)
print("F-Test p-value:", f_p_value)




def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["Variable"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data


# For Semi-Log Model
# vif_semi_log = calculate_vif(X_semi_log)
# print('\nVIF for Semi-Log Model:')
# print(vif_semi_log)



***********************Semi Log Summary*************************
Breusch-Pagan test: Semi log
LM Statistic: 205.4031333235499
LM Test p-value: 2.4970846531084496e-37
F-Statistic: 18.90718443417809
F-Test p-value: 1.9250843673651448e-39


In [281]:
print('Box COX Implementation')
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
from statsmodels.stats.diagnostic import het_breuschpagan

# Assuming 'data' is your DataFrame
data = df.copy()

# Define the dependent and independent variables
dependent_variable = 'PRICE_NEW'

# Apply Box-Cox transformation to the dependent variable
y_transformed, lambda_value = stats.boxcox(data[dependent_variable])

print('*****lambda'+str(lambda_value)+'*****')
# Update the dependent variable in the DataFrame with the transformed values
data[dependent_variable] = y_transformed

# Separate the dependent and independent variables
y = data[dependent_variable]
X = data[columns_to_use]

# Add a constant term for the intercept
X_linear = sm.add_constant(X)

# Fit the linear regression model with the transformed dependent variable
print('***********************Box-Cox Transformed Summary*************************')
model_boxcox = sm.OLS(y, X_linear).fit()
# print(model_boxcox.summary())


predicted_boxcox = model_boxcox.predict(X_linear)

# Step 2: Transform predicted values back to original scale
predicted_original_scale = (np.power((predicted_boxcox * lambda_value) + 1, 1 / lambda_value))

# Step 3: Calculate residuals in original scale
residuals = data[dependent_variable] - predicted_original_scale

lm, lm_p_value, fvalue, f_p_value = het_breuschpagan(residuals, X_linear)

print("Breusch-Pagan test: Box Cox Transformation ")
print("LM Statistic:", lm)
print("LM Test p-value:", lm_p_value)
print("F-Statistic:", fvalue)
print("F-Test p-value:", f_p_value)

# Step 4: Calculate RMSE
rmse_boxcox = np.sqrt(np.mean(residuals**2))

print("RMSE for Box-Cox transformed model:", rmse_boxcox)

Box COX Implementation
*****lambda-0.08319755830379594*****
***********************Box-Cox Transformed Summary*************************
Breusch-Pagan test: Box Cox Transformation 
LM Statistic: 1889.6109239319037
LM Test p-value: 0.0
F-Statistic: 2059.180249660835
F-Test p-value: 0.0
RMSE for Box-Cox transformed model: 2057507.927277524


In [282]:

# WLS model for removing heteroskedasticity
import pandas as pd
import statsmodels.api as sm
import numpy as np
from statsmodels.stats.diagnostic import het_breuschpagan

pd.set_option('display.max_rows', None)

data = df.copy()
dependent_variable = 'PRICE_NEW'

data['AGE'] = data['AGE'].replace(0, 0.5);
df.head()

y = data[dependent_variable]
X = data[columns_to_use]


# Function to calculate weights based on squared residuals
def get_weights(resid):
    return 1 / np.square(resid)

# Run Semi-Log model (already in your code)
y_semi_log = np.log(y)
X_semi_log = sm.add_constant(X.copy())  # Add a constant term for the intercept
model_semi_log = sm.OLS(y_semi_log, X_semi_log).fit()
residuals = model_semi_log.resid

# Calculate weights based on squared residuals from Semi-Log model
weights = get_weights(residuals)

# Run WLS model with calculated weights
model_wls = sm.WLS(y_semi_log, X_semi_log, weights=weights).fit()
print('***********************WLS Summary*************************')
# print(model_wls.summary())

# Perform Breusch-Pagan test on WLS residuals
lm_wls, lm_wls_p_value, fvalue_wls, f_p_value_wls = het_breuschpagan(model_wls.resid, X_semi_log)

# Print WLS Breusch-Pagan test results
print("\nBreusch-Pagan test: WLS")
print("LM Statistic:", lm_wls)
print("LM Test p-value:", lm_wls_p_value)
print("F-Statistic:", fvalue_wls)
print("F-Test p-value:", f_p_value_wls)

***********************WLS Summary*************************

Breusch-Pagan test: WLS
LM Statistic: 206.20114446630475
LM Test p-value: 1.7079815123528634e-37
F-Statistic: 18.988877997475875
F-Test p-value: 1.2627982583130751e-39


In [283]:
# Robust model  and WLS for removing heteroskedasticity

import pandas as pd
import statsmodels.api as sm
import numpy as np
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.robust.robust_linear_model import RLM

pd.set_option('display.max_rows', None)

data = df.copy()
dependent_variable = 'PRICE_NEW'

data['AGE'] = data['AGE'].replace(0, 0.5)  # Handle missing age values

y = data[dependent_variable]
X = data[columns_to_use]


# Function to calculate weights based on squared residuals
def get_weights(resid):
    return 1 / np.square(resid)


# Run Semi-Log model (already in your code)
y_semi_log = np.log(y)
X_semi_log = sm.add_constant(X.copy())  # Add a constant term for the intercept

# Perform Breusch-Pagan test on Semi-Log model
lm, lm_p_value, fvalue, f_p_value = het_breuschpagan(residuals, X_semi_log)




# Run WLS model with calculated weights
weights = get_weights(residuals)
model_wls = sm.WLS(y_semi_log, X_semi_log, weights=weights).fit()


# Perform Breusch-Pagan test on WLS residuals
lm_wls, lm_wls_p_value, fvalue_wls, f_p_value_wls = het_breuschpagan(model_wls.resid, X_semi_log)



# Run Robust Regression (MM-estimator)
model_robust = RLM(y_semi_log, X_semi_log, M=sm.robust.norms.HuberT()).fit()
print('\n***********************Robust Regression Summary*************************')
# print(model_robust.summary())

# Perform Breusch-Pagan test on Robust Regression residuals
lm_robust, lm_robust_p_value, fvalue_robust, f_p_value_robust = het_breuschpagan(model_robust.resid, X_semi_log)

# Print Robust Regression Breusch-Pagan test results
print("\nBreusch-Pagan test: Robust Regression")
print("LM Statistic:", lm_robust)
print("LM Test p-value:", lm_robust_p_value)
print("F-Statistic:", fvalue_robust)
print("F-Test p-value:", f_p_value_robust)


***********************Robust Regression Summary*************************

Breusch-Pagan test: Robust Regression
LM Statistic: 204.07134862309087
LM Test p-value: 4.705909030328567e-37
F-Statistic: 18.771005423991173
F-Test p-value: 3.8885842406943385e-39


In [284]:
#Log Log model
import pandas as pd
import statsmodels.api as sm
import numpy as np
from statsmodels.stats.diagnostic import het_breuschpagan

pd.set_option('display.max_rows', None)

data=df.copy()
dependent_variable = 'PRICE_NEW'

data['AGE'] = data['AGE'].replace(0, 0.5);
df.head()
y = data[dependent_variable]
X = data[columns_to_use]

print('***********************Log Log Summary*************************')
X_log_log = X.copy()
X_log_log = sm.add_constant(X_log_log)  # Add a constant term for the intercept
y_log_log = np.log(y)

X_log_log[['KM_INT', 'AGE']] = np.log(X_log_log[['KM_INT', 'AGE']])  # Log-transform multiple independent variables
model_log_log = sm.OLS(y_log_log, X_log_log).fit()
# print(model_log_log.summary())
print('......Breisch-Pagan.......')

residuals = model_log_log.resid

# Perform Breusch-Pagan test
lml, lml_p_value, flvalue, fl_p_value = het_breuschpagan(residuals, X_log_log)


# Print the results
print("Breusch-Pagan test: Log log")
print("LM Statistic:", lml)
print("LM Test p-value:", lml_p_value)
print("F-Statistic:", flvalue)
print("F-Test p-value:", fl_p_value)

def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["Variable"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data


# For Log-Log Model
vif_log_log = calculate_vif(X_log_log)
print('\nVIF for Log-Log Model:')
print(vif_log_log)

***********************Log Log Summary*************************


KeyError: "None of [Index(['KM_INT', 'AGE'], dtype='object')] are in the [columns]"

In [None]:


# This is done to check the variables to be used as a predictor
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pandas as pd


dfNewCopy = df.copy()
y = dfNewCopy['PRICE_NEW']

dfNewCopy.head()
# dfNewCopy = df.drop(columns=df.columns[0])

X=dfNewCopy.drop(['PRICE_NEW'],axis=1)
X = sm.add_constant(X)


# # print(X.info())
model = sm.OLS(y.astype(float), X.astype(float)).fit()
# print(model.fit())

# print(len(X))
# print(len(y))


# # Display the summary
# print(model.summary())

column_names = X.columns

# Extract the coefficients from the model
coefficients = model.params

# Create a string representation of the regression equation
equation = 'Y = {:.2f}'.format(coefficients['const'])

for i, coefficient in enumerate(coefficients[1:], start=1):
    equation += ' + {:.2f} * {}'.format(coefficient, column_names[i-1])

print('******************Equation of linear model of hedonic price is')
# Display the equation
print(equation)

# # Extracting p-values from the summary
# p_values = summary.tables[1]['P>|t|']
# print("P-values:")
# print(p_values)


******************Equation of linear model of hedonic price is
Y = 2509111.38 + 779.38 * const + -6.02 * Unnamed: 0 + -627379.27 * KM_INT + -295880.05 * FUEL + 491180.67 * GEARBOX + -134570.54 * DRIVE + -1523187.06 * AGE + -539424.68 * Ford + 364253.39 * Hyundai + -1480027.05 * Kia + 204462.77 * Mahindra + 644933.00 * Maruti Suzuki + 25092.99 * Nissan + -370465.87 * Renault + 3408660.07 * Tata + 1774813.81 * Toyota + 33259.74 * Volkswagen + 346657.97 * Black + 146034.86 * Blue + 308529.77 * Brown + 707719.60 * Gray + 151801.86 * Maroon + 163730.77 * Other Colors + 336246.57 * Red + 315130.24 * Silver + 894582.26 * White + -552141.19 * Hatchback + 1385667.45 * Pickup + 599333.27 * SUV + 181669.59 * Sedan + -913017.00 * Van + -769844.11 * 0-1000(cc) + -245297.80 * 1001-1500(cc) + 350656.36 * 1501-2000(cc) + 1497247.77 * 2001-2500(cc) + 2589366.15 * 2501-2900(cc)
