In [1]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

columns_to_use = ['KM_INT', 'FUEL','DRIVE','GEARBOX', 'AGE','Ford','Hyundai','Kia','Mahindra','Maruti Suzuki','Nissan','Tata','Toyota','Volkswagen','Black','Gray','Blue','Brown','Maroon','Red','Silver','Other Colors','Hatchback','Pickup','SUV','Sedan','1001-1500(cc)','1501-2000(cc)','2001-2500(cc)','2501-2900(cc)','2901-above(cc)']
# columns_to_use = [ 'FUEL','DRIVE','GEARBOX', 'AGE','Ford','Hyundai','Kia','Mahindra','Maruti Suzuki','Nissan','Tata','Toyota','Volkswagen','Black','Gray','Blue','Brown','Maroon','Red','Silver','Other Colors','Hatchback','Pickup','SUV','Sedan','1001-1500(cc)','1501-2000(cc)','2001-2500(cc)','2501-2900(cc)','2901-above(cc)']

df = pd.read_csv('Final-with-full-dummies_with_value_0and_1.csv');

In [2]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from statsmodels.stats.diagnostic import het_breuschpagan

pd.set_option('display.max_rows', None)

data=df.copy()
dependent_variable = 'PRICE_NEW'

data['AGE'] = data['AGE'].replace(0, 0.5);
df.head()
y = data[dependent_variable]
X = data[columns_to_use]


# print(model_linear.bse)


print('***********************Semi Log Summary*************************')
y_semi_log = np.log(y)
X_semi_log=X.copy()
X_semi_log = sm.add_constant(X_semi_log)  # Add a constant term for the intercept

model_semi_log = sm.OLS(y_semi_log, X_semi_log).fit()
print(model_semi_log.summary())
# print(model_semi_log.bse)
residuals = model_semi_log.resid

# Perform Breusch-Pagan test
lm, lm_p_value, fvalue, f_p_value = het_breuschpagan(residuals, X_semi_log)

# Print the results
print("Breusch-Pagan test: Semi log")
print("LM Statistic:", lm)
print("LM Test p-value:", lm_p_value)
print("F-Statistic:", fvalue)
print("F-Test p-value:", f_p_value)

print('***********************Log Log Summary*************************')
X_log_log = X.copy()
X_log_log = sm.add_constant(X_log_log)  # Add a constant term for the intercept
y_log_log = np.log(y)

X_log_log[['KM_INT', 'AGE']] = np.log(X_log_log[['KM_INT', 'AGE']])  # Log-transform multiple independent variables
model_log_log = sm.OLS(y_log_log, X_log_log).fit()
print(model_log_log.summary())
print('......Breisch-Pagan.......')

residuals = model_log_log.resid

# Perform Breusch-Pagan test
lm, lm_p_value, fvalue, f_p_value = het_breuschpagan(residuals, X_log_log)

print("Breusch-Pagan test: Semi log")
print("LM Statistic:", lm)
print("LM Test p-value:", lm_p_value)
print("F-Statistic:", fvalue)
print("F-Test p-value:", f_p_value)

def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["Variable"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data


# For Semi-Log Model
vif_semi_log = calculate_vif(X_semi_log)
print('\nVIF for Semi-Log Model:')
print(vif_semi_log)

# For Log-Log Model
#vif_log_log = calculate_vif(X_log_log)
#print('\nVIF for Log-Log Model:')
#print(vif_log_log)

***********************Semi Log Summary*************************
                            OLS Regression Results                            
Dep. Variable:              PRICE_NEW   R-squared:                       0.686
Model:                            OLS   Adj. R-squared:                  0.682
Method:                 Least Squares   F-statistic:                     142.2
Date:                Sun, 19 May 2024   Prob (F-statistic):               0.00
Time:                        21:06:01   Log-Likelihood:                -834.02
No. Observations:                2045   AIC:                             1732.
Df Residuals:                    2013   BIC:                             1912.
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------

In [3]:
print('Box COX Implementation')
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
from statsmodels.stats.diagnostic import het_breuschpagan

# Assuming 'data' is your DataFrame
data = df.copy()

# Define the dependent and independent variables
dependent_variable = 'PRICE_NEW'

# Apply Box-Cox transformation to the dependent variable
y_transformed, lambda_value = stats.boxcox(data[dependent_variable])

print('*****lambda'+str(lambda_value)+'*****')
# Update the dependent variable in the DataFrame with the transformed values
data[dependent_variable] = y_transformed

# Separate the dependent and independent variables
y = data[dependent_variable]
X = data[columns_to_use]

# Add a constant term for the intercept
X_linear = sm.add_constant(X)

# Fit the linear regression model with the transformed dependent variable
print('***********************Box-Cox Transformed Summary*************************')
model_boxcox = sm.OLS(y, X_linear).fit()
print(model_boxcox.summary())


predicted_boxcox = model_boxcox.predict(X_linear)

# Step 2: Transform predicted values back to original scale
predicted_original_scale = (np.power((predicted_boxcox * lambda_value) + 1, 1 / lambda_value))

# Step 3: Calculate residuals in original scale
residuals = data[dependent_variable] - predicted_original_scale

lm, lm_p_value, fvalue, f_p_value = het_breuschpagan(residuals, X_linear)

print("Breusch-Pagan test: Box Cox Transformation ")
print("LM Statistic:", lm)
print("LM Test p-value:", lm_p_value)
print("F-Statistic:", fvalue)
print("F-Test p-value:", f_p_value)

# Step 4: Calculate RMSE
rmse_boxcox = np.sqrt(np.mean(residuals**2))

print("RMSE for Box-Cox transformed model:", rmse_boxcox)

Box COX Implementation
*****lambda-0.08319755830379594*****
***********************Box-Cox Transformed Summary*************************
                            OLS Regression Results                            
Dep. Variable:              PRICE_NEW   R-squared:                       0.688
Model:                            OLS   Adj. R-squared:                  0.683
Method:                 Least Squares   F-statistic:                     143.2
Date:                Sun, 19 May 2024   Prob (F-statistic):               0.00
Time:                        21:06:02   Log-Likelihood:                 1638.2
No. Observations:                2045   AIC:                            -3212.
Df Residuals:                    2013   BIC:                            -3032.
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025    

In [4]:
#Running Linear regression 

from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
df.head();
dfNewCopy = df.iloc[1:, :].copy()
# dfNewCopy = normalDf.drop(columns=normalDf.columns[0])
print("***********Running Linear-Linear Regression***********")

y=dfNewCopy['PRICE_NEW'];
X=dfNewCopy.drop(['PRICE_NEW'],axis=1);



X = X.iloc[1:, :]
y = y.iloc[1:]
# Now splitting training  Data and Test data (We use 80% of Data as training data and 20% data is Test data)

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2)

# print(len(y_test))

model=LinearRegression();
model.fit(X_train,y_train)
r2=model.score(X_test,y_test)
y_pred = model.predict(X_test)

#Now Predicting for test data

print("R2 score is : ",r2)
# Calculate adjusted R^2
# adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - k - 1))
# print('Adjusted r2 is : ',adjusted_r2)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Root Mean Squared Error:', metrics.root_mean_squared_error(y_test, y_pred))

#computing r2
# y = y.values.reshape(-1, 1)
# y_pred = y_pred.values.reshape(-1, 1)
n = X_test.shape[0]
k = X_test.shape[1]



print("***********Running a Random Forest Regression***********")

#running a random forest Regressor to boost performance
from sklearn.ensemble import RandomForestRegressor
forest= RandomForestRegressor()
forest.fit(X_train, y_train)
newr2=forest.score(X_test,y_test)
adjusted_forest_r2 = 1 - ((1 - newr2) * (n - 1) / (n - k - 1))
print("The new R2 score after Random Forest Regression is : ",newr2)
print("The new Adjusted R2 score after Random Forest Regression is : ",adjusted_forest_r2)

***********Running Linear-Linear Regression***********
R2 score is :  0.5257389175033373
Mean Absolute Error: 816603.1551579435
Root Mean Squared Error: 1549917.9233744275
***********Running a Random Forest Regression***********
The new R2 score after Random Forest Regression is :  0.6425913262695878
The new Adjusted R2 score after Random Forest Regression is :  0.6080033901021286


In [5]:
#Running log regression 

from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
print("***********Running Log-Log Regression***********")

logDf = df.copy()

logDf['PRICE_NEW'] =np.log(logDf['PRICE_NEW']+1);
logDf['KM_INT'] =np.log(logDf['KM_INT']+1);
logDf['AGE'] =np.log(logDf['AGE']+1);

y=logDf['PRICE_NEW'];

# Now splitting training  Data and Test data (We use 80% of Data as training data and 20% data is Test data)
X=logDf.drop(['PRICE_NEW'],axis=1);

X = X.iloc[1:, :]
y = y.iloc[1:]
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2)

# print(len(y_test))

model=LinearRegression();
model.fit(X_train,y_train)
r2=model.score(X_test,y_test)
y_pred = model.predict(X_test)

#Now Predicting for test data

#computing r2
# y = y.values.reshape(-1, 1)
# y_pred = y_pred.values.reshape(-1, 1)
n = X_test.shape[0]
k = X_test.shape[1]


print("R2 score is : ",r2)
# Calculate adjusted R^2
adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - k - 1))
print('Adjusted r2 is : ',adjusted_r2)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Root Mean Squared Error:', metrics.root_mean_squared_error(y_test, y_pred))


print("***********Running a Random Forest Regression***********")

#running a random forest Regressor to boost performance
from sklearn.ensemble import RandomForestRegressor
forest= RandomForestRegressor()
forest.fit(X_train, y_train)
newr2=forest.score(X_test,y_test)
adjusted_forest_r2 = 1 - ((1 - newr2) * (n - 1) / (n - k - 1))
print("The new R2 score after Random Forest Regression is : ",newr2)
print("The new Adjusted R2 score after Random Forest Regression is : ",adjusted_forest_r2)

***********Running Log-Log Regression***********
R2 score is :  0.6919976590657692
Adjusted r2 is :  0.6621909809108436
Mean Absolute Error: 0.25975167233728214
Root Mean Squared Error: 0.35293517424934057
***********Running a Random Forest Regression***********
The new R2 score after Random Forest Regression is :  0.7536927643101097
The new Adjusted R2 score after Random Forest Regression is :  0.729856580211088


In [6]:


# This is done to check the variables to be used as a predictor
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pandas as pd


dfNewCopy = df.copy()
y = dfNewCopy['PRICE_NEW']

dfNewCopy.head()
# dfNewCopy = df.drop(columns=df.columns[0])

X=dfNewCopy.drop(['PRICE_NEW'],axis=1)
X = sm.add_constant(X)


# # print(X.info())
model = sm.OLS(y.astype(float), X.astype(float)).fit()
# print(model.fit())

# print(len(X))
# print(len(y))


# # Display the summary
# print(model.summary())

column_names = X.columns

# Extract the coefficients from the model
coefficients = model.params

# Create a string representation of the regression equation
equation = 'Y = {:.2f}'.format(coefficients['const'])

for i, coefficient in enumerate(coefficients[1:], start=1):
    equation += ' + {:.2f} * {}'.format(coefficient, column_names[i-1])

print('******************Equation of linear model of hedonic price is')
# Display the equation
print(equation)

# # Extracting p-values from the summary
# p_values = summary.tables[1]['P>|t|']
# print("P-values:")
# print(p_values)


******************Equation of linear model of hedonic price is
Y = 2509111.38 + 779.38 * const + -6.02 * Unnamed: 0 + -627379.27 * KM_INT + -295880.05 * FUEL + 491180.67 * GEARBOX + -134570.54 * DRIVE + -1523187.06 * AGE + -539424.68 * Ford + 364253.39 * Hyundai + -1480027.05 * Kia + 204462.77 * Mahindra + 644933.00 * Maruti Suzuki + 25092.99 * Nissan + -370465.87 * Renault + 3408660.07 * Tata + 1774813.81 * Toyota + 33259.74 * Volkswagen + 346657.97 * Black + 146034.86 * Blue + 308529.77 * Brown + 707719.60 * Gray + 151801.86 * Maroon + 163730.77 * Other Colors + 336246.57 * Red + 315130.24 * Silver + 894582.26 * White + -552141.19 * Hatchback + 1385667.45 * Pickup + 599333.27 * SUV + 181669.59 * Sedan + -913017.00 * Van + -769844.11 * 0-1000(cc) + -245297.80 * 1001-1500(cc) + 350656.36 * 1501-2000(cc) + 1497247.77 * 2001-2500(cc) + 2589366.15 * 2501-2900(cc)


In [7]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from statsmodels.stats.diagnostic import het_breuschpagan

pd.set_option('display.max_rows', None)

data=df.copy()
dependent_variable = 'PRICE_NEW'

data['AGE'] = data['AGE'].replace(0, 0.5);
df.head()
y = data[dependent_variable]
X = data[columns_to_use]

X_linear = sm.add_constant(X)  # Add a constant term for the intercept
print('***********************Linear Summary*************************')
model_linear = sm.OLS(y, X_linear).fit()
print(model_linear.summary())


def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["Variable"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

vif_linear = calculate_vif(X_linear)
print('VIF for Linear Model:')
print(vif_linear)

***********************Linear Summary*************************
                            OLS Regression Results                            
Dep. Variable:              PRICE_NEW   R-squared:                       0.485
Model:                            OLS   Adj. R-squared:                  0.477
Method:                 Least Squares   F-statistic:                     61.20
Date:                Sun, 19 May 2024   Prob (F-statistic):          1.86e-263
Time:                        21:06:04   Log-Likelihood:                -32177.
No. Observations:                2045   AIC:                         6.442e+04
Df Residuals:                    2013   BIC:                         6.460e+04
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------