In [136]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor
from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import SGDRegressor
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from scipy.stats import t

Importing preprocessed dataset

In [137]:
# Load the preprocessed dataset; the first CSV column becomes the row index.
df=pd.read_csv(r"./preprocessed.csv",index_col=0)
# Bare last expression: renders the frame as the cell output.
df

Unnamed: 0,hasImage,has3DModel,hasAdditionalAttributions,marketingStatusSimplifiedCd_Pre-Foreclosure,marketingStatusSimplifiedCd_RecentChange,statusText_Sold,sgapt_Unknown Listed By,city_Charlotte,city_Chicago,city_Denver,...,homeType_LOT,homeType_MANUFACTURED,homeType_MULTI_FAMILY,homeType_SINGLE_FAMILY,homeType_TOWNHOUSE,beds,baths,area,rentZestimate,price
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,3.0,2.0,1224.0,3999.0,819500.0
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,723.0,3634.0,880000.0
2,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,6.0,3.0,3396.0,6000.0,1250000.0
3,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,4.0,3.0,2400.0,4964.0,805000.0
4,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,9.0,3.0,4107.0,8970.0,650000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6201,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,5.0,4.0,2366.0,4999.0,1125000.0
6202,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,3.0,3.0,1802.0,3449.0,465000.0
6203,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,4.0,4.0,2875.0,4749.0,1020000.0
6204,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,4.0,2.5,1260.0,3994.0,550000.0


In [138]:

def scale_selected_columns(x_train, x_test, cols):
    """Min-max scale ``cols`` in both frames using statistics from ``x_train`` only.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Training and test feature frames; neither is modified.
    cols : list of str
        Columns to rescale. Other columns are copied unchanged.

    Returns
    -------
    tuple
        ``(x_train_scaled, x_test_scaled)`` as new DataFrames.
    """
    train_out, test_out = x_train.copy(), x_test.copy()
    # Fit on the training columns only so no test information leaks into the scaler.
    min_max = MinMaxScaler().fit(train_out[cols])
    for frame in (train_out, test_out):
        frame[cols] = min_max.transform(frame[cols])
    return train_out, test_out

def log_transform(X, cols):
    """Return a copy of ``X`` with ``log(1 + x)`` applied to the columns in ``cols``.

    The input frame is left untouched; columns not listed in ``cols`` are
    carried over unchanged.
    """
    result = X.copy()
    logged = np.log1p(result[cols])
    result[cols] = logged
    return result

def log_transform_y(y):
    """Return ``log(1 + y)`` computed on a copy of the target ``y``."""
    target = y.copy()
    return np.log1p(target)

def remove_outliers(X, y, cols=None, threshold=3):
    """Drop rows whose z-score is at or beyond ``threshold`` in any of ``cols``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame.
    y : pandas.Series
        Target, indexed like ``X``.
    cols : list of str, optional
        Columns whose z-scores are checked. ``None`` (the default) checks no
        columns, so every row is kept.
    threshold : float, default 3
        A row is kept only if ``|z| < threshold`` for every column in ``cols``.

    Returns
    -------
    tuple
        ``(X_filtered, y_filtered)`` containing only the retained rows.
    """
    # Avoid a shared mutable default argument.
    cols = [] if cols is None else list(cols)
    selected = X[cols]
    # A constant column has std == 0, which would turn every z-score into NaN
    # and silently discard all rows. Such a column has no outliers, so divide
    # by 1 instead (every deviation is 0 anyway).
    spread = selected.std().replace(0, 1)
    z_scores = (selected - selected.mean()) / spread
    index = (np.abs(z_scores) < threshold).all(axis=1)
    return X[index], y[index]



Performing Linear Regression without any additional preprocessing

In [139]:
def perform_linear_regression(df):
    """Fit plain OLS on ten random 80/20 splits and record train/test R^2.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the target column ``"price"``; every other column is a feature.

    Returns
    -------
    tuple
        ``(scores, model)``: ``scores`` is a list with one
        ``{"train_r2": ..., "test_r2": ...}`` dict per split (seeds 0-9) and
        ``model`` is the estimator fitted on the final split.
    """
    features = df.drop(columns=["price"])
    target = df["price"]

    scores = []
    for seed in range(10):
        x_train, x_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=seed
        )
        model = LinearRegression().fit(x_train, y_train)
        scores.append(
            {
                "train_r2": r2_score(y_train, model.predict(x_train)),
                "test_r2": r2_score(y_test, model.predict(x_test)),
            }
        )

    return scores, model

    

# Run the baseline experiment; `model` is the estimator from the final split.
results, model = perform_linear_regression(df)

# Average the per-split scores.
n_runs = len(results)
mean_train_r2 = sum(run["train_r2"] for run in results) / n_runs
mean_test_r2 = sum(run["test_r2"] for run in results) / n_runs

print("Mean train_r2:", mean_train_r2)
print("Mean test_r2:", mean_test_r2)
results

Mean train_r2: 0.8215168477137613
Mean test_r2: 0.5285087824312887


[{'train_r2': 0.8283811543240924, 'test_r2': 0.3219007485161164},
 {'train_r2': 0.8128712051316874, 'test_r2': 0.6774026988291275},
 {'train_r2': 0.8099055835575415, 'test_r2': 0.700570172435975},
 {'train_r2': 0.8106551365848429, 'test_r2': 0.6192642176536693},
 {'train_r2': 0.8354067169689575, 'test_r2': 0.5648903937049126},
 {'train_r2': 0.8095942477215009, 'test_r2': 0.7082442730051932},
 {'train_r2': 0.8420417742388933, 'test_r2': 0.2052859929179598},
 {'train_r2': 0.819567223984051, 'test_r2': 0.5509709757568853},
 {'train_r2': 0.8153581668158918, 'test_r2': 0.5706454655195694},
 {'train_r2': 0.8313872678101533, 'test_r2': 0.3659128859734795}]

- The model is heavily overfit: the training R² is much higher than the test R², and the test R² varies widely from split to split.
- Let's see if removing outliers makes a difference

In [140]:

def perform_linear_regression_zsor(df):
    """OLS over 100 random splits with z-score outlier removal on the training set.

    Outliers are filtered from the training portion only (columns ``beds``,
    ``baths``, ``rentZestimate`` and ``area``); the test portion is scored as-is.

    Returns
    -------
    tuple
        ``(scores, model)``: one ``{"train_r2", "test_r2"}`` dict per split
        (seeds 0-99) and the estimator fitted on the final split.
    """
    features = df.drop(columns=["price"])
    target = df["price"]
    outlier_cols = ["beds", "baths", "rentZestimate", "area"]

    scores = []
    for seed in range(100):
        x_train, x_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=seed
        )
        x_train, y_train = remove_outliers(x_train, y_train, outlier_cols)

        model = LinearRegression().fit(x_train, y_train)
        scores.append(
            {
                "train_r2": r2_score(y_train, model.predict(x_train)),
                "test_r2": r2_score(y_test, model.predict(x_test)),
            }
        )

    return scores, model

    

# Outlier-removal experiment; `model` is the estimator from the final split.
results, model = perform_linear_regression_zsor(df)

# Average the per-split scores.
n_runs = len(results)
mean_train_r2 = sum(run["train_r2"] for run in results) / n_runs
mean_test_r2 = sum(run["test_r2"] for run in results) / n_runs

print("Mean train_r2:", mean_train_r2)
print("Mean test_r2:", mean_test_r2)
results


Mean train_r2: 0.7759773703986008
Mean test_r2: 0.8014381434684539


[{'train_r2': 0.7753797540234137, 'test_r2': 0.8087435418511384},
 {'train_r2': 0.7752521770080076, 'test_r2': 0.8153100235421703},
 {'train_r2': 0.7796733631322396, 'test_r2': 0.7854181087621367},
 {'train_r2': 0.7759183668892913, 'test_r2': 0.7872595077357105},
 {'train_r2': 0.7757856084137978, 'test_r2': 0.583457755784353},
 {'train_r2': 0.7778838418784515, 'test_r2': 0.8397982702436814},
 {'train_r2': 0.792272337835895, 'test_r2': 0.7454024009273823},
 {'train_r2': 0.782592704960802, 'test_r2': 0.7826146380527266},
 {'train_r2': 0.7784279032076962, 'test_r2': 0.7825608093534836},
 {'train_r2': 0.773664525750487, 'test_r2': 0.8258064851753775},
 {'train_r2': 0.7717645306593663, 'test_r2': 0.8423452769773196},
 {'train_r2': 0.7856444936829927, 'test_r2': 0.8347579281204691},
 {'train_r2': 0.7695414527850511, 'test_r2': 0.8864418076411102},
 {'train_r2': 0.773748893604929, 'test_r2': 0.7910203352587012},
 {'train_r2': 0.7832850315622552, 'test_r2': 0.8521986691812733},
 {'train_r2': 0

Already a big improvement! Let's try Min Max scaling

In [141]:

def perform_linear_regression_zsor_and_scaling(df):
    """OLS over 100 splits with training-set outlier removal and min-max scaling.

    The scaler is fitted on the outlier-filtered training features and then
    applied to the test features.

    Returns
    -------
    tuple
        ``(scores, model)``: one ``{"train_r2", "test_r2"}`` dict per split
        (seeds 0-99) and the estimator fitted on the final split.
    """
    features = df.drop(columns=["price"])
    target = df["price"]
    outlier_cols = ["beds", "baths", "rentZestimate", "area"]

    scores = []
    for seed in range(100):
        x_train, x_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=seed
        )
        x_train, y_train = remove_outliers(x_train, y_train, outlier_cols)

        min_max = MinMaxScaler()
        x_train = min_max.fit_transform(x_train)
        x_test = min_max.transform(x_test)

        model = LinearRegression().fit(x_train, y_train)
        scores.append(
            {
                "train_r2": r2_score(y_train, model.predict(x_train)),
                "test_r2": r2_score(y_test, model.predict(x_test)),
            }
        )

    return scores, model

    

# Outlier removal + min-max scaling; `model` is the estimator from the final split.
results, model = perform_linear_regression_zsor_and_scaling(df)

# Average the per-split scores.
n_runs = len(results)
mean_train_r2 = sum(run["train_r2"] for run in results) / n_runs
mean_test_r2 = sum(run["test_r2"] for run in results) / n_runs

print("Mean train_r2:", mean_train_r2)
print("Mean test_r2:", mean_test_r2)
results


Mean train_r2: 0.7728848601068046
Mean test_r2: -8.094112512648814e+20


[{'train_r2': 0.7742207398159969, 'test_r2': 0.806419328355898},
 {'train_r2': 0.7752481123365171, 'test_r2': -3.553194159602688e+17},
 {'train_r2': 0.7796454863609512, 'test_r2': -1.0497855545315685e+20},
 {'train_r2': 0.7759007201668493, 'test_r2': -2.8686603469731252e+16},
 {'train_r2': 0.7757809015252914, 'test_r2': 0.5838282123546338},
 {'train_r2': 0.7778825939569645, 'test_r2': 0.839724885831026},
 {'train_r2': 0.7918185292310373, 'test_r2': -2.13233540427199e+22},
 {'train_r2': 0.7823757593174921, 'test_r2': -9.186938530240392e+18},
 {'train_r2': 0.7783689977013787, 'test_r2': 0.7825257657310524},
 {'train_r2': 0.7650352456784548, 'test_r2': 0.8244875937615596},
 {'train_r2': 0.7717291956361538, 'test_r2': 0.842431838136769},
 {'train_r2': 0.785632316827781, 'test_r2': 0.8348171000211143},
 {'train_r2': 0.7683440249201126, 'test_r2': 0.8858204229418787},
 {'train_r2': 0.7737488811705472, 'test_r2': 0.7910202277532372},
 {'train_r2': 0.7830211046313627, 'test_r2': 0.852127822803

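Scaling performs horribly: for many splits the test R² is hugely negative! Why, when everybody says scaling is good? A likely cause (not verified here) is that some dummy columns are perfectly collinear — note the infinite VIF values later in this notebook — so ordinary least squares is ill-conditioned and can produce enormous coefficients that fail badly on unseen rows.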

After a lot of careful inspection and experimentation, we found good results with SGDRegressor instead of LinearRegression.

In [142]:

def perform_linear_regression_zsor_and_scaling(df):
    """SGD linear regression over 100 splits with outlier removal and min-max scaling.

    For each seed the data is split 80/20, z-score outliers are removed from
    the training portion, features are min-max scaled (scaler fitted on the
    training portion only) and an ``SGDRegressor`` is fitted.

    The regressor is seeded with the same seed as the split, so repeated runs
    give identical scores; an unseeded ``SGDRegressor`` shuffles the data
    differently on every run.

    Returns
    -------
    tuple
        ``(scores, model)``: one ``{"train_r2", "test_r2"}`` dict per split
        (seeds 0-99) and the estimator fitted on the final split.
    """
    # Loop-invariant: the feature/target split does not depend on the seed.
    features = df.drop(columns=["price"])
    target = df["price"]
    outlier_cols = ["beds", "baths", "rentZestimate", "area"]

    r2_list = []
    for seed in range(100):
        x_train, x_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=seed
        )
        x_train, y_train = remove_outliers(x_train, y_train, outlier_cols)

        scaler = MinMaxScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        # Seed the optimiser so the experiment is reproducible.
        model = SGDRegressor(random_state=seed)
        model.fit(x_train, y_train)

        r2_list.append(
            {
                "train_r2": r2_score(y_train, model.predict(x_train)),
                "test_r2": r2_score(y_test, model.predict(x_test)),
            }
        )

    return r2_list, model

    

# Outlier removal + scaling with SGD; `model` is the estimator from the final split.
results, model = perform_linear_regression_zsor_and_scaling(df)

# Average the per-split scores.
n_runs = len(results)
mean_train_r2 = sum(run["train_r2"] for run in results) / n_runs
mean_test_r2 = sum(run["test_r2"] for run in results) / n_runs

print("Mean train_r2:", mean_train_r2)
print("Mean test_r2:", mean_test_r2)
results


Mean train_r2: 0.7730323010670992
Mean test_r2: 0.7951996033102334


[{'train_r2': 0.7713000733094055, 'test_r2': 0.8051199537850164},
 {'train_r2': 0.7725300260512433, 'test_r2': 0.8033867876625347},
 {'train_r2': 0.7769133458044493, 'test_r2': 0.7802395519819922},
 {'train_r2': 0.7736193039674402, 'test_r2': 0.7866039638172061},
 {'train_r2': 0.772787229358471, 'test_r2': 0.5804127012177807},
 {'train_r2': 0.7753983964712275, 'test_r2': 0.8286235221983531},
 {'train_r2': 0.7906602227633375, 'test_r2': 0.746637390113121},
 {'train_r2': 0.7796543056496843, 'test_r2': 0.7819910596340015},
 {'train_r2': 0.7749366655123241, 'test_r2': 0.7901038310145472},
 {'train_r2': 0.7702203497263531, 'test_r2': 0.8164625151509911},
 {'train_r2': 0.7681537248384811, 'test_r2': 0.8396352875174785},
 {'train_r2': 0.7824398901559151, 'test_r2': 0.8247861871387392},
 {'train_r2': 0.7670020556062833, 'test_r2': 0.8766912815723326},
 {'train_r2': 0.7692572373984529, 'test_r2': 0.7797794006463852},
 {'train_r2': 0.7810984010428175, 'test_r2': 0.838396870506086},
 {'train_r2':

Let's try a log transformation of the target

In [143]:

def perform_linear_regression_zsor_and_logtrf(df):
    """OLS on a ``log1p``-transformed target over 100 splits with outlier removal.

    The target is transformed before splitting, and the reported R^2 values
    are computed on the log scale (predictions are not transformed back).

    Returns
    -------
    tuple
        ``(scores, model)``: one ``{"train_r2", "test_r2"}`` dict per split
        (seeds 0-99) and the estimator fitted on the final split.
    """
    features = df.drop(columns=["price"])
    log_target = log_transform_y(df["price"])
    outlier_cols = ["beds", "baths", "rentZestimate", "area"]

    scores = []
    for seed in range(100):
        x_train, x_test, y_train, y_test = train_test_split(
            features, log_target, test_size=0.2, random_state=seed
        )
        x_train, y_train = remove_outliers(x_train, y_train, outlier_cols)

        model = LinearRegression().fit(x_train, y_train)
        scores.append(
            {
                "train_r2": r2_score(y_train, model.predict(x_train)),
                "test_r2": r2_score(y_test, model.predict(x_test)),
            }
        )

    return scores, model

    

# Log-target experiment; `model` is the estimator from the final split.
results, model = perform_linear_regression_zsor_and_logtrf(df)

# Average the per-split scores.
n_runs = len(results)
mean_train_r2 = sum(run["train_r2"] for run in results) / n_runs
mean_test_r2 = sum(run["test_r2"] for run in results) / n_runs

print("Mean train_r2:", mean_train_r2)
print("Mean test_r2:", mean_test_r2)
results


Mean train_r2: 0.627510350324328
Mean test_r2: 0.41345637079529285


[{'train_r2': 0.5898185029576255, 'test_r2': 0.5288786219718933},
 {'train_r2': 0.6112269279898137, 'test_r2': 0.4806570585623954},
 {'train_r2': 0.602050054350872, 'test_r2': 0.6188113734237383},
 {'train_r2': 0.5940387424781451, 'test_r2': 0.656892125395111},
 {'train_r2': 0.6329658310034529, 'test_r2': -0.15339920432317933},
 {'train_r2': 0.6109278451144926, 'test_r2': 0.5048525990419082},
 {'train_r2': 0.639545729834634, 'test_r2': 0.42863228580595725},
 {'train_r2': 0.5879892971682942, 'test_r2': 0.6300690787703549},
 {'train_r2': 0.6135722078467761, 'test_r2': 0.5882195526259543},
 {'train_r2': 0.6143509427242766, 'test_r2': 0.5015122404833329},
 {'train_r2': 0.6019613784269152, 'test_r2': 0.5075444280247312},
 {'train_r2': 0.6315709790861717, 'test_r2': 0.4309339335572383},
 {'train_r2': 0.6443695846850912, 'test_r2': 0.38955891656348685},
 {'train_r2': 0.6292042962374501, 'test_r2': 0.5240277768177832},
 {'train_r2': 0.6576091131465842, 'test_r2': 0.38424588485753375},
 {'train

In [144]:

def perform_linear_regression_zsor_and_logtrf_minmax_scaling(df):
    """SGD regression on a ``log1p`` target with outlier removal and min-max scaling.

    For each seed the data is split 80/20, z-score outliers are removed from
    the training portion, features are min-max scaled (scaler fitted on the
    training portion only) and an ``SGDRegressor`` is fitted to the
    log-transformed target. R^2 values are computed on the log scale.

    The regressor is seeded with the same seed as the split, so repeated runs
    give identical scores; an unseeded ``SGDRegressor`` shuffles the data
    differently on every run.

    Returns
    -------
    tuple
        ``(scores, model)``: one ``{"train_r2", "test_r2"}`` dict per split
        (seeds 0-99) and the estimator fitted on the final split.
    """
    # Loop-invariant: features and the transformed target do not depend on the seed.
    features = df.drop(columns=["price"])
    log_target = log_transform_y(df["price"])
    outlier_cols = ["beds", "baths", "rentZestimate", "area"]

    r2_list = []
    for seed in range(100):
        x_train, x_test, y_train, y_test = train_test_split(
            features, log_target, test_size=0.2, random_state=seed
        )
        x_train, y_train = remove_outliers(x_train, y_train, outlier_cols)

        scaler = MinMaxScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        # Seed the optimiser so the experiment is reproducible.
        model = SGDRegressor(random_state=seed)
        model.fit(x_train, y_train)

        r2_list.append(
            {
                "train_r2": r2_score(y_train, model.predict(x_train)),
                "test_r2": r2_score(y_test, model.predict(x_test)),
            }
        )

    return r2_list, model

    

# Log-target + scaling + SGD; `model` is the estimator from the final split.
results, model = perform_linear_regression_zsor_and_logtrf_minmax_scaling(df)

# Average the per-split scores.
n_runs = len(results)
mean_train_r2 = sum(run["train_r2"] for run in results) / n_runs
mean_test_r2 = sum(run["test_r2"] for run in results) / n_runs

print("Mean train_r2:", mean_train_r2)
print("Mean test_r2:", mean_test_r2)
results


Mean train_r2: 0.5659988352597959
Mean test_r2: 0.44748334077603175


[{'train_r2': 0.5233053542301487, 'test_r2': 0.49332573889677456},
 {'train_r2': 0.5601172991932583, 'test_r2': 0.4271267178337287},
 {'train_r2': 0.5392514655711568, 'test_r2': 0.654433015174483},
 {'train_r2': 0.5355439459800544, 'test_r2': 0.5592992878995993},
 {'train_r2': 0.568526981024684, 'test_r2': 0.1530573998546696},
 {'train_r2': 0.5558940810173761, 'test_r2': 0.5112946532827833},
 {'train_r2': 0.5767430053584535, 'test_r2': 0.4611982197192477},
 {'train_r2': 0.5292398767800379, 'test_r2': 0.6560235477057708},
 {'train_r2': 0.5481742133424637, 'test_r2': 0.5746611985584984},
 {'train_r2': 0.5543714346421895, 'test_r2': 0.49123272401125895},
 {'train_r2': 0.5513847791132396, 'test_r2': 0.5532468491655553},
 {'train_r2': 0.5645230940906587, 'test_r2': 0.4968855213518245},
 {'train_r2': 0.5826308342080619, 'test_r2': 0.36993038647328536},
 {'train_r2': 0.5634626897419615, 'test_r2': 0.47396755206387176},
 {'train_r2': 0.5855105481825181, 'test_r2': 0.4604028080996142},
 {'train

Log transform only seems to make our model worse :(

In [145]:
# Re-run the outlier-removal experiment only to get a fitted model (scores are
# discarded); the returned model is the one fitted on the last split.
_,model=perform_linear_regression_zsor(df)
# Display the fitted coefficients, one per feature column.
model.coef_

array([ 2.08470547e+04,  3.98973987e+04,  7.83801837e+04, -2.43772393e+05,
       -8.04009321e+03,  1.17106208e+04,  9.82601799e+03, -3.45557304e+05,
       -5.28924775e+05, -3.26227165e+05, -3.52530512e+05, -3.78255432e+05,
       -3.55799548e+05, -3.91180564e+05, -1.93343566e+05, -3.69802491e+05,
       -2.50491976e+05, -1.55913919e+05,  9.35361590e+04, -4.12304621e+04,
       -3.99738029e+05, -1.82410242e+05,  1.49022603e+05, -1.27864062e+03,
       -1.82410242e+05, -9.38643544e+04,  5.44930120e+03, -1.44489629e+05,
        2.72638215e+04, -5.63574105e+04, -1.93343566e+05, -3.20479661e+04,
       -4.12304621e+04, -7.58677286e+04,  1.89604427e+04, -6.48930441e+04,
        2.59799246e+04,  4.05941288e+04, -7.11886178e+04, -3.11510306e+04,
        3.18745505e+04,  3.62637008e+01,  2.19858360e+02])

Let's try removing unnecessary features using the variance inflation factor (VIF)

In [146]:
# VIF of every feature, computed against a design matrix that includes a constant.
X = df.drop(columns=["price"])
X_with_const = add_constant(X)

# Positions start at 1 to skip the constant column (assumed to be first, as
# add_constant prepends by default).
vif_values = [
    variance_inflation_factor(X_with_const.values, position)
    for position in range(1, X_with_const.shape[1])
]
vif_data = pd.DataFrame({"Feature": X.columns, "VIF": vif_values})

print(vif_data)



  vif = 1. / (1. - r_squared_i)


                                        Feature         VIF
0                                      hasImage    1.810791
1                                    has3DModel    1.041078
2                     hasAdditionalAttributions    3.472165
3   marketingStatusSimplifiedCd_Pre-Foreclosure    1.024043
4      marketingStatusSimplifiedCd_RecentChange    2.955682
5                               statusText_Sold   85.470644
6                       sgapt_Unknown Listed By    1.914193
7                                city_Charlotte   50.549307
8                                  city_Chicago  258.709563
9                                   city_Denver  232.826493
10                            city_Indianapolis  157.670007
11                             city_Los Angeles    4.668515
12                               city_Nashville    8.523990
13                                  city_Others   15.254059
14                            city_Philadelphia         inf
15                                 city_

As we can see, some columns have an extremely high VIF. We will remove the column with the highest VIF, one at a time, until no column has a VIF above 10.

In [147]:

def remove_high_vif_variables(X, threshold=10):
    """Iteratively drop the feature with the largest VIF until all are <= ``threshold``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame (without the target). The caller's frame is not modified.
    threshold : float, default 10
        Pruning continues while the largest VIF exceeds this value.

    Returns
    -------
    tuple
        ``(X_reduced, removedCols)``: the frame without the dropped columns and
        the dropped column names in removal order.
    """

    def _vif_per_feature(frame):
        # VIFs are computed with an added constant; the constant's own entry is discarded.
        design = add_constant(frame)
        values = [
            variance_inflation_factor(design.values, position)
            for position in range(design.shape[1])
        ]
        return pd.Series(values, index=design.columns).drop("const")

    removedCols = []
    vif = _vif_per_feature(X)
    while vif.max() > threshold:
        worst = vif.idxmax()
        removedCols.append(worst)
        X = X.drop(columns=worst)
        # VIFs change once a column is gone, so recompute them from scratch.
        vif = _vif_per_feature(X)

    return X, removedCols


# Prune collinear features: repeatedly drop the highest-VIF column until every
# remaining VIF is at most 10.
X = df.drop(columns=["price"])
X_no_multicollinearity, removedCols = remove_high_vif_variables(X, threshold=10)

# Rebind `df` rather than mutating it in place: `inplace=True` has no performance
# benefit and alters the frame object that earlier cells loaded and displayed.
# Later cells still read `df`, so the name is kept.
df = df.drop(columns=removedCols)
X_no_multicollinearity

  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)


Unnamed: 0,hasImage,has3DModel,hasAdditionalAttributions,marketingStatusSimplifiedCd_Pre-Foreclosure,marketingStatusSimplifiedCd_RecentChange,statusText_Sold,sgapt_Unknown Listed By,city_Los Angeles,city_Nashville,city_Phoenix,...,state_WA,homeType_CONDO,homeType_LOT,homeType_MANUFACTURED,homeType_MULTI_FAMILY,homeType_TOWNHOUSE,beds,baths,area,rentZestimate
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,1224.0,3999.0
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,723.0,3634.0
2,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,6.0,3.0,3396.0,6000.0
3,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,4.0,3.0,2400.0,4964.0
4,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,9.0,3.0,4107.0,8970.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6201,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,4.0,2366.0,4999.0
6202,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,3.0,3.0,1802.0,3449.0
6203,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,2875.0,4749.0
6204,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2.5,1260.0,3994.0


In [148]:
# Recompute the VIF table on the pruned feature set to confirm the result.
X = df.drop(columns=["price"])
X_with_const = add_constant(X)

# Positions start at 1 to skip the constant column (assumed to be first, as
# add_constant prepends by default).
vif_values = [
    variance_inflation_factor(X_with_const.values, position)
    for position in range(1, X_with_const.shape[1])
]
vif_data = pd.DataFrame({"Feature": X.columns, "VIF": vif_values})

print(vif_data)



                                        Feature       VIF
0                                      hasImage  1.806642
1                                    has3DModel  1.040874
2                     hasAdditionalAttributions  3.444204
3   marketingStatusSimplifiedCd_Pre-Foreclosure  1.022901
4      marketingStatusSimplifiedCd_RecentChange  2.899153
5                               statusText_Sold  1.738442
6                       sgapt_Unknown Listed By  1.910374
7                              city_Los Angeles  1.507241
8                                city_Nashville  3.326646
9                                  city_Phoenix  2.079707
10                               city_San Diego  2.055170
11                           city_San Francisco  2.110637
12                                city_San Jose  3.702165
13                                   city_Tampa  1.265359
14                                     state_CO  2.314713
15                                     state_DC  2.020352
16            

In [149]:
# Columns remaining in df after the high-VIF features were dropped.
df.columns

Index(['hasImage', 'has3DModel', 'hasAdditionalAttributions',
       'marketingStatusSimplifiedCd_Pre-Foreclosure',
       'marketingStatusSimplifiedCd_RecentChange', 'statusText_Sold',
       'sgapt_Unknown Listed By', 'city_Los Angeles', 'city_Nashville',
       'city_Phoenix', 'city_San Diego', 'city_San Francisco', 'city_San Jose',
       'city_Tampa', 'state_CO', 'state_DC', 'state_FL', 'state_IN',
       'state_MA', 'state_NC', 'state_PA', 'state_TN', 'state_WA',
       'homeType_CONDO', 'homeType_LOT', 'homeType_MANUFACTURED',
       'homeType_MULTI_FAMILY', 'homeType_TOWNHOUSE', 'beds', 'baths', 'area',
       'rentZestimate', 'price'],
      dtype='object')

In [150]:

def perform_linear_regression_zsor(df):
    """OLS over 100 random splits with z-score outlier removal on the training set.

    Outliers are filtered from the training portion only (columns ``beds``,
    ``baths``, ``rentZestimate`` and ``area``); the test portion is scored as-is.

    Returns
    -------
    tuple
        ``(scores, model)``: one ``{"train_r2", "test_r2"}`` dict per split
        (seeds 0-99) and the estimator fitted on the final split.
    """
    features = df.drop(columns=["price"])
    target = df["price"]
    outlier_cols = ["beds", "baths", "rentZestimate", "area"]

    scores = []
    for seed in range(100):
        x_train, x_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=seed
        )
        x_train, y_train = remove_outliers(x_train, y_train, outlier_cols)

        model = LinearRegression().fit(x_train, y_train)
        scores.append(
            {
                "train_r2": r2_score(y_train, model.predict(x_train)),
                "test_r2": r2_score(y_test, model.predict(x_test)),
            }
        )

    return scores, model

    

# Same experiment on the VIF-pruned frame; `model` is the estimator from the final split.
results, model = perform_linear_regression_zsor(df)

# Average the per-split scores.
n_runs = len(results)
mean_train_r2 = sum(run["train_r2"] for run in results) / n_runs
mean_test_r2 = sum(run["test_r2"] for run in results) / n_runs

print("Mean train_r2:", mean_train_r2)
print("Mean test_r2:", mean_test_r2)
results


Mean train_r2: 0.7727952362167692
Mean test_r2: 0.7999869154329511


[{'train_r2': 0.772029664901891, 'test_r2': 0.8072697814861213},
 {'train_r2': 0.7725249865830363, 'test_r2': 0.8124108362916929},
 {'train_r2': 0.7760471860906726, 'test_r2': 0.7835334502823998},
 {'train_r2': 0.7731565045443213, 'test_r2': 0.783067065727877},
 {'train_r2': 0.7729936136096156, 'test_r2': 0.5850854941923768},
 {'train_r2': 0.7755032818655205, 'test_r2': 0.8390413025926453},
 {'train_r2': 0.789261985566802, 'test_r2': 0.7415930067198456},
 {'train_r2': 0.7785904227940799, 'test_r2': 0.7830800462980623},
 {'train_r2': 0.7755900598462023, 'test_r2': 0.7776175308753335},
 {'train_r2': 0.7704765385357503, 'test_r2': 0.825252319795701},
 {'train_r2': 0.768284942038252, 'test_r2': 0.8411670822478685},
 {'train_r2': 0.7817324453336414, 'test_r2': 0.8332953147244093},
 {'train_r2': 0.766370594998458, 'test_r2': 0.8852192633670425},
 {'train_r2': 0.7706104973517269, 'test_r2': 0.7866112357061877},
 {'train_r2': 0.7795972813473709, 'test_r2': 0.8517012238587043},
 {'train_r2': 0.

The difference in R² score is negligible after dropping those columns.

In [151]:

# Pair each feature with its fitted coefficient and rank from largest to smallest.
coefficients = model.coef_
column_names = df.drop(columns='price').columns

coefficients_with_names = [(name, coef) for name, coef in zip(column_names, coefficients)]

sorted_coefficients = list(coefficients_with_names)
sorted_coefficients.sort(key=lambda pair: pair[1], reverse=True)

sorted_coefficients_df = pd.DataFrame(sorted_coefficients, columns=['Feature', 'Coefficient'])

sorted_coefficients

[('city_San Jose', 523742.7300303789),
 ('city_San Francisco', 262534.66724789946),
 ('statusText_Sold', 246636.99330273384),
 ('state_WA', 202832.45125670132),
 ('city_San Diego', 175679.39273800724),
 ('hasAdditionalAttributions', 76871.2455271921),
 ('city_Los Angeles', 45418.93915605362),
 ('has3DModel', 40810.414217302015),
 ('baths', 32182.13460872042),
 ('city_Nashville', 29496.13544380288),
 ('hasImage', 26576.76818229022),
 ('state_MA', 16016.302817516193),
 ('sgapt_Unknown Listed By', 7918.8416849449495),
 ('rentZestimate', 225.31251499744394),
 ('area', 31.75781851620559),
 ('city_Tampa', -8523.772624111842),
 ('marketingStatusSimplifiedCd_RecentChange', -14017.669278426201),
 ('homeType_MULTI_FAMILY', -14061.461139354185),
 ('homeType_LOT', -17839.250522746104),
 ('beds', -33566.017739126466),
 ('state_CO', -40919.88126568572),
 ('city_Phoenix', -78883.58968144239),
 ('state_DC', -82116.02255210973),
 ('state_PA', -93297.61719933544),
 ('homeType_MANUFACTURED', -97414.01939

In [152]:
# Classical OLS standard errors: sigma^2 * (X'X)^-1, with X including an intercept column.
# NOTE(review): `model` is the estimator returned by the most recent experiment
# cell (fitted on one outlier-filtered training split), while the residuals and
# design matrix below use every row of df — confirm this mix is intended.
x = df.drop(columns=["price"])
y = df["price"]

residuals = y - model.predict(x)
sse = np.sum(residuals ** 2)
n = y.shape[0]
# Residual variance with (n - number of features - 1) degrees of freedom.
sigma2hat = sse / (n - x.shape[1] - 1)

# Design matrix with a leading column of ones for the intercept.
intercept_column = np.ones((x.shape[0], 1))
X = np.hstack([intercept_column, np.array(x)])

betaCovMatrix = sigma2hat * np.linalg.inv(X.T @ X)
betaVars = np.diag(betaCovMatrix)
betaStds = np.sqrt(betaVars)
betaStds

array([9.10346973e+04, 6.89840422e+04, 4.64854734e+04, 5.90519678e+04,
       3.64955864e+05, 4.72890686e+04, 7.35334713e+04, 3.17221818e+04,
       8.13868780e+04, 9.19524386e+04, 6.57081671e+04, 6.80711191e+04,
       7.10092531e+04, 8.14790053e+04, 1.06697862e+05, 6.33886648e+04,
       6.83696494e+04, 6.96352244e+04, 6.44852198e+04, 7.23836037e+04,
       6.39235563e+04, 6.69991478e+04, 9.22008450e+04, 6.78651604e+04,
       4.03228161e+04, 8.93461831e+04, 1.42378494e+05, 5.41569932e+04,
       3.69027568e+04, 1.18011621e+04, 1.33982817e+04, 8.51945594e+00,
       4.96503708e+00])

In [153]:
# t-statistic of each slope: coefficient divided by its standard error.
coefficients = model.coef_
# betaStds[0] belongs to the intercept column, which model.coef_ does not include.
slope_std_errors = betaStds[1:]
ts = coefficients / slope_std_errors
ts

array([ 0.38525965,  0.87791758,  1.30175587, -0.61360801, -0.29642515,
        3.35407793,  0.24963105,  0.55806219,  0.320776  , -1.20051423,
        2.58082128,  3.69718953,  6.42794703, -0.079887  , -0.64553941,
       -1.20105958, -2.78431768, -3.08655659,  0.22126976, -1.73172953,
       -1.39251946, -1.39166118,  2.98875668, -2.60645126, -0.19966438,
       -0.68419054, -0.25964257, -2.95947189, -2.84429767,  2.40195984,
        3.72768152, 45.37982526])

In [154]:
# Two-sided t-test at the 1% level: a coefficient is significant when |t| exceeds
# the 0.995 quantile of Student's t with n - p degrees of freedom.
x = df.drop(columns=["price"])
n = x.shape[0]
p = x.shape[1] + 1  # features plus intercept
tCrit = t.ppf(0.995, n - p)

abs_ts = np.abs(ts)
significant_indices = np.flatnonzero(abs_ts > tCrit)
nonSignificant_indices = np.flatnonzero(abs_ts <= tCrit)

print("Statistically significant coefficients:")
for name, t_stat in zip(x.columns[significant_indices], ts[significant_indices]):
    print(f"{name}: {t_stat}")
print()
print("Statistically insignificant coefficients:")
for name, t_stat in zip(x.columns[nonSignificant_indices], ts[nonSignificant_indices]):
    print(f"{name}: {t_stat}")

Statistically significant coefficients:
statusText_Sold: 3.3540779297780814
city_San Diego: 2.5808212808542166
city_San Francisco: 3.6971895314688292
city_San Jose: 6.427947026500288
state_FL: -2.7843176758021024
state_IN: -3.086556592872075
state_WA: 2.9887566750143977
homeType_CONDO: -2.6064512617129614
homeType_TOWNHOUSE: -2.9594718850855046
beds: -2.8442976683419197
area: 3.727681524179525
rentZestimate: 45.37982525788746

Statistically insignificant coefficients:
hasImage: 0.38525965346744456
has3DModel: 0.8779175774401475
hasAdditionalAttributions: 1.3017558657593924
marketingStatusSimplifiedCd_Pre-Foreclosure: -0.6136080149634391
marketingStatusSimplifiedCd_RecentChange: -0.2964251505705779
sgapt_Unknown Listed By: 0.24963105416061626
city_Los Angeles: 0.5580621872469358
city_Nashville: 0.32077600012047464
city_Phoenix: -1.2005142314551238
city_Tampa: -0.07988700472323028
state_CO: -0.6455394099748349
state_DC: -1.2010595823472519
state_MA: 0.22126976282236938
state_NC: -1.73172

It is clearly evident that rentZestimate is carrying this model on its back. We shall use only the statistically significant features in the model.

In [160]:
# Names of the features whose t-statistics exceeded the critical value;
# significant_indices are positions within the feature columns (df without "price").
significant_cols=df.drop(columns=["price"]).columns[significant_indices]
significant_cols

Index(['statusText_Sold', 'city_San Diego', 'city_San Francisco',
       'city_San Jose', 'state_FL', 'state_IN', 'state_WA', 'homeType_CONDO',
       'homeType_TOWNHOUSE', 'beds', 'area', 'rentZestimate'],
      dtype='object')

In [162]:

def perform_linear_regression_zsor(df):
    """OLS over 100 splits using only the statistically significant features.

    Columns at the positions in the notebook-level ``nonSignificant_indices``
    are dropped along with ``"price"``; z-score outliers (``beds``,
    ``rentZestimate``, ``area``) are removed from the training portion only.

    Returns
    -------
    tuple
        ``(scores, model)``: one ``{"train_r2", "test_r2"}`` dict per split
        (seeds 0-99) and the estimator fitted on the final split.
    """
    # nonSignificant_indices is read from the notebook's global namespace.
    dropped = ["price"] + list(df.columns[nonSignificant_indices])
    features = df.drop(columns=dropped)
    target = df["price"]
    outlier_cols = ["beds", "rentZestimate", "area"]

    scores = []
    for seed in range(100):
        x_train, x_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=seed
        )
        x_train, y_train = remove_outliers(x_train, y_train, outlier_cols)

        model = LinearRegression().fit(x_train, y_train)
        scores.append(
            {
                "train_r2": r2_score(y_train, model.predict(x_train)),
                "test_r2": r2_score(y_test, model.predict(x_test)),
            }
        )

    return scores, model

    

# Significant-features-only experiment; `model` is the estimator from the final split.
results, model = perform_linear_regression_zsor(df)

# Average the per-split scores.
n_runs = len(results)
mean_train_r2 = sum(run["train_r2"] for run in results) / n_runs
mean_test_r2 = sum(run["test_r2"] for run in results) / n_runs

print("Mean train_r2:", mean_train_r2)
print("Mean test_r2:", mean_test_r2)
results


Mean train_r2: 0.765878011863917
Mean test_r2: 0.8002248572266057


[{'train_r2': 0.7644181151357226, 'test_r2': 0.8028882948796543},
 {'train_r2': 0.7646084067718482, 'test_r2': 0.8141267668335112},
 {'train_r2': 0.7691375411770236, 'test_r2': 0.7818906062171975},
 {'train_r2': 0.7658670997968349, 'test_r2': 0.7813790837255921},
 {'train_r2': 0.7669752730404094, 'test_r2': 0.5994877809783632},
 {'train_r2': 0.7677497793755994, 'test_r2': 0.8406727894427117},
 {'train_r2': 0.7870667535931839, 'test_r2': 0.7359464411331709},
 {'train_r2': 0.7701926584549922, 'test_r2': 0.780555157513342},
 {'train_r2': 0.7738348757534843, 'test_r2': 0.7648378249579098},
 {'train_r2': 0.7623729494651881, 'test_r2': 0.8198367973887094},
 {'train_r2': 0.7607600419943779, 'test_r2': 0.8380963633019372},
 {'train_r2': 0.7785761277138792, 'test_r2': 0.8334477442824038},
 {'train_r2': 0.7637549181594578, 'test_r2': 0.8846871506362135},
 {'train_r2': 0.7615383009776838, 'test_r2': 0.786126494799349},
 {'train_r2': 0.7758142262269216, 'test_r2': 0.8588216499747764},
 {'train_r2'