In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor
from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split,cross_val_score

In [2]:
df=pd.read_csv(r"./preprocessed.csv",index_col=0)
df

Unnamed: 0,hasImage,has3DModel,hasAdditionalAttributions,marketingStatusSimplifiedCd_Pre-Foreclosure,marketingStatusSimplifiedCd_RecentChange,statusText_Sold,sgapt_Unknown Listed By,city_Charlotte,city_Chicago,city_Denver,...,homeType_LOT,homeType_MANUFACTURED,homeType_MULTI_FAMILY,homeType_SINGLE_FAMILY,homeType_TOWNHOUSE,beds,baths,area,rentZestimate,price
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,3.0,2.0,1224.0,3999.0,819500.0
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,723.0,3634.0,880000.0
2,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,6.0,3.0,3396.0,6000.0,1250000.0
3,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,4.0,3.0,2400.0,4964.0,805000.0
4,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,9.0,3.0,4107.0,8970.0,650000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6201,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,5.0,4.0,2366.0,4999.0,1125000.0
6202,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,3.0,3.0,1802.0,3449.0,465000.0
6203,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,4.0,4.0,2875.0,4749.0,1020000.0
6204,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,4.0,2.5,1260.0,3994.0,550000.0


In [3]:

def scale_selected_columns(x_train, x_test, cols):
    scaler = MinMaxScaler()
    x_train_scaled = x_train.copy()  
    x_test_scaled = x_test.copy()    
    scaler.fit(x_train_scaled[cols])  
    x_train_scaled[cols] = scaler.transform(x_train_scaled[cols])  
    x_test_scaled[cols] = scaler.transform(x_test_scaled[cols])
    return x_train_scaled, x_test_scaled

def log_transform(X, cols):
    X_transformed = X.copy()  
    
    X_transformed[cols] = np.log1p(X_transformed[cols])
        
    return X_transformed

def remove_outliers(X,y,cols=[], threshold=3):
    z_scores = (X[cols] - X[cols].mean()) / X[cols].std()
    index=(np.abs(z_scores)<threshold).all(axis=1)
    return X[index],y[index]



In [4]:
def perform_linear_regression(df):
    r2_list=[]
    
    for i in range(10):
        x = df.drop(columns=["price"])
        y = df["price"]
        
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)

        model = LinearRegression()
        model.fit(x_train, y_train)

        train_predictions = model.predict(x_train)
        test_predictions = model.predict(x_test)

        train_r2 = r2_score(y_train, train_predictions)
        test_r2 = r2_score(y_test, test_predictions)

        r2_list.append({"train_r2":train_r2,"test_r2":test_r2})

    return r2_list

    

results=perform_linear_regression(df)
results

mean_train_r2 = sum(entry['train_r2'] for entry in results) / len(results)
mean_test_r2 = sum(entry['test_r2'] for entry in results) / len(results)

print("Mean train_r2:", mean_train_r2)
print("Mean test_r2:", mean_test_r2)

Mean train_r2: 0.8215168477137613
Mean test_r2: 0.5285087824312887


- The model is very overfit as there is a big difference between the r2score of the training data and the test data.
- Let's see if removing outliers makes a difference

In [5]:

def perform_linear_regression_zsor(df):
    r2_list=[]
    
    for i in range(10):
        x = df.drop(columns=["price"])
        y = df["price"]
        
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)

        x_train,y_train=remove_outliers(x_train,y_train,["beds","baths","rentZestimate","area"])

        model = LinearRegression()
        model.fit(x_train, y_train)

        train_predictions = model.predict(x_train)
        test_predictions = model.predict(x_test)

        train_r2 = r2_score(y_train, train_predictions)
        test_r2 = r2_score(y_test, test_predictions)

        r2_list.append({"train_r2":train_r2,"test_r2":test_r2})

    return r2_list

    

results=perform_linear_regression_zsor(df)
results

mean_train_r2 = sum(entry['train_r2'] for entry in results) / len(results)
mean_test_r2 = sum(entry['test_r2'] for entry in results) / len(results)

print("Mean train_r2:", mean_train_r2)
print("Mean test_r2:", mean_test_r2)


Mean train_r2: 0.7786850583100081
Mean test_r2: 0.7756371541428161


Already a big improvement! Let's try Min Max scaling

In [6]:

def perform_linear_regression_zsor_and_scaling(df):
    r2_list=[]
    
    for i in range(10):
        x = df.drop(columns=["price"])
        y = df["price"]
        scaler = MinMaxScaler()
        x=scaler.fit_transform(x)
        
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)

        x_train,y_train=remove_outliers(x_train,y_train,["beds","baths","rentZestimate","area"])

        
        model = LinearRegression()
        model.fit(x_train, y_train)
        
        train_predictions = model.predict(x_train)
        test_predictions = model.predict(x_test)

        train_r2 = r2_score(y_train, train_predictions)
        test_r2 = r2_score(y_test, test_predictions)

        r2_list.append({"train_r2":train_r2,"test_r2":test_r2})

    return r2_list

    

results=perform_linear_regression_zsor_and_scaling(df)
results

mean_train_r2 = sum(entry['train_r2'] for entry in results) / len(results)
mean_test_r2 = sum(entry['test_r2'] for entry in results) / len(results)

print("Mean train_r2:", mean_train_r2)
print("Mean test_r2:", mean_test_r2)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

Scaling performs horribly our r2 score is below 0!!! Why though everybody says scaling is good?!?!

In [8]:

def perform_linear_regression_zsor_and_scaling(df):
    r2_list=[]
    
    for i in range(10):
        x = df.drop(columns=["price"])
        y = df["price"]
        
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)
        x_train=x_train*10
        x_test=x_test*10
        x_train,y_train=remove_outliers(x_train,y_train,["beds","baths","rentZestimate","area"])

        model = LinearRegression()
        model.fit(x_train, y_train)

        train_predictions = model.predict(x_train)
        test_predictions = model.predict(x_test)

        train_r2 = r2_score(y_train, train_predictions)
        test_r2 = r2_score(y_test, test_predictions)
        r2_list.append({"train_r2":train_r2,"test_r2":test_r2})

    return r2_list

    

results=perform_linear_regression_zsor_and_scaling(df)
results

mean_train_r2 = sum(entry['train_r2'] for entry in results) / len(results)
mean_test_r2 = sum(entry['test_r2'] for entry in results) / len(results)

print("Mean train_r2:", mean_train_r2)
print("Mean test_r2:", mean_test_r2)


Mean train_r2: 0.7786850583100081
Mean test_r2: 0.7756371541428171
