In [29]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the training split and preview its first five rows.
train_df = pd.read_csv(filepath_or_buffer='train_new.csv')
train_df.head(n=5)

Unnamed: 0,SalePrice,PID,Lot Frontage,Lot Area,Street,Neighborhood,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Roof Style,Heating,Central Air,Electrical,Full Bath,Half Bath,Bedroom AbvGr,TotRms AbvGrd,Gr Liv Area,Functional,Screen Porch,Pool Area,Yr Sold,Sale Type
0,159000,531363010,80.0,9605,Pave,SawyerW,1Fam,1Story,7,6,2007,Gable,GasA,Y,SBrkr,1,1,3,6,1218,Typ,0,0,2009,WD
1,271900,906203120,90.0,14684,Pave,SawyerW,1Fam,1Story,7,7,1990,Hip,GasA,Y,SBrkr,2,0,3,7,2196,Typ,0,0,2009,WD
2,137500,916176030,,14375,Pave,Timber,1Fam,SLvl,6,6,1958,Gable,GasA,Y,FuseA,1,0,3,7,1344,Typ,233,0,2009,COD
3,248500,528180130,48.0,6472,Pave,NridgHt,TwnhsE,1Story,9,5,2008,Hip,GasA,Y,SBrkr,2,0,2,6,1456,Typ,0,0,2009,WD
4,167000,528290030,61.0,9734,Pave,Gilbert,1Fam,SLvl,7,5,2004,Gable,GasA,Y,SBrkr,2,1,3,7,1374,Typ,0,0,2009,WD


In [3]:
# Load the held-out split (no SalePrice column in the preview) and preview its first five rows.
test_df = pd.read_csv(filepath_or_buffer='test_new.csv')
test_df.head(n=5)

Unnamed: 0,PID,Lot Frontage,Lot Area,Street,Neighborhood,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Roof Style,Heating,Central Air,Electrical,Full Bath,Half Bath,Bedroom AbvGr,TotRms AbvGrd,Gr Liv Area,Functional,Screen Porch,Pool Area,Yr Sold,Sale Type
0,907135180,60,8070,Pave,CollgCr,1Fam,1Story,4,5,1994,Gable,GasA,Y,SBrkr,1,0,3,5,990,Typ,0,0,2007,WD
1,528181040,40,6792,Pave,NridgHt,TwnhsE,1Story,7,5,2005,Gable,GasA,Y,SBrkr,2,0,2,6,1368,Typ,0,0,2006,New
2,528175010,44,6371,Pave,NridgHt,TwnhsE,1Story,7,5,2009,Gable,GasA,Y,SBrkr,2,0,2,6,1358,Typ,0,0,2010,New
3,531379030,70,8304,Pave,SawyerW,1Fam,2Story,6,5,1997,Gable,GasA,Y,SBrkr,2,1,3,7,1837,Typ,0,0,2006,WD
4,923275090,37,6951,Pave,Mitchel,1Fam,1Story,5,5,1984,Gable,GasA,Y,SBrkr,1,0,3,5,923,Typ,0,0,2008,WD


In [60]:
# Keep only complete rows in both splits: any row with at least one missing value is removed.
# NOTE(review): rows removed from test_df receive no prediction in the exported results - confirm
# that is acceptable for the submission.
train_df = train_df.dropna(axis=0, how='any')
test_df = test_df.dropna(axis=0, how='any')

In [61]:
# Model the natural log of the sale price; predictions are exponentiated back later on.
sale_price = train_df['SalePrice']
train_df['LogSalePrice'] = np.log(sale_price)

In [62]:
# Target is the log price; features are everything except both price columns and the PID identifier.
y_train = train_df['LogSalePrice']
X_train = train_df.drop(columns=['LogSalePrice', 'SalePrice', 'PID'])

In [20]:
# Preprocessing step used by the model pipelines in this notebook:
#   - numeric columns are standardized,
#   - object (string) columns are one-hot encoded; categories unseen at fit time are ignored
#     instead of raising,
#   - any column matched by neither selector is passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        (
            'standardize',
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
        (
            'dummify',
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            make_column_selector(dtype_include=object),
        ),
    ],
    remainder='passthrough',
)

In [63]:
# Baseline model: ordinary least squares on top of the shared preprocessing.
lr_pipline = Pipeline(
    steps=[
        ('preprocessing', ct),
        ('regression', LinearRegression()),
    ]
)

# Fit once on the full training set and keep the fitted pipeline.
lr_model = lr_pipline.fit(X_train, y_train)

# 5-fold cross-validated RMSE on the log-price target. The scorer returns negated RMSE,
# so the sign is flipped to report a positive error.
# NOTE(review): the recorded value for this cell is many orders of magnitude larger than the
# ridge / elastic-net scores further down - presumably the unregularised fit is numerically
# unstable on some folds; confirm before relying on this number.
-cross_val_score(
    estimator=lr_pipline,
    X=X_train,
    y=y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
).mean().round(2)

7063705613.12

In [31]:
# Ridge regression on the shared preprocessing, tuned over the penalty strength alpha.
ridge_pipeline = Pipeline(
    steps=[
        ('preprocessing', ct),
        ('ridge', Ridge()),
    ]
)

# Candidate penalties; the 'ridge__' prefix routes the parameter to the pipeline's 'ridge' step.
alpha_values = [0.001, 0.01, 0.1, 1, 10]
alphas = {'ridge__alpha': alpha_values}

# 5-fold grid search scored by negated RMSE (higher is better, hence the sign flip when printing).
gscv = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=alphas,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
gscv.fit(X_train, y_train)

best_lambda_ridge = gscv.best_params_['ridge__alpha']
best_model_ridge = gscv.best_estimator_
print(f"Best lambda: {best_lambda_ridge}")
print(f"Best Model RMSE: {-gscv.best_score_}")

Best lambda: 0.1
Best Model RMSE: 0.15307762364212502
Best lambda: 0.1
Best Model RMSE: 0.15307762364212502


In [64]:
# Elastic net on the shared preprocessing, tuned jointly over the penalty strength (alpha)
# and the L1/L2 mixing weight (l1_ratio).
elastic_pipeline = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("elastic", ElasticNet()),
    ]
)

# The 'elastic__' prefix routes each parameter to the pipeline's 'elastic' step.
params = {
    "elastic__alpha": [0.001, 0.01, 0.1, 1, 10],
    "elastic__l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
}

# 5-fold grid search scored by negated RMSE. Note this rebinds `gscv`, replacing any
# earlier search object stored under the same name.
gscv = GridSearchCV(
    estimator=elastic_pipeline,
    param_grid=params,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
gscv.fit(X_train, y_train)

best_param_elastic = gscv.best_params_
best_elastic_model = gscv.best_estimator_

best_score = gscv.best_score_

print(f"Best Parameters: {best_param_elastic}")
print(f"Best Model RMSE: {-gscv.best_score_}")

Best Parameters: {'elastic__alpha': 0.001, 'elastic__l1_ratio': 0.3}
Best Model RMSE: 0.15263411303401705
Best Parameters: {'elastic__alpha': 0.001, 'elastic__l1_ratio': 0.3}
Best Model RMSE: 0.15263411303401705


In [55]:
# Pair every post-preprocessing feature name with its fitted elastic-net coefficient.
feature_names = best_elastic_model.named_steps['preprocessing'].get_feature_names_out()
elastic_coefs = best_elastic_model.named_steps['elastic'].coef_

# Coefficient table ordered by absolute size, largest first. sort_values returns a new
# frame, so the result has to be assigned - a bare call would leave coef_df unsorted.
coef_df = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': elastic_coefs})
    .sort_values(by='Coefficient', key=abs, ascending=False)
)

# Same information as a dict ordered by absolute size, smallest first, so the coefficients
# shrunk to zero are listed at the top of the displayed output.
coefficients = dict(zip(feature_names, elastic_coefs))
sorted_coefficients = dict(sorted(coefficients.items(), key=lambda item: abs(item[1])))
sorted_coefficients

{'standardize__TotRms AbvGrd': 0.0,
 'dummify__Street_Grvl': -0.0,
 'dummify__Street_Pave': 0.0,
 'dummify__Neighborhood_Blmngtn': -0.0,
 'dummify__Neighborhood_Blueste': 0.0,
 'dummify__Neighborhood_Gilbert': -0.0,
 'dummify__Neighborhood_Mitchel': -0.0,
 'dummify__Neighborhood_NPkVill': -0.0,
 'dummify__Neighborhood_Sawyer': 0.0,
 'dummify__Bldg Type_TwnhsE': -0.0,
 'dummify__House Style_1.5Fin': 0.0,
 'dummify__House Style_1.5Unf': -0.0,
 'dummify__House Style_2.5Unf': -0.0,
 'dummify__Roof Style_Gambrel': 0.0,
 'dummify__Roof Style_Mansard': -0.0,
 'dummify__Roof Style_Shed': -0.0,
 'dummify__Heating_OthW': -0.0,
 'dummify__Electrical_FuseA': 0.0,
 'dummify__Electrical_FuseP': -0.0,
 'dummify__Electrical_Mix': -0.0,
 'dummify__Functional_Maj1': -0.0,
 'dummify__Functional_Mod': 0.0,
 'dummify__Functional_Sev': -0.0,
 'dummify__Sale Type_CWD': -0.0,
 'dummify__Sale Type_Con': 0.0,
 'dummify__Sale Type_ConLI': -0.0,
 'dummify__Sale Type_ConLw': 0.0,
 'dummify__Sale Type_Oth': -0.0,
 

In [66]:
# Test features: drop the PID identifier, mirroring how the training features were built.
X_test = test_df.drop(columns=['PID'])

In [69]:
# Final model: elastic net refit on all training rows with the hyperparameters chosen by the
# grid search (best_param_elastic). Reading them from the search result, instead of retyping
# the printed values, keeps this cell correct if the grid or the data ever changes.
elastic_pipeline = Pipeline(
    [("preprocessing", ct),
     ("elastic", ElasticNet())]
).set_params(**best_param_elastic)

elastic_model = elastic_pipeline.fit(X_train, y_train)

# The model was trained on log(SalePrice), so exponentiate to return to the price scale.
y_pred = np.exp(elastic_model.predict(X_test))

# One row per test house: identifier plus predicted sale price.
final_results_elastic = pd.DataFrame({'PID': test_df['PID'], 'SalePrice': y_pred})
final_results_elastic

Unnamed: 0,PID,SalePrice
0,907135180,125954.446641
1,528181040,219217.630869
2,528175010,218565.801246
3,531379030,185684.008740
4,923275090,128637.782359
...,...,...
600,528174060,179479.533565
601,903400180,172523.742471
602,903227150,129953.501486
603,909250070,161321.804700


In [70]:
# Write the predictions to disk without the DataFrame index column.
final_results_elastic.to_csv(path_or_buf='final_results_elastic.csv', index=False)