In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import warnings
warnings.filterwarnings('ignore')
url = "https://raw.githubusercontent.com/mtsilimos/url/main/houseprices.csv"

dataset = pd.read_csv(url)

In [None]:
dataset.head()

In [None]:
dataset.describe()

In [2]:
dataset.dropna(axis=1, thresh = 700, inplace = True)

In [3]:
dataset = dataset.fillna(dataset.median(numeric_only=True))

In [None]:
dataset.head()

In [None]:
dataset.info()

In [4]:
numerical_cols = dataset.select_dtypes(exclude=['object']).columns.tolist()

In [5]:
dataset[numerical_cols] = np.log(dataset[numerical_cols])

In [6]:
categorical_cols = dataset.select_dtypes(include=['object']).columns.tolist()

In [7]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import numpy as np

In [8]:
#num_pipeline = make_pipeline(SimpleImputer(missing_values = np.nan, strategy='median'),)

cat_pipeline = make_pipeline(SimpleImputer(missing_values = np.nan, strategy='most_frequent'),)

In [9]:
combined_pipeline = ColumnTransformer([
    ('cat', cat_pipeline, categorical_cols)
], remainder = 'passthrough')


In [10]:
transformed_dataset = combined_pipeline.fit_transform(dataset)

In [11]:
feature_names = combined_pipeline.get_feature_names_out()
intermediate_dataset = pd.DataFrame(transformed_dataset, columns = feature_names)
intermediate_dataset.head()

Unnamed: 0,cat__MSZoning,cat__Street,cat__LotShape,cat__LandContour,cat__Utilities,cat__LotConfig,cat__LandSlope,cat__Neighborhood,cat__Condition1,cat__Condition2,cat__BldgType,cat__HouseStyle,cat__RoofStyle,cat__RoofMatl,cat__Exterior1st,cat__Exterior2nd,cat__ExterQual,cat__ExterCond,cat__Foundation,cat__BsmtQual,cat__BsmtCond,cat__BsmtExposure,cat__BsmtFinType1,cat__BsmtFinType2,cat__Heating,cat__HeatingQC,cat__CentralAir,cat__Electrical,cat__KitchenQual,cat__Functional,cat__FireplaceQu,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__SaleType,cat__SaleCondition,remainder__Id,remainder__MSSubClass,remainder__LotFrontage,remainder__LotArea,remainder__OverallQual,remainder__OverallCond,remainder__YearBuilt,remainder__YearRemodAdd,remainder__MasVnrArea,remainder__BsmtFinSF1,remainder__BsmtFinSF2,remainder__BsmtUnfSF,remainder__TotalBsmtSF,remainder__1stFlrSF,remainder__2ndFlrSF,remainder__LowQualFinSF,remainder__GrLivArea,remainder__BsmtFullBath,remainder__BsmtHalfBath,remainder__FullBath,remainder__HalfBath,remainder__BedroomAbvGr,remainder__KitchenAbvGr,remainder__TotRmsAbvGrd,remainder__Fireplaces,remainder__GarageYrBlt,remainder__GarageCars,remainder__GarageArea,remainder__WoodDeckSF,remainder__OpenPorchSF,remainder__EnclosedPorch,remainder__3SsnPorch,remainder__ScreenPorch,remainder__PoolArea,remainder__MiscVal,remainder__MoSold,remainder__YrSold,remainder__SalePrice
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Gd,Attchd,RFn,TA,TA,Y,WD,Normal,0.0,4.094345,4.174387,9.041922,1.94591,1.609438,7.602401,7.602401,5.278115,6.559615,-inf,5.010635,6.75227,6.75227,6.749931,-inf,7.444249,0.0,-inf,0.693147,0.0,1.098612,0.0,2.079442,-inf,7.602401,0.693147,6.306275,-inf,4.110874,-inf,-inf,-inf,-inf,-inf,0.693147,7.604894,12.247694
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,Gable,CompShg,MetalSd,MetalSd,TA,TA,CBlock,Gd,TA,Gd,ALQ,Unf,GasA,Ex,Y,SBrkr,TA,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal,0.693147,2.995732,4.382027,9.169518,1.791759,2.079442,7.58883,7.58883,-inf,6.88551,-inf,5.648974,7.140453,7.140453,-inf,-inf,7.140453,-inf,0.0,0.693147,-inf,1.098612,0.0,1.791759,0.0,7.58883,0.693147,6.131226,5.697093,-inf,-inf,-inf,-inf,-inf,-inf,1.609438,7.604396,12.109011
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,Gd,TA,PConc,Gd,TA,Mn,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal,1.098612,4.094345,4.219508,9.328123,1.94591,1.609438,7.601402,7.601902,5.087596,6.186209,-inf,6.073045,6.824374,6.824374,6.763885,-inf,7.487734,0.0,-inf,0.693147,0.0,1.098612,0.0,1.791759,0.0,7.601402,0.693147,6.410175,-inf,3.73767,-inf,-inf,-inf,-inf,-inf,2.197225,7.604894,12.317167
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,Gable,CompShg,Wd Sdng,Wd Shng,TA,TA,BrkTil,TA,Gd,No,ALQ,Unf,GasA,Gd,Y,SBrkr,Gd,Typ,Gd,Detchd,Unf,TA,TA,Y,WD,Abnorml,1.386294,4.248495,4.094345,9.164296,1.94591,1.609438,7.557473,7.585789,-inf,5.375278,-inf,6.291569,6.628041,6.867974,6.628041,-inf,7.448334,0.0,-inf,0.0,-inf,1.098612,0.0,1.94591,0.0,7.599902,1.098612,6.464588,-inf,3.555348,5.605802,-inf,-inf,-inf,-inf,0.693147,7.603898,11.849398
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,Gd,TA,PConc,Gd,TA,Av,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal,1.609438,4.094345,4.430817,9.565214,2.079442,1.609438,7.600902,7.600902,5.857933,6.484635,-inf,6.194405,7.04316,7.04316,6.959399,-inf,7.695303,0.0,-inf,0.693147,0.0,1.386294,0.0,2.197225,0.0,7.600902,1.098612,6.728629,5.257495,4.430817,-inf,-inf,-inf,-inf,-inf,2.484907,7.604894,12.429216


In [12]:
categorical_cols = intermediate_dataset.select_dtypes(include=['object']).columns.tolist()

In [13]:
!pip install category_encoders
from category_encoders import TargetEncoder
encoder = TargetEncoder()



In [14]:
from sklearn.model_selection import train_test_split

X = intermediate_dataset['remainder__OverallQual']#.drop(columns=['remainder__SalePrice'])
y = intermediate_dataset['remainder__SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [15]:
X_train = encoder.fit_transform(X_train, y_train)
X_test = encoder.transform(X_test)
X_train.head()

Unnamed: 0,remainder__OverallQual
1390,12.219105
1394,12.219105
574,11.775457
1280,12.219105
221,11.972268


In [None]:
from sklearn.linear_model import RidgeCV
ridge_cv = RidgeCV(alphas=[0.1, 0.001, 0.0001, 0.005, 1, 0.3, 0.5, 0.8, 10, 20, 50, 100, 150])

ridge_cv.fit(X_train, y_train)

ridge_cv.alpha_

In [None]:
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=0.1)
ridge.fit(X_train, y_train)
y_pred= ridge.predict(X_test)
r2_score(y_test, y_pred)

#0.5013 score with ridgeCV(alpha 0.005) and OverallQual, GrLivArea, GarageCars, GarageArea as features (see previous exercises). with log and encoder to the specific object features
#0.69 score with ridgeCV(alpha 0.1) and OverallQual as only feature (see previous exercises). with log and encoder to the specific object features

In [16]:
#lasso cv
from sklearn.linear_model import LassoCV
lasso_cv = LassoCV(alphas=[0.1, 0.001, 0.0001, 0.005, 1, 0.3, 0.5, 0.8, 10, 20, 50, 100, 150])
lasso_cv.fit(X_train, y_train)

lasso_cv.alpha_

0.0001

In [17]:
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.0001)
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)
r2_score(y_test, y_pred_lasso)

#0.505 with lasso and all the features, alpha 0.0001
#0.5995 with lasso and only OverAllQual, alpha 0.0001

0.599510442459291