## PART-1

In [173]:
import pandas as pd
import numpy as np

In [174]:
# Load the Part 1 dataset from the CSV file into `df`.
df = pd.read_csv('Preprocessing3.csv')

In [175]:
# Show (rows, columns) of the loaded frame.
df.shape

(10000, 12)

In [176]:
# Preview the first rows to see the columns and some sample values.
df.head()

Unnamed: 0,Date,Year,Locality,Estimated Value,Sale Price,Property,Residential,num_rooms,num_bathrooms,carpet_area,property_tax_rate,Face
0,2009-01-02,2009,Greenwich,,5187000.0,?,Detached House,3,2,1026.0,1.025953,South
1,2009-01-02,2009,Norwalk,,480000.0,Single Family,Detached House,3,2,1051.0,1.025953,West
2,2009-01-02,2009,Waterbury,57890.0,152000.0,Single Family,Detached House,3,2,943.0,1.025953,East
3,2009-01-02,2009,,44520.0,60000.0,Single Family,Detached House,3,2,1099.0,1.025953,North
4,2009-01-03,2009,Bridgeport,91071.0,250000.0,Two Family,Duplex,4,2,1213.0,1.025953,South


In [177]:
# List the distinct values in Locality (NaN shows up here if present).
df['Locality'].unique()

array(['Greenwich', 'Norwalk', 'Waterbury', nan, 'Bridgeport',
       'Fairfield', 'West Hartford', 'Stamford'], dtype=object)

In [178]:
# Missing-value count for every column.
df.isna().sum()

Date                    0
Year                    0
Locality             1285
Estimated Value      1281
Sale Price              0
Property                0
Residential             0
num_rooms               0
num_bathrooms           0
carpet_area          1282
property_tax_rate       0
Face                    0
dtype: int64

In [179]:
# Per column, count the cells whose value is exactly the string '?'.
df.isin(['?']).sum()

Date                    0
Year                    0
Locality                0
Estimated Value         0
Sale Price              0
Property             1873
Residential             0
num_rooms               0
num_bathrooms           0
carpet_area             0
property_tax_rate       0
Face                    0
dtype: int64

In [180]:
cols = df.columns
# Print the name of every column in which the '?' placeholder occurs.
for col in cols:
    unique_vals = df[col].unique()
    if '?' not in unique_vals:
        continue
    print(col)

Property


In [181]:
df['Property'].unique()

array(['?', 'Single Family', 'Two Family', 'Three Family', 'Four Family'],
      dtype=object)

In [182]:
# Shape of the subset: 2022 sales in Greenwich with at least 3 rooms
# whose Face is North or East.
is_2022 = df['Year'] == 2022
in_greenwich = df['Locality'] == 'Greenwich'
has_3_plus_rooms = df['num_rooms'] >= 3
faces_north_or_east = df['Face'].isin(['North', 'East'])
df[is_2022 & in_greenwich & has_3_plus_rooms & faces_north_or_east].shape

(35, 12)

In [183]:
# Replace the "?" placeholder in Property with np.nan so it counts as missing.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the column selection: that chained in-place form triggers a pandas warning
# and does not update `df` when Copy-on-Write is enabled.
df['Property'] = df['Property'].replace('?', np.nan)

In [184]:
# Time-based split: rows with Year before 2021 are the training set,
# rows from 2021 onwards are the test set. 'Sale Price' is the target.
train_rows = df[df['Year'] < 2021]
test_rows = df[df['Year'] >= 2021]

X_train = train_rows.drop(columns=['Sale Price'])
X_test = test_rows.drop(columns=['Sale Price'])
y_train = train_rows['Sale Price']
y_test = test_rows['Sale Price']

In [185]:
# Print the shape of each split, one per line, in the order
# X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(8272, 11)
(1728, 11)
(8272,)
(1728,)


In [186]:
# Peek at the first training row.
X_train.head(1)

Unnamed: 0,Date,Year,Locality,Estimated Value,Property,Residential,num_rooms,num_bathrooms,carpet_area,property_tax_rate,Face
0,2009-01-02,2009,Greenwich,,,Detached House,3,2,1026.0,1.025953,South


In [187]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [188]:
# Imputation plan: (step name, SimpleImputer strategy, column to impute).
imputation_plan = [
    ('locality', 'most_frequent', 'Locality'),
    ('est_val', 'median', 'Estimated Value'),
    ('carpet', 'mean', 'carpet_area'),
    ('prop', 'most_frequent', 'Property'),
]

# One imputer per listed column; every other column is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        (step, SimpleImputer(strategy=strategy), [column])
        for step, strategy, column in imputation_plan
    ],
    remainder='passthrough',
)

# Return pandas DataFrames from transform instead of NumPy arrays.
preprocessor.set_output(transform='pandas')

In [189]:
# Fit `preprocessor` on the training features.
preprocessor.fit(X_train)

In [190]:
# Show one training row whose carpet_area is missing.
X_train[X_train['carpet_area'].isna()].head(1)

Unnamed: 0,Date,Year,Locality,Estimated Value,Property,Residential,num_rooms,num_bathrooms,carpet_area,property_tax_rate,Face
14,2009-01-05,2009,Bridgeport,97719.0,Two Family,Duplex,4,2,,1.025953,North


In [191]:
# Apply the fitted preprocessor to the training features.
X_train_proc = preprocessor.transform(X_train)

In [192]:
# Look at the row at position 14 of the transformed training frame.
X_train_proc.iloc[14]

locality__Locality               Bridgeport
est_val__Estimated Value            97719.0
carpet__carpet_area             1113.403351
prop__Property                   Two Family
remainder__Date                  2009-01-05
remainder__Year                        2009
remainder__Residential               Duplex
remainder__num_rooms                      4
remainder__num_bathrooms                  2
remainder__property_tax_rate       1.025953
remainder__Face                       North
Name: 14, dtype: object

In [193]:
# Show one training row whose Property is missing.
X_train[X_train['Property'].isna()].head(1)

Unnamed: 0,Date,Year,Locality,Estimated Value,Property,Residential,num_rooms,num_bathrooms,carpet_area,property_tax_rate,Face
0,2009-01-02,2009,Greenwich,,,Detached House,3,2,1026.0,1.025953,South


In [194]:
# First row of the transformed training frame.
X_train_proc.head(1)

Unnamed: 0,locality__Locality,est_val__Estimated Value,carpet__carpet_area,prop__Property,remainder__Date,remainder__Year,remainder__Residential,remainder__num_rooms,remainder__num_bathrooms,remainder__property_tax_rate,remainder__Face
0,Greenwich,230360.0,1026.0,Single Family,2009-01-02,2009,Detached House,3,2,1.025953,South


In [195]:
# Column dtypes of the transformed training frame.
X_train_proc.dtypes

locality__Locality               object
est_val__Estimated Value        float64
carpet__carpet_area             float64
prop__Property                   object
remainder__Date                  object
remainder__Year                   int64
remainder__Residential           object
remainder__num_rooms              int64
remainder__num_bathrooms          int64
remainder__property_tax_rate    float64
remainder__Face                  object
dtype: object

In [196]:
# Names of the object-dtype columns in the transformed training frame.
X_train_proc.select_dtypes(['object']).columns

Index(['locality__Locality', 'prop__Property', 'remainder__Date',
       'remainder__Residential', 'remainder__Face'],
      dtype='object')

In [197]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
# Remove the date columns before refitting the imputers. The frame is
# reassigned (no inplace=True) and errors='ignore' is passed so that this cell
# is idempotent: re-running it after the columns are already gone no longer
# raises KeyError.
X_train = X_train.drop(columns=['Date', 'Year'], errors='ignore')
preprocessor.fit(X_train)
X_train_proc = preprocessor.transform(X_train)

# Route columns by the dtype they have after imputation:
# numeric columns are min-max scaled, object columns are one-hot encoded.
cat_cols = X_train_proc.select_dtypes(['object']).columns
num_cols = X_train_proc.select_dtypes([np.number]).columns
preprocessor_v2 = ColumnTransformer([
    ('scaling', MinMaxScaler(), num_cols),
    ('ohe', OneHotEncoder(sparse_output=False), cat_cols)
])

# Return pandas DataFrames from transform instead of NumPy arrays.
preprocessor_v2.set_output(transform='pandas')

In [198]:
# Fit preprocessor_v2 on the imputed training frame and display the transformed result.
preprocessor_v2.fit_transform(X_train_proc)

Unnamed: 0,scaling__est_val__Estimated Value,scaling__carpet__carpet_area,scaling__remainder__num_rooms,scaling__remainder__num_bathrooms,scaling__remainder__property_tax_rate,ohe__locality__Locality_Bridgeport,ohe__locality__Locality_Fairfield,ohe__locality__Locality_Greenwich,ohe__locality__Locality_Norwalk,ohe__locality__Locality_Stamford,...,ohe__prop__Property_Three Family,ohe__prop__Property_Two Family,ohe__remainder__Residential_Detached House,ohe__remainder__Residential_Duplex,ohe__remainder__Residential_Fourplex,ohe__remainder__Residential_Triplex,ohe__remainder__Face_East,ohe__remainder__Face_North,ohe__remainder__Face_South,ohe__remainder__Face_West
0,0.013548,0.060316,0.0,0.142857,0.013981,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.013548,0.072283,0.0,0.142857,0.013981,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.003405,0.020584,0.0,0.142857,0.013981,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.002618,0.095261,0.0,0.142857,0.013981,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.005356,0.149832,0.2,0.142857,0.013981,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8267,0.018278,0.043562,0.0,0.285714,0.012704,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8268,0.004174,0.057922,0.0,0.000000,0.012704,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8269,0.004977,0.102156,0.0,0.142857,0.012704,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8270,0.008910,0.102156,0.0,0.142857,0.012704,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## PART-2

In [199]:
import pandas as pd

In [209]:
"""
Split the dataset into train dataset and test dataset in the following manner :
- data (rows) index [0, 8271] should be the train dataset
- data (rows) index from 8272 till last row should be the test dataset
- columns except of the label(Sale Price) vector should be the feature matrix (X_train or X_test)
- make label vector (Y_train or y_test) containing values only from the target feature.
"""
# Position of the first test row; rows before it form the training set.
TRAIN_ROWS = 8272

df = pd.read_csv('Model_Building_1.csv')
train_part = df.iloc[:TRAIN_ROWS]
test_part = df.iloc[TRAIN_ROWS:]

# Features are every column except the 'Sale Price' target.
X_train = train_part.drop(columns=['Sale Price'])
X_test = test_part.drop(columns=['Sale Price'])
y_train = train_part['Sale Price']
y_test = test_part['Sale Price']

In [210]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Baseline: fit a plain linear regression and report R^2 on the test rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.6468929065748328

In [211]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation scores of a fresh LinearRegression on the training set.
cross_val_score(estimator=LinearRegression(), X=X_train, y=y_train, cv=5)

array([0.63553989, 0.76800107, 0.74169609, 0.81473138, 0.66269374])

In [217]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error

# Fit Ridge, then Lasso, with the same seed and print each test-set MSE.
# After the loop, `model` and `y_pred` refer to the Lasso fit.
for model in (Ridge(random_state=27), Lasso(random_state=27)):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(mean_squared_error(y_test, y_pred))

301994665976.23914
305853086563.8274


  model = cd_fast.enet_coordinate_descent(


In [218]:
from sklearn.linear_model import SGDRegressor

# SGD-based linear regressor limited to 100 iterations; report R^2 on the test rows.
model = SGDRegressor(
    max_iter=100,
    warm_start=True,
    random_state=27,
).fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)



0.5267937468412169

In [223]:
"""
create a pipeline of the PolynomialFeatures(interaction_only=True) as transformer and Lasso as an estimator.

Use GridSearchCV for tuning the hyperparameters of the created pipeline on training dataset.

Keep polynomial degree as : [1,2]
lasso alpha value to be taken as : [10,100,1000,10000]
scoring : neg_mean_absolute_error.
cv = 5
n_jobs = -1 (negative one) [it helps in using all the computational power to run this job]
"""
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

# Interaction-only polynomial expansion feeding a Lasso regressor.
pipe = Pipeline(steps=[
    ('poly', PolynomialFeatures(interaction_only=True)),
    ('lasso', Lasso()),
])

# Search space: 2 polynomial degrees x 4 Lasso alphas.
pg = dict(
    poly__degree=[1, 2],
    lasso__alpha=[10, 100, 1000, 10000],
)

clf = GridSearchCV(
    estimator=pipe,
    param_grid=pg,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
)
clf.fit(X_train, y_train)

In [224]:
# Hyperparameter combination selected by the grid search.
clf.best_params_

{'lasso__alpha': 1000, 'poly__degree': 1}

In [225]:
from sklearn.decomposition import PCA

# PCA keeping 13 whitened components, fitted on the training features only.
pca = PCA(n_components=13, svd_solver='full', whiten=True, random_state=27)

pca.fit(X_train)

In [227]:
# Sum of the explained-variance ratios of the fitted PCA components.
pca.explained_variance_ratio_.sum()

0.9704054622920403

In [230]:
from sklearn.linear_model import RidgeCV

# Project both splits onto the components of the already-fitted PCA.
pca_X_train = pca.transform(X_train)
pca_X_test = pca.transform(X_test)

# Ridge regression that picks alpha from the listed values via 5-fold CV.
model = RidgeCV(alphas=[0.1, 1, 10, 100, 1000, 10000], cv=5)
model.fit(pca_X_train, y_train)

y_pred = model.predict(pca_X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.2963684940402008

In [232]:
from sklearn.feature_selection import SelectKBest, f_regression

# Keep the 10 features with the highest f_regression scores, then fit a Lasso.
top10_selector = SelectKBest(score_func=f_regression, k=10)
lasso_estimator = Lasso(random_state=27)
pipe = Pipeline(steps=[
    ('select10', top10_selector),
    ('lasso', lasso_estimator),
])

pipe.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [239]:
# Check the type of the pipeline's first step.
type(pipe[0])

In [241]:
# Largest value in the first step's pvalues_ attribute.
pipe[0].pvalues_.max()

0.841406848285681

In [242]:
# Largest value in the first step's scores_ attribute.
pipe[0].scores_.max()

21322.291974025855

In [243]:
# R^2 of the fitted pipeline on the test rows.
y_pred = pipe.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.652032343957194