In [32]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score


In [4]:
# Linear Regression on Housing Prices Dataset 

In [73]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# columns to omit
df_train = df_train.drop(columns=['MiscFeature', 'PoolQC'])
df_test = df_test.drop(columns=['MiscFeature', 'PoolQC'])

SalePrice = df_train['SalePrice']

# replacing nans with missing
df_train[df_train.select_dtypes(include='object').columns] = df_train.select_dtypes(include='object').fillna("missing")
df_test[df_test.select_dtypes(include='object').columns] = df_test.select_dtypes(include='object').fillna("missing")

df_train['SalePrice'] <- np.log(df_train['SalePrice'])

# replace missing int values with mean of column
numeric_cols = df_train.select_dtypes(include='number').columns
df_train[numeric_cols] = df_train[numeric_cols].fillna(df_train[numeric_cols].mean())

numeric_cols = df_test.select_dtypes(include='number').columns
df_test[numeric_cols] = df_test[numeric_cols].fillna(df_test[numeric_cols].mean())

# one hot encode all variables (df_train)
categorical_columns = df_train.select_dtypes(include=['object']).columns.tolist()
encoder = OneHotEncoder(sparse_output=False)

one_hot_encoded = encoder.fit_transform(df_train[categorical_columns])

one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

df_t = pd.concat([df_train, one_hot_df], axis=1)
df_t = df_t.drop(categorical_columns, axis=1)

# one hot encode all variables (df_test)

categorical_columns = df_test.select_dtypes(include=['object']).columns.tolist()
encoder = OneHotEncoder(sparse_output=False)

one_hot_encoded = encoder.fit_transform(df_test[categorical_columns])

one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

df_te = pd.concat([df_test, one_hot_df], axis=1)
df_te = df_te.drop(categorical_columns, axis=1)

# select columns that only exist in both df_t and df_te
same_cols = df_t.columns.intersection(df_te.columns)

df_t = df_t[same_cols]
df_te = df_te[same_cols]

# add sale price data back to df_t dataset
df_t['Sale Price'] = SalePrice

df_train = df_t
df_test = df_te

In [60]:
# Linear Regression Model

In [79]:
regr = linear_model.LinearRegression()
regr.fit(df_train.iloc[:,0:278], df_train.iloc[:,278])

In [84]:
pred = regr.predict(df_test)
pred_train = regr.predict(df_train.iloc[:, 0:278])

pred

array([107582.87635029, 136718.75249608, 179662.32912909, ...,
       162145.91685126, 104754.56743079, 222029.42650147])

In [90]:
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(df_train.iloc[:,278], pred_train))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(df_train.iloc[:,278], pred_train))

Coefficients: 
 [ 1.54723269e-01 -1.20874203e+02 -7.03140863e+01  4.97669775e-01
  8.19039206e+03  4.67971027e+03  1.85415379e+02  9.08158667e+01
  2.10840767e+01  2.94044217e+00  3.98246893e+00 -2.47493009e+00
  4.44798645e+00  1.18557178e+01  2.93843289e+01 -8.86145459e+00
  3.23785841e+01  4.40482917e+03  1.94317023e+03  6.70529876e+03
  8.70118466e+02 -3.33300597e+03 -1.24364678e+04  2.33201772e+03
  4.50539329e+03 -8.46902526e+01  1.12129912e+04  3.07930277e-01
  1.50918201e+01  1.87401717e+00  1.01934729e+01  5.02139269e+01
  3.24355380e+01  3.70733112e+01 -1.01481219e+00 -3.94608483e+02
 -7.82835088e+02 -2.36361282e+04  1.14698347e+04  2.72075001e+03
  5.88573690e+03  3.55980654e+03 -1.39542251e+04  1.39542251e+04
  1.68306210e+02  7.86757329e+02 -9.55063538e+02  4.41328912e+03
  6.37383330e+03 -1.63502403e+04  5.56311786e+03 -8.09381679e+03
  5.41994739e+03 -1.38533338e+03  4.05920278e+03  6.05570874e+04
  4.21502778e+03  1.48297519e+04 -5.68587244e+03 -1.51885679e+04
  1.82966

In [89]:
df_train.iloc[:,278]

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: Sale Price, Length: 1460, dtype: int64

In [88]:
pred_train

array([207826.02270052, 205699.52443328, 208076.08302611, ...,
       272102.57641602, 151680.16740126, 138022.12285438])

In [93]:
# elastic net
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Separate features (X) and target variable (y)
X = df_train.iloc[:,0:278]
y = df_train.iloc[:,278]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an instance of the ElasticNet model
elastic_net = ElasticNet(alpha=0.5, l1_ratio=0.7)

# Fit the model to the training data
elastic_net.fit(X_train, y_train)
print('Elastic Net model trained successfully.')

# Make predictions on the test data
y_pred = elastic_net.predict(X_test)
print('Predictions made on the test data.')

# Print the coefficients of the trained model
print('Elastic Net coefficients:')
print(elastic_net.coef_)

print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

Elastic Net model trained successfully.
Predictions made on the test data.
Elastic Net coefficients:
[-2.57126171e+00 -1.89352263e+02 -1.86685530e+02  4.14614969e-01
  1.19310762e+04  4.61599581e+03  3.12396995e+02  1.57317809e+02
  2.50162704e+01  1.09344385e+01  6.69288544e+00  3.15833595e+00
  2.03428535e+00  3.91535045e+01  4.59793088e+01  3.35709205e+01
  5.97208068e+00  4.39117205e+03 -1.05123161e+03  1.82129679e+03
  5.93659607e+02 -2.75253919e+03 -1.78655099e+03  2.48740635e+03
  3.32418532e+03 -7.89075604e-01  5.61858555e+03  2.36826956e+01
  2.27867756e+01 -8.25515506e+00  1.38206911e+01  4.81643444e+01
  6.33834823e+01 -9.45198809e+00 -1.38182392e+00 -1.73547532e+02
 -2.47869508e+02 -3.47286555e+02  1.10118533e+03 -4.86649016e+01
  1.53926552e+03 -2.24216606e+03 -2.71198851e+02  2.71198851e+02
 -4.54750034e+02 -1.25371341e+02  5.82454709e+02  6.30751167e+02
  1.83302181e+03 -1.85475099e+03 -6.09021985e+02 -3.56386695e+03
  3.20430588e+03  8.90856535e+01  2.65808755e+02  2.70

  model = cd_fast.enet_coordinate_descent(
