In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint


In [3]:
# Linear Regression on Housing Prices Dataset 

In [4]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# columns to omit
df_train = df_train.drop(columns=['MiscFeature', 'PoolQC'])
df_test = df_test.drop(columns=['MiscFeature', 'PoolQC'])

SalePrice = df_train['SalePrice']

# replacing nans with missing
df_train[df_train.select_dtypes(include='object').columns] = df_train.select_dtypes(include='object').fillna("missing")
df_test[df_test.select_dtypes(include='object').columns] = df_test.select_dtypes(include='object').fillna("missing")

df_train['SalePrice'] <- np.log(df_train['SalePrice'])

# replace missing int values with mean of column
numeric_cols = df_train.select_dtypes(include='number').columns
df_train[numeric_cols] = df_train[numeric_cols].fillna(df_train[numeric_cols].mean())

numeric_cols = df_test.select_dtypes(include='number').columns
df_test[numeric_cols] = df_test[numeric_cols].fillna(df_test[numeric_cols].mean())

# one hot encode all variables (df_train)
categorical_columns = df_train.select_dtypes(include=['object']).columns.tolist()
encoder = OneHotEncoder(sparse_output=False)

one_hot_encoded = encoder.fit_transform(df_train[categorical_columns])

one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

df_t = pd.concat([df_train, one_hot_df], axis=1)
df_t = df_t.drop(categorical_columns, axis=1)

# one hot encode all variables (df_test)

categorical_columns = df_test.select_dtypes(include=['object']).columns.tolist()
encoder = OneHotEncoder(sparse_output=False)

one_hot_encoded = encoder.fit_transform(df_test[categorical_columns])

one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

df_te = pd.concat([df_test, one_hot_df], axis=1)
df_te = df_te.drop(categorical_columns, axis=1)

# select columns that only exist in both df_t and df_te
same_cols = df_t.columns.intersection(df_te.columns)

df_t = df_t[same_cols]
df_te = df_te[same_cols]

# add sale price data back to df_t dataset
df_t['Sale Price'] = SalePrice

df_train = df_t
df_test = df_te

In [5]:
# Linear Regression Model

In [6]:
regr = linear_model.LinearRegression()
regr.fit(df_train.iloc[:,0:278], df_train.iloc[:,278])

In [7]:
pred = regr.predict(df_test)
pred_train = regr.predict(df_train.iloc[:, 0:278])

pred

array([107582.87635029, 136718.75249608, 179662.32912909, ...,
       162145.91685126, 104754.56743079, 222029.42650147])

In [8]:
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(df_train.iloc[:,278], pred_train))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(df_train.iloc[:,278], pred_train))

Coefficients: 
 [ 1.54723269e-01 -1.20874203e+02 -7.03140863e+01  4.97669775e-01
  8.19039206e+03  4.67971027e+03  1.85415379e+02  9.08158667e+01
  2.10840767e+01  2.94044217e+00  3.98246893e+00 -2.47493009e+00
  4.44798645e+00  1.18557178e+01  2.93843289e+01 -8.86145459e+00
  3.23785841e+01  4.40482917e+03  1.94317023e+03  6.70529876e+03
  8.70118466e+02 -3.33300597e+03 -1.24364678e+04  2.33201772e+03
  4.50539329e+03 -8.46902526e+01  1.12129912e+04  3.07930277e-01
  1.50918201e+01  1.87401717e+00  1.01934729e+01  5.02139269e+01
  3.24355380e+01  3.70733112e+01 -1.01481219e+00 -3.94608483e+02
 -7.82835088e+02 -2.36361282e+04  1.14698347e+04  2.72075001e+03
  5.88573690e+03  3.55980654e+03 -1.39542251e+04  1.39542251e+04
  1.68306210e+02  7.86757329e+02 -9.55063538e+02  4.41328912e+03
  6.37383330e+03 -1.63502403e+04  5.56311786e+03 -8.09381679e+03
  5.41994739e+03 -1.38533338e+03  4.05920278e+03  6.05570874e+04
  4.21502778e+03  1.48297519e+04 -5.68587244e+03 -1.51885679e+04
  1.82966

In [9]:
df_train.iloc[:,278]

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: Sale Price, Length: 1460, dtype: int64

In [10]:
pred_train

array([207826.02270052, 205699.52443328, 208076.08302611, ...,
       272102.57641602, 151680.16740126, 138022.12285438])

In [11]:
# elastic net
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Separate features (X) and target variable (y)
X = df_train.iloc[:,0:278]
y = df_train.iloc[:,278]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an instance of the ElasticNet model
elastic_net = ElasticNet(alpha=0.5, l1_ratio=0.7)

# Fit the model to the training data
elastic_net.fit(X_train, y_train)
print('Elastic Net model trained successfully.')

# Make predictions on the test data
y_pred = elastic_net.predict(X_test)
print('Predictions made on the test data.')

# Print the coefficients of the trained model
print('Elastic Net coefficients:')
print(elastic_net.coef_)

print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

Elastic Net model trained successfully.
Predictions made on the test data.
Elastic Net coefficients:
[-2.57126171e+00 -1.89352263e+02 -1.86685530e+02  4.14614969e-01
  1.19310762e+04  4.61599581e+03  3.12396995e+02  1.57317809e+02
  2.50162704e+01  1.09344385e+01  6.69288544e+00  3.15833595e+00
  2.03428535e+00  3.91535045e+01  4.59793088e+01  3.35709205e+01
  5.97208068e+00  4.39117205e+03 -1.05123161e+03  1.82129679e+03
  5.93659607e+02 -2.75253919e+03 -1.78655099e+03  2.48740635e+03
  3.32418532e+03 -7.89075604e-01  5.61858555e+03  2.36826956e+01
  2.27867756e+01 -8.25515506e+00  1.38206911e+01  4.81643444e+01
  6.33834823e+01 -9.45198809e+00 -1.38182392e+00 -1.73547532e+02
 -2.47869508e+02 -3.47286555e+02  1.10118533e+03 -4.86649016e+01
  1.53926552e+03 -2.24216606e+03 -2.71198851e+02  2.71198851e+02
 -4.54750034e+02 -1.25371341e+02  5.82454709e+02  6.30751167e+02
  1.83302181e+03 -1.85475099e+03 -6.09021985e+02 -3.56386695e+03
  3.20430588e+03  8.90856535e+01  2.65808755e+02  2.70

  model = cd_fast.enet_coordinate_descent(


In [12]:
# CV and paramter tuning
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import ElasticNet
from sklearn.metrics import make_scorer, mean_squared_error

X = df_train.iloc[:, 0:278]
y = df_train.iloc[:, 278]

# Define the ElasticNet model
elastic_net = ElasticNet()

# Define the parameter grid to search
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 5.0, 10.0],    # Regularization strength
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]  # Mix between L1 (Lasso) and L2 (Ridge)
}

# Set up k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the scoring metric (negative MSE for regression tasks)
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=elastic_net,
    param_grid=param_grid,
    cv=kf,
    scoring=scorer,
    verbose=1
)

# Perform the grid search
grid_search.fit(X, y)

# Print the best parameters and corresponding score
print("Best Parameters:", grid_search.best_params_)
print("Best MSE:", -grid_search.best_score_)  # Convert back to positive MSE

Fitting 5 folds for each of 25 candidates, totalling 125 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best Parameters: {'alpha': 0.1, 'l1_ratio': 0.9}
Best MSE: 1247779115.4690654


  model = cd_fast.enet_coordinate_descent(


In [25]:
# random forest
# split train data into train and test since this has sale price

In [26]:
df_train

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Sale Price
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,0.0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,175000
1456,1457,20,85.0,13175,6,6,1978,1988,119.0,790,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,210000
1457,1458,70,66.0,9042,7,9,1941,2006,0.0,275,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,266500
1458,1459,20,68.0,9717,5,6,1950,1996,0.0,49,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,142125


In [27]:
# Split the data into features (X) and target (y)
X = df_train.drop(columns = ['Sale Price'])
y = df_train['Sale Price']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [31]:
rf = RandomForestClassifier(n_estimators=1000, max_features = 30)
rf.fit(X_train, y_train)

In [32]:
y_pred = rf.predict(X_test)

In [33]:
y_pred

array([290000, 171000, 264132, 136500,  89500, 274000, 205000, 160000,
       163500, 207500, 136905, 302000, 178900, 153000, 157900, 178000,
       180500, 140000, 176000, 143000,  55993, 145000, 214000, 260000,
       207000, 252000, 140000, 157000,  79500, 113000, 215000, 171000,
       137500, 275000,  85000, 142600, 165000, 100000, 197900, 290000,
       475000,  91000, 135000,  81000, 187750, 160000, 135000, 236500,
       140000, 134500, 205000, 155000, 115000, 236000, 415298, 212900,
       115000, 117000, 128900,  37900, 130500, 276000, 142600, 119000,
       100000, 430000, 290000, 180000, 140000, 143000, 228500, 110000,
       227680, 180000, 195000, 205000, 155000, 318000, 172500, 203000,
       110000, 165000, 248900, 143000, 232000, 285000, 205000, 124000,
       129000, 136500, 175000, 165000, 153900, 190000, 135000, 260000,
       135000, 232000, 240000, 134432, 140000,  92900, 119000, 193500,
       201000, 446261, 118000, 123600, 290000, 188000, 129000, 232000,
      

In [30]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.017123287671232876


In [35]:
# XG Boost
import xgboost as xgb


In [36]:
X = df_train.drop(columns = ['Sale Price'])
y = df_train['Sale Price']

In [40]:
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",  
    random_state=42,
    eta=0.1,                      
    max_depth=6,                 
    eval_metric="rmse",           
    n_estimators=1000,                    
)
xgb_model.fit(X, y)

y_pred = xgb_model.predict(X)

mse=mean_squared_error(y, y_pred)


In [41]:
mse

2806.0298648468433

In [None]:
xgb_model = xgb.XGBRegressor( )


In [1]:
# K nearest neighbors

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Load the iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create a KNN classifier with k=3
knn = KNeighborsClassifier(n_neighbors=3)

# Fit the model on the training data
knn.fit(X_train, y_train)

# Make predictions on the test data
y_pred = knn.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9666666666666667


In [3]:
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3