In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict, train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import get_scorer_names, confusion_matrix, ConfusionMatrixDisplay, classification_report, mean_squared_error
import warnings
from sklearn.impute import SimpleImputer

In [148]:
train = pd.read_csv("CAH-201803-train.csv")

In [3]:
train.head()

Unnamed: 0,id_num,Q1,Q2,political_affiliation,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18
0,1,Male,53,Independent,Liberal,College degree,Black,No,No,No,"Yes, somewhat religious",Pro-Choice,No,No,Behave no differently,5,2,5,No
1,5,Female,66,Independent,Conservative,Some college,White,Yes,No,Yes,"Yes, very religious",Pro-life,Yes,Yes,Less Willing,4,5,4,No
2,7,Female,58,Democrat,Liberal,College degree,White,No,No,No,"Yes, very religious",Pro-Choice,No,No,Behave no differently,5,1,4,Yes
3,8,Male,55,Independent,Moderate,High school or less,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-life,Yes,Yes,Less Willing,4,5,4,Yes
4,9,Male,64,Republican,Conservative,High school or less,White,Yes,Yes,Yes,No,Pro-life,No,No,Behave no differently,5,1,1,Yes


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id_num                 169 non-null    int64 
 1   Q1                     169 non-null    object
 2   Q2                     169 non-null    int64 
 3   political_affiliation  169 non-null    object
 4   Q4                     169 non-null    object
 5   Q5                     169 non-null    object
 6   Q6                     169 non-null    object
 7   Q7                     169 non-null    object
 8   Q8                     169 non-null    object
 9   Q9                     169 non-null    object
 10  Q10                    169 non-null    object
 11  Q11                    169 non-null    object
 12  Q12                    169 non-null    object
 13  Q13                    169 non-null    object
 14  Q14                    169 non-null    object
 15  Q15                    

In [152]:
train['Q1'] = train['Q1'].astype('category')
train['Q4'] = train['Q4'].astype('category')
train['Q5'] = train['Q5'].astype('category')
train['Q6'] = train['Q6'].astype('category')
train['Q7'] = train['Q7'].astype('category')
train['Q8'] = train['Q8'].astype('category')
train['Q9'] = train['Q9'].astype('category')
train['Q10'] = train['Q10'].astype('category')
train['Q11'] = train['Q11'].astype('category')
train['Q12'] = train['Q12'].astype('category')
train['Q13'] = train['Q13'].astype('category')
train['Q14'] = train['Q14'].astype('category')
train['Q18'] = train['Q18'].astype('category')

In [15]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   id_num                 169 non-null    int64   
 1   Q1                     169 non-null    category
 2   Q2                     169 non-null    int64   
 3   political_affiliation  169 non-null    object  
 4   Q4                     169 non-null    category
 5   Q5                     169 non-null    category
 6   Q6                     169 non-null    category
 7   Q7                     169 non-null    category
 8   Q8                     169 non-null    category
 9   Q9                     169 non-null    category
 10  Q10                    169 non-null    category
 11  Q11                    169 non-null    category
 12  Q12                    169 non-null    category
 13  Q13                    169 non-null    category
 14  Q14                    169 non-null    cat

In [153]:
X = train.drop(['political_affiliation'], axis = 1)
y = train['political_affiliation']

ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop= "first"), make_column_selector(dtype_include='category')),
    ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

In [155]:


logistic_pipeline = Pipeline([
    ('preprocessing', ct),
    ('logistic', LogisticRegression())
])

log_param_grid = {
    'logistic__C': np.logspace(-4, 4, 20),
    'logistic__penalty': ['l1', 'l2', 'elasticnet', 'none'],  
    'logistic__solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'], 
    'logistic__class_weight': [None, 'balanced'],  
    'logistic__multi_class': ['ovr', 'multinomial','ovo']
}

log_reg_grid_search = GridSearchCV(logistic_pipeline, log_param_grid, cv=10, scoring='accuracy')

log_reg_grid_search.fit(X, y)

best_log_reg_model = log_reg_grid_search.best_estimator_
y_pred_log_reg = best_log_reg_model.predict(X)
y_proba_log_reg = best_log_reg_model.predict_proba(X)[:, 1]

print("Best Logistic Regression Parameters:", log_reg_grid_search.best_params_)

Best Logistic Regression Parameters: {'logistic__C': 1.623776739188721, 'logistic__class_weight': None, 'logistic__multi_class': 'ovr', 'logistic__penalty': 'l1', 'logistic__solver': 'liblinear'}
Best Logistic Regression Parameters: {'logistic__C': 1.623776739188721, 'logistic__class_weight': None, 'logistic__multi_class': 'ovr', 'logistic__penalty': 'l1', 'logistic__solver': 'liblinear'}


In [157]:


logistic_pipeline = Pipeline([
    ('preprocessing', ct),
    ('logistic', LogisticRegression())
])

log_param_grid = {
    'logistic__C': [0.01, 0.1, 1, 10, 100],
    'logistic__penalty': ['l1', 'l2', 'elasticnet', 'none'],  
    'logistic__solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'], 
    'logistic__class_weight': [None, 'balanced'],  
    'logistic__multi_class': ['ovr', 'multinomial'],
    
}

log_reg_grid_search = GridSearchCV(logistic_pipeline, log_param_grid, cv=10, scoring='accuracy')

log_reg_grid_search.fit(X, y)

best_log_reg_model = log_reg_grid_search.best_estimator_
y_pred_log_reg = best_log_reg_model.predict(X)
y_proba_log_reg = best_log_reg_model.predict_proba(X)[:, 1]

print("Best Logistic Regression Parameters:", log_reg_grid_search.best_params_)



Best Logistic Regression Parameters: {'logistic__C': 1, 'logistic__class_weight': 'balanced', 'logistic__multi_class': 'ovr', 'logistic__penalty': 'l1', 'logistic__solver': 'liblinear'}
Best Logistic Regression Parameters: {'logistic__C': 1, 'logistic__class_weight': 'balanced', 'logistic__multi_class': 'ovr', 'logistic__penalty': 'l1', 'logistic__solver': 'liblinear'}


In [156]:
logistic_pipeline1 = Pipeline([
    ('preprocessing', ct),
    ('logistic', LogisticRegression(C=1.623776739188721, class_weight= None, multi_class = "ovr", penalty="l2", solver = "liblinear"))
])

logistic_pipeline1.fit(X, y)

scores = abs(cross_val_score(logistic_pipeline1, X, y, cv=5, scoring='accuracy'))
print(scores.mean())

0.6153297682709448
0.6153297682709448


In [142]:
logistic_pipeline2 = Pipeline([
    ('preprocessing', ct),
    ('logistic', LogisticRegression(C=1, class_weight= 'balanced', multi_class = "ovr", penalty="l1", solver = "liblinear"))
])

logistic_pipeline2.fit(X, y)

scores = abs(cross_val_score(logistic_pipeline2, X, y, cv=5, scoring='accuracy'))
print(scores.mean())

0.6213903743315508
0.6213903743315508


In [43]:
dtree_pipeline = Pipeline([
    ('preprocessing', ct),
    ('dtree', DecisionTreeClassifier())
])

tree_param_grid = {
    'dtree__max_depth': [5, 10, 15, 20],
    'dtree__min_samples_split': [2, 5, 10],
    'dtree__min_samples_leaf': [1, 2, 5],
    'dtree__criterion': ['gini', 'entropy']
}

dtree_grid_search = GridSearchCV(dtree_pipeline, tree_param_grid, cv=5, scoring='accuracy')

dtree_grid_search.fit(X, y)

# Evaluate Logistic Regression
best_dtree_model = dtree_grid_search.best_estimator_
y_pred_dtree = best_dtree_model.predict(X)
y_proba_dtree = best_dtree_model.predict_proba(X)[:, 1]

print("Best Dtree Parameters:", dtree_grid_search.best_params_)
print("Model Accuracy:", accuracy_score(y, y_pred_dtree))

Best Dtree Parameters: {'dtree__criterion': 'entropy', 'dtree__max_depth': 5, 'dtree__min_samples_leaf': 5, 'dtree__min_samples_split': 2}
Model Accuracy: 0.7218934911242604
Best Dtree Parameters: {'dtree__criterion': 'entropy', 'dtree__max_depth': 5, 'dtree__min_samples_leaf': 5, 'dtree__min_samples_split': 2}
Model Accuracy: 0.7218934911242604


In [64]:
dtree_pipeline = Pipeline([
    ('preprocessing', ct),
    ('dtree', DecisionTreeClassifier(criterion="entropy", max_depth=5, min_samples_leaf=5, min_samples_split=2))
])

dtree_pipeline.fit(X, y)

scores = abs(cross_val_score(dtree_pipeline, X, y, cv=5, scoring='accuracy'))
print(scores.mean())

0.4547237076648842
0.4547237076648842


In [59]:
svc_pipeline = Pipeline([
    ('preprocessing', ct), 
    ('svc', SVC(probability=True)) 
])

param_grid = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__degree': [2, 3, 4],
    'svc__kernel': ['linear', 'rbf'],
    'svc__gamma': [0.001, 0.01, 0.1, 1]
}
svc_grid_search = GridSearchCV(svc_pipeline, param_grid, cv=5, scoring='accuracy')

svc_grid_search.fit(X, y)

best_svc_model = svc_grid_search.best_estimator_

y_pred_svc = best_svc_model.predict(X)
y_proba_svc = best_svc_model.predict_proba(X)[:, 1]

print("Best SVC Parameters:", svc_grid_search.best_params_)
print("Model Accuracy:", accuracy_score(y, y_pred_svc))

Best SVC Parameters: {'svc__C': 100, 'svc__degree': 2, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}
Model Accuracy: 0.7100591715976331
Best SVC Parameters: {'svc__C': 100, 'svc__degree': 2, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}
Model Accuracy: 0.7100591715976331


In [67]:
svc_pipeline = Pipeline([
    ('preprocessing', ct),
    ('svc', SVC(C=100, degree=2, gamma=0.001, kernel = "rbf"))
])

svc_pipeline.fit(X, y)

scores = abs(cross_val_score(svc_pipeline, X, y, cv=5, scoring='accuracy'))
print(scores.mean())

0.6329768270944742
0.6329768270944742


In [68]:
svm_pipeline = Pipeline([
    ('preprocessing', ct), 
    ('svm', SVC(kernel="poly", probability=True)) 
])

svm_param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__degree': [2, 3, 4],
    'svm__gamma': ['scale', 'auto', 0.01, 0.1],
    'svm__coef0': [0, 1, 10]
}

svm_grid_search = GridSearchCV(svm_pipeline, svm_param_grid, cv=5, scoring='accuracy')

svm_grid_search.fit(X, y)

best_svm_model = svm_grid_search.best_estimator_

y_pred_svm = best_svm_model.predict(X)
y_proba_svm = best_svm_model.predict_proba(X)[:, 1]

print("Best SVM Parameters:", svm_grid_search.best_params_)
print("Model Accuracy:", accuracy_score(y, y_pred_svm))

Best SVM Parameters: {'svm__C': 1, 'svm__coef0': 10, 'svm__degree': 2, 'svm__gamma': 0.01}
Model Accuracy: 0.7100591715976331
Best SVM Parameters: {'svm__C': 1, 'svm__coef0': 10, 'svm__degree': 2, 'svm__gamma': 0.01}
Model Accuracy: 0.7100591715976331


In [78]:
svm_pipeline = Pipeline([
    ('preprocessing', ct),
    ('svm', SVC(kernel="poly", probability=True, C=1, coef0=10, gamma=0.01, degree = 2))
])

svm_pipeline.fit(X, y)

scores = abs(cross_val_score(best_svc_model, X, y, cv=5, scoring='accuracy'))
print(scores.mean())

0.6329768270944742
0.6329768270944742


In [87]:
knn_pipeline = Pipeline([
    ('preprocessing', ct), 
    ('knn', KNeighborsClassifier()) 
])


knn_param_grid = {
    'knn__n_neighbors': [3, 5, 10, 15],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}


knn_grid_search = GridSearchCV(knn_pipeline, knn_param_grid, cv=5, scoring='accuracy')

knn_grid_search.fit(X, y)

best_knn_model = knn_grid_search.best_estimator_

y_pred_knn = best_knn_model.predict(X)
y_proba_knn = best_knn_model.predict_proba(X)[:, 1]

print("Best KNN Parameters:", knn_grid_search.best_params_)
print("Model Accuracy:", accuracy_score(y, y_pred_knn))

Best KNN Parameters: {'knn__metric': 'euclidean', 'knn__n_neighbors': 3, 'knn__weights': 'uniform'}
Model Accuracy: 0.7692307692307693
Best KNN Parameters: {'knn__metric': 'euclidean', 'knn__n_neighbors': 3, 'knn__weights': 'uniform'}
Model Accuracy: 0.7692307692307693


In [90]:
knn_pipeline = Pipeline([
    ('preprocessing', ct),
    ('knn', KNeighborsClassifier(metric="euclidean", n_neighbors=3, weights="uniform"))
])

knn_pipeline.fit(X, y)

scores = abs(cross_val_score(knn_pipeline, X, y, cv=5, scoring='accuracy'))
print(scores.mean())

0.5853832442067737
0.5853832442067737


In [70]:
test_data = pd.read_csv("CAH-201803-test.csv")

In [144]:
final_predictions3 = pd.DataFrame(
    {"id_num": test_data['id_num'],
    "political_affiliation_predicted": logistic_pipeline2.predict(test_data)}
)


In [145]:
final_predictions3

Unnamed: 0,id_num,political_affiliation_predicted
0,2,Republican
1,3,Democrat
2,4,Independent
3,6,Independent
4,11,Independent
...,...,...
161,327,Democrat
162,330,Independent
163,331,Democrat
164,333,Democrat


In [146]:
final_predictions3.to_csv('final_predictions3.csv', index=False)


## Regression

In [81]:
train2 = pd.read_csv("train_new.csv")

In [82]:
train2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2197 entries, 0 to 2196
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SalePrice      2197 non-null   int64  
 1   PID            2197 non-null   int64  
 2   Lot Frontage   1835 non-null   float64
 3   Lot Area       2197 non-null   int64  
 4   Street         2197 non-null   object 
 5   Neighborhood   2197 non-null   object 
 6   Bldg Type      2197 non-null   object 
 7   House Style    2197 non-null   object 
 8   Overall Qual   2197 non-null   int64  
 9   Overall Cond   2197 non-null   int64  
 10  Year Built     2197 non-null   int64  
 11  Roof Style     2197 non-null   object 
 12  Heating        2197 non-null   object 
 13  Central Air    2197 non-null   object 
 14  Electrical     2196 non-null   object 
 15  Full Bath      2197 non-null   int64  
 16  Half Bath      2197 non-null   int64  
 17  Bedroom AbvGr  2197 non-null   int64  
 18  TotRms A

Lot Frontage has NAN values

In [89]:
X = train2.drop(['SalePrice', 'PID'], axis = 1)
y = train2['SalePrice']

# ChatGPT recommended that I use Imputer to fill in the NAN values in Lot Frontage
# I used ChatGPT to get this column transformer 
ct = ColumnTransformer([
  ('numeric', Pipeline([('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())
    ]), make_column_selector(dtype_include=np.number)),
    ('categorical', Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), make_column_selector(dtype_include=object))
])

elastic_pipeline = Pipeline([
    ('preprocessing', ct),
    ('elastic', ElasticNet())
])

param_grid = {
    'elastic__alpha': np.logspace(-3, 1, 10),  
    'elastic__l1_ratio': np.linspace(0, 1, 10) 
}

elastic_grid_search = GridSearchCV(elastic_pipeline, param_grid, cv=10, scoring='neg_root_mean_squared_error', n_jobs=-1)
elastic_grid_search.fit(X, y)

print("Best Elastic Regression Parameters:", elastic_grid_search.best_params_)
print("Best Elastic Regression Score:", elastic_grid_search.best_score_)

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(


Best Elastic Regression Parameters: {'elastic__alpha': 0.007742636826811269, 'elastic__l1_ratio': 0.7777777777777777}
Best Elastic Regression Score: -32758.46763068562
Best Elastic Regression Parameters: {'elastic__alpha': 0.007742636826811269, 'elastic__l1_ratio': 0.7777777777777777}
Best Elastic Regression Score: -32758.46763068562


In [87]:
elastic_pipeline = Pipeline([
    ('preprocessing', ct),
    ('elastic', ElasticNet(alpha=0.007742636826811269, l1_ratio=0.7777777777777777))
])

elastic_pipeline.fit(X, y)

scores = abs(cross_val_score(elastic_pipeline, X, y, cv=5, scoring='neg_root_mean_squared_error'))
print(scores.mean())

32977.55096688384
32977.55096688384


RSME is very high so ChatGPT recommended that I Log-Transform my target variable

In [94]:
X = train2.drop(['SalePrice', 'PID'], axis = 1)
# Log transforming SalePrice to reduce RSME used ChatGPT to get this log transformation
y = np.log(train2['SalePrice'])

ct2 = ColumnTransformer([
  ('numeric', Pipeline([('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())
    ]), make_column_selector(dtype_include=np.number)),
    ('categorical', Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]), make_column_selector(dtype_include=object))
])

elastic_pipeline2 = Pipeline([
    ('preprocessing', ct2),
    ('elastic', ElasticNet())
])

param_grid = {
    'elastic__alpha': np.logspace(-3, 1, 10),  
    'elastic__l1_ratio': np.linspace(0, 1, 10) 
}

elastic_grid_search2 = GridSearchCV(elastic_pipeline2, param_grid, cv=10, scoring='neg_root_mean_squared_error')

elastic_grid_search2.fit(X, y)


print("Best Elastic Regression Parameters:", elastic_grid_search2.best_params_)


Best Elastic Regression Parameters: {'elastic__alpha': 0.0027825594022071257, 'elastic__l1_ratio': 0.0}
Best Elastic Regression Parameters: {'elastic__alpha': 0.0027825594022071257, 'elastic__l1_ratio': 0.0}


In [96]:
elastic_pipeline2 = Pipeline([
    ('preprocessing', ct2),
    ('elastic', ElasticNet(alpha=0.0027825594022071257, l1_ratio=0.0))
])

elastic_pipeline2.fit(X_train, y_train)

scores = abs(cross_val_score(elastic_pipeline, X, y, cv=5, scoring='neg_root_mean_squared_error'))
print(scores.mean())

0.14865112983629009
0.14865112983629009


In [97]:
test2 = pd.read_csv("test_new.csv")

In [99]:
attempt1_model = elastic_pipeline2.fit(X, y)

X_test = test2.drop(columns=['PID'])
PID = test2['PID']

y_pred_log = attempt1_model.predict(X_test) 
# Exp to undo the log transformation
y_pred = np.exp(y_pred_log) 

results_df = pd.DataFrame({
    'PID': PID, 
    'SalePrice': y_pred
})

results_df.to_csv('attempt1_house_prices.csv', index=False)


605