In [0]:
%run "/housing prices project/classes and functions"

In [0]:
train_df = spark.read.format("delta").table("train_df")
test_df = spark.read.format("delta").table("test_df")

### process train_df for analysis

In [0]:
processer_train = preprocessing(train_df)
train_for_analysis = processer_train.analysis_transform()

In [0]:
train_for_analysis.write.mode("overwrite").format("delta").option("overwriteSchema", "true").saveAsTable("train_for_analysis")

### split dataframes into x and y

In [0]:
train_for_analysis_x = train_for_analysis.drop("expensive")
train_y = train_for_analysis.select("expensive")
test_df_x = test_df.drop("expensive")
test_y = test_df.select("expensive")

### final process of train_df

In [0]:
processer_train = preprocessing(train_for_analysis_x)
processed_train = processer_train.train_transform()

In [0]:
processed_train.write.mode("overwrite").format("delta").option("overwriteSchema", "true").saveAsTable("processed_train")

### final process of test_df

In [0]:
processer_test = preprocessing(test_df_x)
processed_test = processer_test.analysis_transform()
processer_test = preprocessing(processed_test)
processed_test = processer_test.test_transform()


In [0]:
processed_test.write.mode("overwrite").format("delta").option("overwriteSchema", "true").saveAsTable("processed_test")

In [0]:
X_train = processed_train.toPandas()
X_test = processed_test.toPandas()
y_train = train_y.toPandas()
y_test = test_y.toPandas()

In [0]:
test_df_x = test_df_x.toPandas()
train_df = train_df.toPandas()


### Data balancing

In [0]:
from imblearn.over_sampling import SMOTE
# adding more rows with expensive = 0
os = SMOTE(random_state=2023)
over = SMOTE(sampling_strategy=0.75)
X, y = over.fit_resample(X_train, y_train)

### Data Scaling

In [0]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.fit_transform(X_test)

In [0]:
y = np.ravel(y)

### sklearn's logistic regression results

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import roc_auc_score

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_scaled, y)
y_pred = logreg.predict(X_test_scaled)

accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
f1_score = metrics.f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred)

print(f"accuracy Score: {accuracy:.2f}")
print(f"precision Score: {precision:.2f}")
print(f"recall Score: {recall:.2f}")
print(f"f1 Score: {f1_score:.2f}")
print(f"AUC-ROC Score: {auc_roc:.2f}")



accuracy Score: 0.77
precision Score: 0.91
recall Score: 0.76
f1 Score: 0.83
AUC-ROC Score: 0.78


### My logistic regression results

In [0]:

logreg = MyLogisticRegression()
logreg.fit(X_scaled, y)
y_pred = logreg.predict(X_test_scaled, threshold = 0.5)

accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
f1_score = metrics.f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred)

print(f"accuracy Score: {accuracy:.2f}")
print(f"precision Score: {precision:.2f}")
print(f"recall Score: {recall:.2f}")
print(f"f1 Score: {f1_score:.2f}")
print(f"AUC-ROC Score: {auc_roc:.2f}")


accuracy Score: 0.69
precision Score: 0.89
recall Score: 0.66
f1 Score: 0.76
AUC-ROC Score: 0.72
