In [48]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from statistics import mean
from sklearn.metrics import mean_squared_error

In [22]:
data = pd.read_csv('./ManualPreprocessedAmesHousing.csv')
data = data[['Gr Liv Area', '1st Flr SF', 'Garage Area', 'Overall Qual', 'Total Bsmt SF','SalePrice']]
X = data[['Gr Liv Area', '1st Flr SF', 'Garage Area', 'Overall Qual', 'Total Bsmt SF']]
y = data['SalePrice']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [24]:
linearModel = LinearRegression()
linearModel.fit(X_train, y_train)
print(linearModel.score(X_test, y_test))

0.7348711998606736


In [25]:
cross_val_scores_ridge = []
ridge_alpha = []
for i in range(1,9):
    ridgeModel = Ridge(alpha = i * 0.25)
    ridgeModel.fit(X_train, y_train)
    cross_val_scores_ridge.append(mean(cross_val_score(ridgeModel, X, y, cv=5)))
    ridge_alpha.append(i * 0.25)
ridge_max = 0
rideg_maxAlpha = 0
for i in range(len(ridge_alpha)):
    if cross_val_scores_ridge[i] > ridge_max:
        ridge_max = cross_val_scores_ridge[i]
        rideg_maxAlpha = ridge_alpha[i]
    print(ridge_alpha[i], ' : ', cross_val_scores_ridge[i])
print('Best alpha: ', rideg_maxAlpha , ' with score: ', ridge_max)


0.25  :  0.772569506885539
0.5  :  0.7725693432816655
0.75  :  0.772569177353646
1.0  :  0.7725690091029754
1.25  :  0.7725688385311469
1.5  :  0.7725686656396532
1.75  :  0.772568490429986
2.0  :  0.7725683129036358
Best alpha:  0.25  with score:  0.772569506885539


In [27]:
cross_val_scores_lasso = []
lasso_alpha = []
for i in range(1,9):
    lassoModel = Lasso(alpha = i * 0.25)
    lassoModel.fit(X_train, y_train)
    cross_val_scores_lasso.append(mean(cross_val_score(lassoModel, X, y, cv=5)))
    lasso_alpha.append(i * 0.25)
lasso_max = 0
lasso_maxAlpha = 0
for i in range(len(lasso_alpha)):
    if cross_val_scores_lasso[i] > lasso_max:
        lasso_max = cross_val_scores_lasso[i]
        lasso_maxAlpha = lasso_alpha[i]
    print(lasso_alpha[i], ' : ', cross_val_scores_lasso[i])
print('Best alpha: ', lasso_maxAlpha , ' with score: ', lasso_max)

0.25  :  0.7725400905967027
0.5  :  0.7724903879341928
0.75  :  0.7724205301831705
1.0  :  0.7723305672087802
1.25  :  0.7722204427702153
1.5  :  0.7720901904942878
1.75  :  0.771939801019731
2.0  :  0.7717692803581652
Best alpha:  0.25  with score:  0.7725400905967027


In [47]:
# Hyper parameter for ridge model alpha = 0.25 
# Hyper parameter for lasso model alpha = 0.25
dataset = pd.read_csv('./ManualPreprocessedAmesHousing.csv')
dataset = dataset[['Gr Liv Area', '1st Flr SF', 'Garage Area', 'Overall Qual', 'Total Bsmt SF','SalePrice']]
X = dataset[['Gr Liv Area', '1st Flr SF', 'Garage Area', 'Overall Qual', 'Total Bsmt SF']]
y = dataset['SalePrice']

In [53]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Linear Regression
linearModel = LinearRegression()
linearModel.fit(x_train, y_train)
# linear regression coefficients
print('Linear Regression Coefficients: ', linearModel.coef_ * 1000)
print('Linear Regression MSE: ', mean_squared_error(y_test, linearModel.predict(x_test)))
print('Linear Regression R2: ', linearModel.score(x_test, y_test))

# Ridge Regression
ridgeModel = Ridge(alpha = 0.25)
ridgeModel.fit(x_train, y_train)
print('Ridge Regression MSE: ', mean_squared_error(y_test, ridgeModel.predict(x_test)))
print('Ridge Regression R2: ', ridgeModel.score(x_test, y_test))

# Lasso Regression
lassoModel = Lasso(alpha = 0.25)
lassoModel.fit(x_train, y_train)
print('Lasso Regression MSE: ', mean_squared_error(y_test, lassoModel.predict(x_test)))
print('Lasso Regression R2: ', lassoModel.score(x_test, y_test))

Linear Regression Coefficients:  [4.49594928e+01 1.91467954e+01 5.36670731e+01 2.48194719e+04
 2.52324190e+01]
Linear Regression MSE:  1740.622558774348
Linear Regression R2:  0.7348711998606736
Ridge Regression MSE:  1740.6367419672715
Ridge Regression R2:  0.7348690394997711
Lasso Regression MSE:  1742.0091944400904
Lasso Regression R2:  0.7346599897689535


In [54]:
x_train.head()

Unnamed: 0,Gr Liv Area,1st Flr SF,Garage Area,Overall Qual,Total Bsmt SF
2073,2060,1164,521,6,1151
2754,1614,1614,865,7,1614
1282,1117,1117,264,5,1117
1572,2898,2898,665,8,1565
641,1866,1866,495,5,1866


$$ 45 \times Gr Liv Area + 19 \times 1st Flr SF + 54 \times Garage Area + 25 \times Overall Qual + 25 \times Total Bsmt SF$$

In [51]:
from sklearn.decomposition import PCA
x_train_pca = PCA(n_components=2).fit_transform(x_train)
x_test_pca = PCA(n_components=2).fit_transform(x_test)

# Linear Regression
linearModel = LinearRegression()
linearModel.fit(x_train_pca, y_train)
print('Linear Regression MSE: ', mean_squared_error(y_test, linearModel.predict(x_test_pca)))
print('Linear Regression R2: ', linearModel.score(x_test_pca, y_test))

# Ridge Regression
ridgeModel = Ridge(alpha = 0.25)
ridgeModel.fit(x_train_pca, y_train)
print('Ridge Regression MSE: ', mean_squared_error(y_test, ridgeModel.predict(x_test_pca)))
print('Ridge Regression R2: ', ridgeModel.score(x_test_pca, y_test))

# Lasso Regression
lassoModel = Lasso(alpha = 0.25)
lassoModel.fit(x_train_pca, y_train)
print('Lasso Regression MSE: ', mean_squared_error(y_test, lassoModel.predict(x_test_pca)))
print('Lasso Regression R2: ', lassoModel.score(x_test_pca, y_test))

Linear Regression MSE:  2972.3890403618366
Linear Regression R2:  0.5472505306530493
Ridge Regression MSE:  2972.38904004436
Ridge Regression R2:  0.5472505307014067
Lasso Regression MSE:  2972.3850198464374
Lasso Regression R2:  0.5472511430514226


# Decision Tree

In [45]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA

In [33]:
data = pd.read_csv('./ManualPreprocessedAmesHousingClassification.csv')
data = data[['Gr Liv Area', '1st Flr SF', 'Garage Area', 'Overall Qual', 'Total Bsmt SF','SalePrice']]
X = data[['Gr Liv Area', '1st Flr SF', 'Garage Area', 'Overall Qual', 'Total Bsmt SF']]
y = data['SalePrice']

In [43]:
print("Original Dataset: ")
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
dt = DecisionTreeClassifier(criterion="gini", max_depth=6,min_samples_leaf=11)
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# calculate the fpr and tpr for all thresholds of the classification
print("ROC_AUC score: ", roc_auc_score(y_test, dt.predict_proba(x_test), multi_class='ovr'))

Original Dataset: 
Accuracy: 0.6604095563139932
ROC_AUC score:  0.884023389385133


In [46]:
print("PCA Dataset: ")
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
pca_x_train = PCA(n_components=2).fit_transform(x_train)
pca_x_test = PCA(n_components=2).fit_transform(x_test)
dt = DecisionTreeClassifier(criterion="gini", max_depth=6,min_samples_leaf=11)
dt.fit(pca_x_train, y_train)
y_pred_pca = dt.predict(pca_x_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_pca))
# calculate the fpr and tpr for all thresholds of the classification
print("ROC_AUC score: ", roc_auc_score(y_test, dt.predict_proba(pca_x_test), multi_class='ovr'))

PCA Dataset: 
Accuracy: 0.568259385665529
ROC_AUC score:  0.8259740155589025
