In [53]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.preprocessing import StandardScaler

In [54]:
df=pd.read_csv("C:/Users/Cemil Turhan/Desktop/MachineLearning_FinalProject/Numerical_train.csv", index_col=0)

In [55]:
target = [0 if price <= 120000 else 1 if price <= 200000 else 2 for price in df.SalePrice]
df = df.drop(labels = 'SalePrice', axis=1).astype("float64")

In [56]:
X_train, X_test, y_train, y_test = train_test_split(df, target)

# Hyperparameter Tuning for Classification

### Grid Search for Random Forest

In [None]:
rf=RandomForestClassifier()
n_estimators = [100, 300, 500, 700, 800, 900, 1000, 1200, 1350, 1500]
max_depth = [2, 5, 10, 15, 20, 25, 30, 40, 50]
min_samples_split = [1, 2, 4, 5, 8, 10]
min_samples_leaf = [2, 3, 4, 5, 8, 10]
param_grid = dict(n_estimators = n_estimators,
                  max_depth = max_depth,
                  min_samples_split = min_samples_split,
                  min_samples_leaf = min_samples_leaf)
rf_cv = GridSearchCV(estimator = rf, param_grid = param_grid, cv=5)
rf_cv.fit(X_train, y_train)
rf_cv.best_params_

In [6]:
best_params_rf={'max_depth': 20,
 'min_samples_leaf': 2,
 'min_samples_split': 8,
 'n_estimators': 100}

### Grid Search for Support Vector Machine

In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [13]:
C = [0.00001, 0.0001, 0.001,  0.01, 0.1, 1, 5, 10]
gamma = [0.0001,  0.001,  0.01, 0.1, 1, 5, 10]
kernel=["linear", "rbf", "poly","sigmoid"]
param_grid = dict(C = C,
                  gamma = gamma, kernel=kernel)
svm_cv = GridSearchCV(estimator = svm, param_grid = param_grid, cv=3)
svm_cv.fit(X_train_scaled, y_train)
svm_cv.best_params_

{'C': 0.01, 'gamma': 0.0001, 'kernel': 'linear'}

In [14]:
best_params_svm={'C': 0.01, 'gamma': 0.0001, 'kernel': 'linear'}

###  RF and SVM Models with best parameters

In [58]:
rf = RandomForestClassifier(max_depth=20, min_samples_leaf=2, min_samples_split=8, n_estimators=100)
svm= SVC(C=0.01, gamma=0.0001, kernel="linear")

In [42]:
def clf_models():
    rf = RandomForestClassifier(max_depth=20, min_samples_leaf=2, min_samples_split=8, n_estimators=100)
    svm= SVC(C=0.01, gamma=0.0001, kernel="linear")
    return rf, svm

In [43]:
for name, model in zip(['rf', 'svm'],
                       [*clf_models()]):
    if name == 'rf':
        pipeline = Pipeline([('scaler', None), ('estimator', model)])
    else:
        pipeline = Pipeline([('scaler', StandardScaler()), ('estimator', model)])
    scores = cross_validate(pipeline, X_train, y_train, cv=3, scoring="accuracy")
    
    print(name, 'accuracy', scores['test_score'].mean())
    print('\n')

rf accuracy 0.8418611069296


svm accuracy 0.8555747905062975




In [60]:
rf.fit(X_train, y_train)
svm.fit(X_train, y_train)

SVC(C=0.01, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [62]:
print(rf.score(X_test, y_test))
print(svm.score(X_test, y_test))

0.8547945205479452
0.8465753424657534


### Now we'll do RF and SVM models on selected features by GBR before

In [63]:
selected_data=df[['MSZoning', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'ExterQual', 'BsmtQual',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtUnfSF',
       'TotalBsmtSF', 'CentralAir', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
       'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces', 'GarageFinish',
       'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'Neighborhood_Crawfor']]

In [64]:
selected_data.head()

Unnamed: 0,MSZoning,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,BsmtQual,...,2ndFlrSF,GrLivArea,KitchenQual,TotRmsAbvGrd,Fireplaces,GarageFinish,GarageArea,WoodDeckSF,OpenPorchSF,Neighborhood_Crawfor
0,3.0,65.0,8450.0,7.0,5.0,2003.0,2003.0,196.0,4.0,4.0,...,854.0,1710.0,4.0,8.0,0.0,2.0,548.0,0.0,61.0,0.0
1,3.0,80.0,9600.0,6.0,8.0,1976.0,1976.0,0.0,3.0,4.0,...,0.0,1262.0,3.0,6.0,1.0,2.0,460.0,298.0,0.0,0.0
2,3.0,68.0,11250.0,7.0,5.0,2001.0,2002.0,162.0,4.0,4.0,...,866.0,1786.0,4.0,6.0,1.0,2.0,608.0,0.0,42.0,0.0
3,3.0,60.0,9550.0,7.0,5.0,1915.0,1970.0,0.0,3.0,3.0,...,756.0,1717.0,4.0,7.0,1.0,1.0,642.0,0.0,35.0,1.0
4,3.0,84.0,14260.0,8.0,5.0,2000.0,2000.0,350.0,4.0,4.0,...,1053.0,2198.0,4.0,9.0,1.0,2.0,836.0,192.0,84.0,0.0


In [65]:
X_train, X_test, y_train, y_test = train_test_split(selected_data, target)

In [66]:
for name, model in zip(['rf', 'svm'],
                       [*clf_models()]):
    if name == 'rf':
        pipeline = Pipeline([('scaler', None), ('estimator', model)])
    else:
        pipeline = Pipeline([('scaler', StandardScaler()), ('estimator', model)])
    scores = cross_validate(pipeline, X_train, y_train, cv=3, scoring="accuracy")
    
    print(name, 'accuracy', scores['test_score'].mean())
    print('\n')

rf accuracy 0.841881178182548


svm accuracy 0.8565030859551408




In [67]:
rf.fit(X_train, y_train)
svm.fit(X_train, y_train)

SVC(C=0.01, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [69]:
print(rf.score(X_test, y_test))
print(svm.score(X_test, y_test))

0.8164383561643835
0.8164383561643835
