In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
house_price = pd.read_csv("Data/housing_iter_6/housing-classification-iter6.csv")
house_price

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,9600,80.0,1262,3,1,0,2,298,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,11250,68.0,920,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,9550,60.0,756,3,1,0,3,0,0,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1456,13175,85.0,1542,3,2,0,2,349,0,0,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1457,9042,66.0,1152,4,2,0,1,0,0,1,...,Attchd,RFn,TA,TA,Y,,GdPrv,Shed,WD,Normal
1458,9717,68.0,1078,2,0,0,1,366,0,0,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal


In [3]:
house_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   TotalBsmtSF    1460 non-null   int64  
 3   BedroomAbvGr   1460 non-null   int64  
 4   Fireplaces     1460 non-null   int64  
 5   PoolArea       1460 non-null   int64  
 6   GarageCars     1460 non-null   int64  
 7   WoodDeckSF     1460 non-null   int64  
 8   ScreenPorch    1460 non-null   int64  
 9   Expensive      1460 non-null   int64  
 10  MSZoning       1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Heating        1460 non-null   object 
 13  Street         1460 non-null   object 
 14  CentralAir     1460 non-null   object 
 15  Foundation     1460 non-null   object 
 16  ExterQual      1460 non-null   object 
 17  ExterCond      1460 non-null   object 
 18  BsmtQual

In [4]:
house_price.drop(columns = ["Street", "BsmtExposure", "FireplaceQu", "MSSubClass", "MasVnrArea", "MoSold",
                            "YrSold", "Id", "Alley", "LotShape", "LotConfig", "LandSlope", "RoofStyle", 
                            "RoofMatl", "MasVnrType", "PoolQC", "Fence", "MiscFeature", "SaleType"], inplace = True)
house_price

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,BsmtFinType2,HeatingQC,Electrical,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,0,...,Unf,Ex,SBrkr,Typ,Attchd,RFn,TA,TA,Y,Normal
1,9600,80.0,1262,3,1,0,2,298,0,0,...,Unf,Ex,SBrkr,Typ,Attchd,RFn,TA,TA,Y,Normal
2,11250,68.0,920,3,1,0,2,0,0,0,...,Unf,Ex,SBrkr,Typ,Attchd,RFn,TA,TA,Y,Normal
3,9550,60.0,756,3,1,0,3,0,0,0,...,Unf,Gd,SBrkr,Typ,Detchd,Unf,TA,TA,Y,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,0,...,Unf,Ex,SBrkr,Typ,Attchd,RFn,TA,TA,Y,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0,0,...,Unf,Ex,SBrkr,Typ,Attchd,RFn,TA,TA,Y,Normal
1456,13175,85.0,1542,3,2,0,2,349,0,0,...,Rec,TA,SBrkr,Min1,Attchd,Unf,TA,TA,Y,Normal
1457,9042,66.0,1152,4,2,0,1,0,0,1,...,Unf,Ex,SBrkr,Typ,Attchd,RFn,TA,TA,Y,Normal
1458,9717,68.0,1078,2,0,0,1,366,0,0,...,Rec,Gd,FuseA,Typ,Attchd,Unf,TA,TA,Y,Normal


In [5]:
house_price[["OverallQual", "OverallCond"]] = house_price[["OverallQual", "OverallCond"]].astype(str)

In [6]:
y = house_price.pop("Expensive")
X = house_price.copy()

In [7]:
# split the dataset into training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100)

In [10]:
# select categorical and numerical column names
X_num_col = X.select_dtypes(include = "number").copy().columns
X_cat_col = X.select_dtypes(exclude = "number").copy().columns

# define ordinal and onehot columns
onehot_col = X_cat_col.get_indexer(["MSZoning", "Condition1", "Heating", "CentralAir", "Foundation", 
                                    "LandContour", "Utilities", "Neighborhood", "Condition2", "BldgType", 
                                    "HouseStyle", "Exterior1st", "Exterior2nd", "GarageType", "GarageFinish",
                                    "PavedDrive", "SaleCondition"])
ord_col = X_cat_col.get_indexer(["ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "BsmtFinType1", 
                                 "KitchenQual", "OverallQual", "OverallCond", "BsmtFinType2", "HeatingQC", 
                                 "Electrical", "Functional", 'GarageQual', 'GarageCond'])

# define categories for all ordinal columns
exqual_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
excond_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
bsmtqual_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
bsmtcond_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
bsmtfin1_cat = ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "NA"]
kitqual_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
overqual_cat = ["10", "9", "8", "7", "6", "5", "4", "3", "2", "1", "NA"]
overcond_cat = ["10", "9", "8", "7", "6", "5", "4", "3", "2", "1", "NA"]
bsmtfin2_cat = ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "NA"]
heatqc_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
elec_cat = ["SBrkr", "FuseA", "FuseF", "FuseP", "Mix", "NA"] 
func_cat = ["Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal", "NA"]
garqual_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
garcond_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]

# define encoder for all categorial columns
categorical_encoder = make_column_transformer(
    (OneHotEncoder(handle_unknown = "ignore"), onehot_col),
    (OrdinalEncoder(categories = [exqual_cat, excond_cat, bsmtqual_cat, bsmtcond_cat, bsmtfin1_cat, 
                                  kitqual_cat, overqual_cat, overcond_cat, bsmtfin2_cat, heatqc_cat, 
                                  elec_cat, func_cat, garqual_cat, garcond_cat]), ord_col))

# create pipeline for all categorical columns
cat_pipeline = make_pipeline(SimpleImputer(strategy = "constant", fill_value = "NA"), categorical_encoder)

# create pipeline for all numerical columns
num_pipeline = make_pipeline(SimpleImputer(), StandardScaler())

# create pipeline for the entire preprocessing for all columns
preprocess = ColumnTransformer(transformers = [("num_pipeline", num_pipeline, X_num_col),
                                               ("cat_pipeline", cat_pipeline, X_cat_col)])

In [11]:
# create the full pipeline with decision tree
tree_pipeline = make_pipeline(preprocess, DecisionTreeClassifier())
tree_pipeline.fit(X_train, y_train)
tree_pipeline.predict(X_train)

array([0, 0, 0, ..., 1, 0, 1])

In [12]:
# define parameter grid and use grid search to find the best hyperparameter
tree_param_grid = {"columntransformer__num_pipeline__simpleimputer__strategy": ["mean", "median"],
                   "decisiontreeclassifier__max_depth": range(1, 10),
                   "decisiontreeclassifier__min_samples_leaf": range(5, 31, 5),
                   "decisiontreeclassifier__min_samples_split": range(3, 41, 5),
                   "decisiontreeclassifier__criterion": ["gini", "entropy"]
                  }
tree_search = GridSearchCV(tree_pipeline, tree_param_grid, cv = 5, verbose = 1)
tree_search.fit(X_train, y_train)

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num_pipeline',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer()),
                                                                                         ('standardscaler',
                                                                                          StandardScaler())]),
                                                                         Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'YearBuilt',
       'YearRemodAdd...
                                       ('decisiontreeclassifier',
                                        DecisionTreeClassifier())]),
            

In [13]:
tree_search.best_params_

{'columntransformer__num_pipeline__simpleimputer__strategy': 'mean',
 'decisiontreeclassifier__criterion': 'entropy',
 'decisiontreeclassifier__max_depth': 5,
 'decisiontreeclassifier__min_samples_leaf': 20,
 'decisiontreeclassifier__min_samples_split': 8}

In [14]:
tree_search.best_score_

0.9289461134954697

In [15]:
y_train_tree_pred = tree_search.predict(X_train)
accuracy_score(y_train, y_train_tree_pred)

0.9511986301369864

In [16]:
y_test_tree_pred = tree_search.predict(X_test)
accuracy_score(y_test, y_test_tree_pred)

0.9383561643835616

In [17]:
# create the full pipeline with knn algorithm
knn_pipeline = make_pipeline(preprocess, KNeighborsClassifier())
knn_pipeline.fit(X_train, y_train)
knn_pipeline.predict(X_train)

array([0, 0, 0, ..., 0, 0, 0])

In [18]:
# define parameter grid and use grid search to find the best hyperparameter
knn_param_grid = {"columntransformer__num_pipeline__simpleimputer__strategy": ["mean", "median"],
                  "kneighborsclassifier__n_neighbors": range(5, 70),
                  "kneighborsclassifier__weights": ["uniform", "distance"],
                  "kneighborsclassifier__leaf_size": range(25, 35)
             }
knn_search = GridSearchCV(knn_pipeline, knn_param_grid, cv = 5, verbose = 1)
knn_search.fit(X_train, y_train)

Fitting 5 folds for each of 2600 candidates, totalling 13000 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num_pipeline',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer()),
                                                                                         ('standardscaler',
                                                                                          StandardScaler())]),
                                                                         Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'YearBuilt',
       'YearRemodAdd...
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'SaleCondition'],
      dtype='object'))])),
                

In [19]:
knn_search.best_params_

{'columntransformer__num_pipeline__simpleimputer__strategy': 'median',
 'kneighborsclassifier__leaf_size': 25,
 'kneighborsclassifier__n_neighbors': 9,
 'kneighborsclassifier__weights': 'uniform'}

In [20]:
knn_search.best_score_

0.93920252375188

In [21]:
y_train_knn_pred = knn_search.predict(X_train)
accuracy_score(y_train, y_train_knn_pred)

0.946917808219178

In [22]:
y_test_knn_pred = knn_search.predict(X_test)
accuracy_score(y_test, y_test_knn_pred)

0.9383561643835616

In [23]:
# create the full pipeline with logistic regression algorithm
logreg_pipeline = make_pipeline(preprocess, LogisticRegression(max_iter = 5000))
logreg_pipeline.fit(X_train, y_train)
logreg_pipeline.predict(X_train)

array([0, 0, 0, ..., 1, 0, 0])

In [24]:
logreg_param_grid = {"columntransformer__num_pipeline__simpleimputer__strategy": ["mean", "median"],
                     "logisticregression__penalty": ["l2", "none"],
                     "logisticregression__C": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5],
                     # "logisticregression__solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
                    }
logreg_search = GridSearchCV(logreg_pipeline, logreg_param_grid, cv = 5, verbose = 1)
logreg_search.fit(X_train, y_train)

Fitting 5 folds for each of 44 candidates, totalling 220 fits




GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num_pipeline',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer()),
                                                                                         ('standardscaler',
                                                                                          StandardScaler())]),
                                                                         Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'YearBuilt',
       'YearRemodAdd...
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'SaleCondition'],
      dtype='object'))])),
                

In [25]:
logreg_search.best_params_

{'columntransformer__num_pipeline__simpleimputer__strategy': 'mean',
 'logisticregression__C': 0.5,
 'logisticregression__penalty': 'l2'}

In [26]:
logreg_search.best_score_

0.9520523825244854

In [27]:
y_train_logreg_pred = logreg_search.predict(X_train)
accuracy_score(y_train, y_train_logreg_pred)

0.9700342465753424

In [28]:
y_test_logreg_pred = logreg_search.predict(X_test)
accuracy_score(y_test, y_test_logreg_pred)

0.9554794520547946

In [29]:
# create the full pipeline with SVM algorithm
svm_pipeline = make_pipeline(preprocess, SVC())
svm_pipeline.fit(X_train, y_train)
svm_pipeline.predict(X_train)

array([0, 0, 0, ..., 1, 0, 0])

In [30]:
svm_param_grid = {"columntransformer__num_pipeline__simpleimputer__strategy": ["mean", "median"],
                  "svc__gamma": ["scale", "auto"],
                  "svc__C": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5],
                  "svc__kernel": ["linear", "poly", "rbf", "sigmoid"]
                 }
svm_search = GridSearchCV(svm_pipeline, svm_param_grid, cv = 5, verbose = 1)
svm_search.fit(X_train, y_train)

Fitting 5 folds for each of 176 candidates, totalling 880 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num_pipeline',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer()),
                                                                                         ('standardscaler',
                                                                                          StandardScaler())]),
                                                                         Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'YearBuilt',
       'YearRemodAdd...
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'SaleCondition'],
      dtype='object'))])),
                

In [31]:
svm_search.best_params_

{'columntransformer__num_pipeline__simpleimputer__strategy': 'mean',
 'svc__C': 0.7,
 'svc__gamma': 'scale',
 'svc__kernel': 'linear'}

In [32]:
svm_search.best_score_

0.9520523825244854

In [33]:
y_train_svm_pred = svm_search.predict(X_train)
accuracy_score(y_train, y_train_svm_pred)

0.976027397260274

In [34]:
y_test_svm_pred = svm_search.predict(X_test)
accuracy_score(y_test, y_test_svm_pred)

0.9383561643835616

In [35]:
# create the full pipeline with random forest classifier algorithm
forest_pipeline = make_pipeline(preprocess, RandomForestClassifier())
forest_pipeline.fit(X_train, y_train)
forest_pipeline.predict(X_train)

array([0, 0, 0, ..., 1, 0, 1])

In [73]:
forest_param_grid = {"columntransformer__num_pipeline__simpleimputer__strategy": ["mean", "median"],
                     "randomforestclassifier__n_estimators": range(10, 101, 10),
                     "randomforestclassifier__criterion": ["gini", "entropy"],
                     "randomforestclassifier__max_depth": range(1, 10),
                     # "randomforestclassifier__min_samples_leaf": range(5, 31, 5),
                     "randomforestclassifier__max_features": ["sqrt", "log2"],
                     "randomforestclassifier__bootstrap": [True, False]                    
                    }
forest_search = GridSearchCV(forest_pipeline, forest_param_grid, cv = 5, verbose = 1)
forest_search.fit(X_train, y_train)

Fitting 5 folds for each of 8640 candidates, totalling 43200 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num_pipeline',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer()),
                                                                                         ('standardscaler',
                                                                                          StandardScaler())]),
                                                                         Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'YearBuilt',
       'YearRemodAdd...
             param_grid={'columntransformer__num_pipeline__simpleimputer__strategy': ['mean',
                                                     

In [74]:
forest_search.best_params_

{'columntransformer__num_pipeline__simpleimputer__strategy': 'mean',
 'randomforestclassifier__bootstrap': True,
 'randomforestclassifier__criterion': 'entropy',
 'randomforestclassifier__max_depth': 8,
 'randomforestclassifier__max_features': 'sqrt',
 'randomforestclassifier__min_samples_leaf': 5,
 'randomforestclassifier__n_estimators': 30}

In [75]:
forest_search.best_score_

0.9494699387403249

In [76]:
y_train_forest_pred = forest_search.predict(X_train)
accuracy_score(y_train, y_train_forest_pred)

0.964041095890411

In [77]:
y_test_forest_pred = forest_search.predict(X_test)
accuracy_score(y_test, y_test_forest_pred)

0.9554794520547946

In [41]:
# read in the test file for the prediction
test = pd.read_csv("Data/classification_test.csv")
test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [42]:
test_cp = test.drop(columns = ["Street", "BsmtExposure", "FireplaceQu", "MSSubClass", "MasVnrArea", "MoSold",
                            "YrSold", "Id", "Alley", "LotShape", "LotConfig", "LandSlope", "RoofStyle", 
                            "RoofMatl", "MasVnrType", "PoolQC", "Fence", "MiscFeature", "SaleType"])
test_cp

Unnamed: 0,MSZoning,LotFrontage,LotArea,LandContour,Utilities,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,...,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,SaleCondition
0,RH,80.0,11622,Lvl,AllPub,NAmes,Feedr,Norm,1Fam,1Story,...,TA,Y,140,0,0,0,120,0,0,Normal
1,RL,81.0,14267,Lvl,AllPub,NAmes,Norm,Norm,1Fam,1Story,...,TA,Y,393,36,0,0,0,0,12500,Normal
2,RL,74.0,13830,Lvl,AllPub,Gilbert,Norm,Norm,1Fam,2Story,...,TA,Y,212,34,0,0,0,0,0,Normal
3,RL,78.0,9978,Lvl,AllPub,Gilbert,Norm,Norm,1Fam,2Story,...,TA,Y,360,36,0,0,0,0,0,Normal
4,RL,43.0,5005,HLS,AllPub,StoneBr,Norm,Norm,TwnhsE,1Story,...,TA,Y,0,82,0,0,144,0,0,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,RM,21.0,1936,Lvl,AllPub,MeadowV,Norm,Norm,Twnhs,2Story,...,,Y,0,0,0,0,0,0,0,Normal
1455,RM,21.0,1894,Lvl,AllPub,MeadowV,Norm,Norm,TwnhsE,2Story,...,TA,Y,0,24,0,0,0,0,0,Abnorml
1456,RL,160.0,20000,Lvl,AllPub,Mitchel,Norm,Norm,1Fam,1Story,...,TA,Y,474,0,0,0,0,0,0,Abnorml
1457,RL,62.0,10441,Lvl,AllPub,Mitchel,Norm,Norm,1Fam,SFoyer,...,,Y,80,32,0,0,0,0,700,Normal


In [43]:
test_cp[["OverallQual", "OverallCond"]] = test_cp[["OverallQual", "OverallCond"]].astype(str)

In [50]:
test_cp = test_cp[['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces', 'PoolArea', 
                   'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'MSZoning', 'Condition1', 'Heating', 
                   'CentralAir', 'Foundation', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 
                   'BsmtFinType1', 'KitchenQual', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
                   'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
                   'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'KitchenAbvGr', 
                   'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
                   'MiscVal', 'LandContour', 'Utilities', 'Neighborhood', 'Condition2', 'BldgType', 
                   'HouseStyle', 'Exterior1st', 'Exterior2nd', 'BsmtFinType2', 'HeatingQC', 'Electrical', 
                   'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 
                   'SaleCondition']]

In [52]:
test_cp.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,BsmtFinType2,HeatingQC,Electrical,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleCondition
0,11622,80.0,882.0,2,0,0,1.0,140,120,RH,...,LwQ,TA,SBrkr,Typ,Attchd,Unf,TA,TA,Y,Normal
1,14267,81.0,1329.0,3,0,0,1.0,393,0,RL,...,Unf,TA,SBrkr,Typ,Attchd,Unf,TA,TA,Y,Normal
2,13830,74.0,928.0,3,1,0,2.0,212,0,RL,...,Unf,Gd,SBrkr,Typ,Attchd,Fin,TA,TA,Y,Normal
3,9978,78.0,926.0,3,1,0,2.0,360,0,RL,...,Unf,Ex,SBrkr,Typ,Attchd,Fin,TA,TA,Y,Normal
4,5005,43.0,1280.0,2,0,0,2.0,0,144,RL,...,Unf,Ex,SBrkr,Typ,Attchd,RFn,TA,TA,Y,Normal


In [53]:
tree_test_pred = tree_search.predict(test_cp)
tree_test_pred

array([0, 0, 0, ..., 0, 0, 0])

In [54]:
tree_test = pd.DataFrame({"Id": test["Id"], "Expensive": tree_test_pred})
tree_test

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0
...,...,...
1454,2915,0
1455,2916,0
1456,2917,0
1457,2918,0


In [55]:
tree_test["Expensive"].value_counts()

0    1260
1     199
Name: Expensive, dtype: int64

In [56]:
tree_test.to_csv("Preston_Submission_1.csv", index = False)

In [57]:
knn_test_pred = knn_search.predict(test_cp)
knn_test_pred

array([0, 0, 0, ..., 0, 0, 0])

In [58]:
knn_test = pd.DataFrame({"Id": test["Id"], "Expensive": knn_test_pred})
knn_test

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0
...,...,...
1454,2915,0
1455,2916,0
1456,2917,0
1457,2918,0


In [59]:
knn_test["Expensive"].value_counts()

0    1289
1     170
Name: Expensive, dtype: int64

In [60]:
knn_test.to_csv("Preston_Submission_2.csv", index = False)

In [61]:
logreg_test_pred = logreg_search.predict(test_cp)
logreg_test_pred

array([0, 0, 0, ..., 0, 0, 0])

In [62]:
logreg_test = pd.DataFrame({"Id": test["Id"], "Expensive": logreg_test_pred})
logreg_test

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0
...,...,...
1454,2915,0
1455,2916,0
1456,2917,0
1457,2918,0


In [63]:
logreg_test["Expensive"].value_counts()

0    1255
1     204
Name: Expensive, dtype: int64

In [64]:
logreg_test.to_csv("Preston_Submission_3.csv", index = False)

In [65]:
svm_test_pred = svm_search.predict(test_cp)
svm_test_pred

array([0, 0, 0, ..., 0, 0, 0])

In [66]:
svm_test = pd.DataFrame({"Id": test["Id"], "Expensive": svm_test_pred})
svm_test

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0
...,...,...
1454,2915,0
1455,2916,0
1456,2917,0
1457,2918,0


In [67]:
svm_test["Expensive"].value_counts()

0    1244
1     215
Name: Expensive, dtype: int64

In [68]:
svm_test.to_csv("Preston_Submission_4.csv", index = False)

In [78]:
forest_test_pred = forest_search.predict(test_cp)
forest_test_pred

array([0, 0, 0, ..., 0, 0, 0])

In [79]:
forest_test = pd.DataFrame({"Id": test["Id"], "Expensive": forest_test_pred})
forest_test

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0
...,...,...
1454,2915,0
1455,2916,0
1456,2917,0
1457,2918,0


In [80]:
forest_test["Expensive"].value_counts()

0    1282
1     177
Name: Expensive, dtype: int64

In [81]:
forest_test.to_csv("Preston_Submission_7.csv", index = False)