In [38]:
import os
import scipy
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

%matplotlib inline
np.random.seed(42)

<h1>Importing the Dataset:

In [2]:
%%time
wakeCounty = pd.read_csv("WakeCountyHousing.csv")
wakeCounty = wakeCounty[['Real_Estate_Id','Deeded_Acreage','Year_of_Sale','Physical_Zip',
                         'Year_Built','Physical_City','Total_Sale_Date','Month_Year_of_Sale',
                         'Year_Remodeled','Heated_Area','Num_Stories','Design_Style','Bath',
                         'Utilities','Total_Sale_Price']]
wakeCounty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308292 entries, 0 to 308291
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Real_Estate_Id      308292 non-null  int64  
 1   Deeded_Acreage      308292 non-null  float64
 2   Year_of_Sale        308292 non-null  int64  
 3   Physical_Zip        308146 non-null  float64
 4   Year_Built          308292 non-null  int64  
 5   Physical_City       308183 non-null  object 
 6   Total_Sale_Date     308292 non-null  object 
 7   Month_Year_of_Sale  308292 non-null  object 
 8   Year_Remodeled      308292 non-null  int64  
 9   Heated_Area         308292 non-null  int64  
 10  Num_Stories         308292 non-null  object 
 11  Design_Style        308292 non-null  object 
 12  Bath                308275 non-null  object 
 13  Utilities           306324 non-null  object 
 14  Total_Sale_Price    308292 non-null  int64  
dtypes: float64(2), int64(6), object(7)

<h1>Cleaning the Data and Data Preparation:

In [3]:
#Dropping all the columns that have NaN as a part of them
wakeCounty = wakeCounty.dropna()
wakeCounty.head()

Unnamed: 0,Real_Estate_Id,Deeded_Acreage,Year_of_Sale,Physical_Zip,Year_Built,Physical_City,Total_Sale_Date,Month_Year_of_Sale,Year_Remodeled,Heated_Area,Num_Stories,Design_Style,Bath,Utilities,Total_Sale_Price
0,19,0.21,1974,27610.0,1964,Raleigh,1/1/1974,January 1974,1964,1828,One Story,Split level,2 Bath,ALL,34500
1,20,0.46,1983,27610.0,1970,Raleigh,5/18/1983,May 1983,1970,1240,One Story,Conventional,1 Bath,E,35500
2,22,0.46,2004,27591.0,1900,Wendell,9/16/2004,September 2004,1900,2261,One Story,Conventional,2 Bath,WSE,37500
3,25,0.96,1971,27613.0,1971,Raleigh,1/1/1971,January 1971,1971,3770,One Story,Conventional,Other,WGE,70000
4,30,0.47,2015,27607.0,1946,Raleigh,8/12/2015,August 2015,2017,1789,One Story,Conventional,2 Bath,ALL,380000


In [4]:
wakeCounty = wakeCounty.drop(['Month_Year_of_Sale','Total_Sale_Date','Design_Style',
                              'Real_Estate_Id','Physical_City','Total_Sale_Date','Month_Year_of_Sale',
                              'Heated_Area','Design_Style','Utilities'], axis=1)

In [5]:
wakeCounty.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 306161 entries, 0 to 308291
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Deeded_Acreage    306161 non-null  float64
 1   Year_of_Sale      306161 non-null  int64  
 2   Physical_Zip      306161 non-null  float64
 3   Year_Built        306161 non-null  int64  
 4   Year_Remodeled    306161 non-null  int64  
 5   Num_Stories       306161 non-null  object 
 6   Bath              306161 non-null  object 
 7   Total_Sale_Price  306161 non-null  int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 21.0+ MB


In [6]:
wakeCounty['Num_Stories'] = wakeCounty['Num_Stories'].replace(['One Story'],1)
wakeCounty['Num_Stories'] = wakeCounty['Num_Stories'].replace(['Two Story'],2)
wakeCounty['Num_Stories'] = wakeCounty['Num_Stories'].replace(['Other'],0)
wakeCounty['Bath'] = wakeCounty['Bath'].replace(['2 Bath'],2)
wakeCounty['Bath'] = wakeCounty['Bath'].replace(['1 Bath'],1)
wakeCounty['Bath'] = wakeCounty['Bath'].replace(['Other'],0)
wakeCounty['Bath'] = wakeCounty['Bath'].replace(['3 Bath'],3)
wakeCounty['Bath'] = wakeCounty['Bath'].replace(['3½ Bath'],3.5)
wakeCounty['Bath'] = wakeCounty['Bath'].replace(['1 ½ Bath'],1.5)
wakeCounty['Bath'] = wakeCounty['Bath'].replace(['2½ Bath'],2.5)

In [7]:
def categorise(row):  
    if row['Year_Built'] == row['Year_Remodeled']:
        return "NO"
    else:
        return "YES"
    
wakeCounty['Is_Remodeled'] = wakeCounty.apply(lambda row: categorise(row), axis=1)

remodeled_cat = wakeCounty["Is_Remodeled"]
remodeled_cat = np.array(remodeled_cat).reshape(-1, 1)
ordinal_encoder = OrdinalEncoder()
remodeled_cat_encoded = ordinal_encoder.fit_transform(remodeled_cat)
remodeled_cat_encoded[:10]

array([[0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]])

<h1>Splitting into Test and Training Data:

In [8]:
X_train, X_test, y_train, y_test = train_test_split(wakeCounty.drop(['Is_Remodeled'], axis=1), 
                                                    remodeled_cat_encoded, test_size=0.3, random_state=42)
X_train = X_train[:1000]
y_train = y_train[:1000]
X_test = X_test[:400]
y_test = y_test[:400]

<h1>SGD Classification of the Data:

In [9]:
clf = SGDClassifier(random_state=42, max_iter=100)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy: ",accuracy_score(y_test, y_pred))

Accuracy:  0.9825


  return f(**kwargs)


<h1>Random Forest Classification:

In [10]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)
print("Accuracy: ",accuracy_score(y_test, y_pred))

  rnd_clf.fit(X_train, y_train)


Accuracy:  0.9825


<h1>K Neighbors Classifier:

In [14]:
features = wakeCounty.drop(['Is_Remodeled','Year_Built','Year_Remodeled'], axis=1)
label = wakeCounty['Is_Remodeled']
features

Unnamed: 0,Deeded_Acreage,Year_of_Sale,Physical_Zip,Num_Stories,Bath,Total_Sale_Price
0,0.21,1974,27610.0,1,2.0,34500
1,0.46,1983,27610.0,1,1.0,35500
2,0.46,2004,27591.0,1,2.0,37500
3,0.96,1971,27613.0,1,0.0,70000
4,0.47,2015,27607.0,1,2.0,380000
...,...,...,...,...,...,...
308287,0.19,2021,27540.0,0,3.0,440000
308288,0.19,2021,27540.0,1,3.5,464000
308289,0.19,2021,27540.0,0,3.0,431000
308290,4.42,2021,27614.0,1,1.0,100000


In [15]:
model = KNeighborsClassifier(n_neighbors=9)

# Train the model using the training sets
model.fit(features,label)

#Predict Output
predicted= model.predict([[.23,2000,27591.0,2,3.0, 400000]])
print(predicted)


['NO']


<h1>Using the OvO Classifier:

In [16]:
ovo_clf = OneVsOneClassifier(LinearSVC(random_state=0)).fit(X_train, y_train)
ovo_clf.predict(X_test[:10])

  return f(**kwargs)


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [29]:
wakeCounty.Total_Sale_Price.describe()

count    3.061610e+05
mean     2.732519e+05
std      1.897399e+05
min      0.000000e+00
25%      1.500000e+05
50%      2.350000e+05
75%      3.500000e+05
max      6.100200e+06
Name: Total_Sale_Price, dtype: float64

In [33]:
def findCat(row):
    if row['Total_Sale_Price'] <= 100000:
        return "<100000"
    elif row['Total_Sale_Price'] <= 200000:
        return "100000-200000"
    elif row['Total_Sale_Price'] <= 300000:
        return "200000-300000"
    elif row['Total_Sale_Price'] <= 400000:
        return "300000-400000"
    else:
        return ">400000"
wakeCounty['Total_Sale_Price_Cat'] = wakeCounty.apply(lambda row: findCat(row), axis=1)

In [36]:
labels = wakeCounty['Total_Sale_Price_Cat']
params = wakeCounty.drop(['Total_Sale_Price_Cat','Total_Sale_Price'], axis=1)


In [41]:
params.drop(['Is_Remodeled'], axis=1, inplace=True)

In [45]:
model = RandomForestClassifier(n_estimators=20)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
cv_results = cross_val_score(model, params, labels, cv=kfold, scoring='accuracy')

In [46]:
cv_results.mean()

0.7089635873290395