In [1]:
from os import getcwd
from os.path import join

import numpy as np
import pandas as pd
from sklearn.ensemble import (
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    AdaBoostClassifier,
    AdaBoostRegressor,
)
from sklearn.linear_model import (
    Ridge,
    LinearRegression,
    RidgeClassifier,
    LogisticRegression,
    SGDClassifier,
    SGDRegressor,
)
from sklearn.metrics import (
    mean_absolute_error,
    f1_score,
    precision_score,
    accuracy_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import (
    MultinomialNB,
)
from sklearn.neighbors import (
    KNeighborsClassifier,
    KNeighborsRegressor,
)
from sklearn.neural_network import MLPRegressor
from sklearn.svm import (
    SVC,
    SVR,
)
from sklearn.tree import (
    DecisionTreeClassifier,
    DecisionTreeRegressor,
)

In [2]:
# The dataset path is resolved against the current working directory, so the
# notebook has to be started from a folder that has ../Datasets next to it.
pwd = getcwd()
file = join(pwd, '../Datasets/kc_house_data.csv')
# Read the whole CSV into a DataFrame; later cells derive the splits from it.
df = pd.read_csv(file)

In [3]:
# Preview the first rows of the raw data to check the columns that were loaded.
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [4]:
def process(
    df,
    test_size=.3,
    random_state=1,
    target='price',
    drop_columns=('id', 'date', 'zipcode', 'lat', 'long', 'yr_renovated'),
):
    """Drop unused columns and split the data into train/test features and targets.

    With the default arguments this produces the same split as the original
    hard-coded version (30% test rows, seed 1, 'price' as the target).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is not modified; ``drop`` returns a new frame.
    test_size : float, default .3
        Forwarded to ``train_test_split``.
    random_state : int, default 1
        Forwarded to ``train_test_split`` so the split is reproducible.
    target : str, default 'price'
        Name of the column to predict.
    drop_columns : iterable of str
        Columns removed before splitting.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test, Y_test)`` where the ``X_*`` items are the
        feature frames and the ``Y_*`` items are the ``target`` column.

    Raises
    ------
    KeyError
        If any of ``drop_columns`` or ``target`` is missing from ``df``.
    """
    # One drop call instead of six: a single copy of the frame is made.
    features = df.drop(columns=list(drop_columns))
    train, test = train_test_split(
        features,
        test_size=test_size,
        random_state=random_state,
    )
    X_train = train.drop(columns=target)
    Y_train = train[target]
    X_test = test.drop(columns=target)
    Y_test = test[target]
    return X_train, Y_train, X_test, Y_test

In [5]:
class NNARegressor:
    """Ensemble regressor that feeds base-model predictions into an MLP.

    ``fit`` trains a fixed set of scikit-learn regressors (Ridge, SVR,
    decision tree, gradient boosting, k-nearest neighbours and AdaBoost),
    turns their predictions into a DataFrame with one column per model, and
    trains an ``MLPRegressor`` on that frame. ``predict`` repeats the
    transform step and returns the MLP output.

    Attributes
    ----------
    algorithms : list of str
        Algorithm names declared on the class; the methods below do not read it.
    X_ : pandas.DataFrame or None
        Base-model predictions from the most recent ``transform`` call
        (``None`` until ``transform`` has run).

    NOTE(review): ``fit`` trains the MLP on predictions the base models make
    for the very rows they were fitted on. Models that can memorise the
    training data will look far more accurate to the MLP than they are on new
    data; consider out-of-fold predictions - confirm whether this is intended.
    """

    algorithms = [
        'linear_regression',
        'svm',
        'logistic',
        'decision_tree',
    ]
    # Class-level defaults so the attributes exist before fit/transform run.
    __mlp = None
    X_ = None

    def __init__(self):
        # Estimators are created per instance. Previously they lived in a
        # class-level dict, so every NNARegressor shared (and refitted) the
        # very same estimator objects.
        self.__models = self._build_models()

    @staticmethod
    def _build_models():
        """Return a fresh registry of unfitted estimators keyed by task and name."""
        return {
            'regression': {
                'linear_regression': Ridge(),
                'svm': SVR(),
                # LogisticRegression and MultinomialNB are classifiers, so
                # they are not part of the regression set.
                'decision_tree': DecisionTreeRegressor(),
                'gboost': GradientBoostingRegressor(),
                'knn': KNeighborsRegressor(),
                'adaboost': AdaBoostRegressor(),
            },
            # Not used by any method of this class; kept from the original.
            'classification': {
                'linear_regression': RidgeClassifier(),
                'svm': SVC(),
                'decision_tree': DecisionTreeClassifier(),
                'adaboost': AdaBoostClassifier(),
                'gboost': GradientBoostingClassifier(),
                'knn': KNeighborsClassifier(),
            },
        }

    def train(self, X, Y):
        """Fit every base regressor on ``(X, Y)``; returns ``None``."""
        for model in self.__models['regression'].values():
            # Estimators are fitted in place, so nothing needs to be stored back.
            model.fit(X, Y)

    def transform(self, X):
        """Return a DataFrame holding each base regressor's predictions for ``X``.

        Columns are named after the models, in registry order. The frame is
        also stored on ``self.X_``, replacing any earlier one.
        """
        predictions = pd.DataFrame({
            name: model.predict(X)
            for name, model in self.__models['regression'].items()
        })
        self.X_ = predictions
        return predictions

    def fit(self, X, Y):
        """Fit the base regressors, then the MLP on their predictions for ``X``.

        Returns the fitted ``MLPRegressor`` (not ``self``).
        """
        self.train(X, Y)
        stacked = self.transform(X)
        mlp = MLPRegressor(
            max_iter=500,
            random_state=1,
            hidden_layer_sizes=(10, 10),
            solver='adam',
        )
        self.__mlp = mlp.fit(stacked, Y)
        return self.__mlp

    def predict(self, X):
        """Predict targets for ``X``; ``fit`` must have been called first.

        Side effect: ``self.X_`` is overwritten with the base-model
        predictions for ``X``.
        """
        stacked = self.transform(X)
        return self.__mlp.predict(stacked)

    def fit_predict(self, X, Y):
        """Fit on ``(X, Y)`` and return the MLP predictions for the same ``X``."""
        self.fit(X, Y)
        # fit() already stored the base-model predictions for X in self.X_,
        # so reuse them rather than running every base model a second time.
        return self.__mlp.predict(self.X_)

In [6]:
class Bagging:
    """Unfinished ensemble stub: it only declares registries and has no methods.

    Attributes
    ----------
    algorithms : list of str
        Algorithm names declared on the class; nothing in the class reads it.
    """

    algorithms = [
        'linear_regression',
        'svm',
        'logistic',
        'decision_tree',
    ]
    # Private registry of unfitted estimators, keyed by task and then by name.
    # The task keys follow NNARegressor ('classification' / 'regression').
    __models = {
        'classification': {},
        'regression': {
            'linear_regression': Ridge(),
            'svm': SVR(),
            # LogisticRegression is a classifier, so it is not registered as
            # a regressor (NNARegressor leaves it out for the same reason).
            'decision_tree': DecisionTreeRegressor(),
        },
    }

In [7]:
# Instantiate the ensemble; nothing is fitted until fit/fit_predict is called.
nna = NNARegressor()

In [8]:
# Drop the unused columns and split into train/test features and price targets.
X_train, Y_train, X_test, Y_test = process(df)

In [9]:
# Preview the training features that remain after process() dropped columns.
X_train.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,sqft_living15,sqft_lot15
2060,3,2.0,1540,6250,1.0,0,0,3,7,1540,0,1998,2070,6250
11759,2,1.0,810,7158,1.0,0,0,5,6,810,0,1944,1090,7158
8274,3,1.0,860,10426,1.0,0,0,3,6,860,0,1954,1140,11250
21272,4,2.5,1750,4779,2.0,0,0,3,7,1750,0,2009,1580,4687
14328,3,1.0,1410,9656,1.0,0,0,3,7,960,450,1953,2410,9384


In [10]:
# Fit on the training split and predict that same split, so `predicted` holds
# in-sample predictions.
predicted = nna.fit_predict(X_train, Y_train)

In [11]:
def mape(y, y_p):
    """Return the mean absolute percentage error of ``y_p`` against ``y``.

    Computed as ``mean(|y - y_p| / |y|) * 100``. Values are paired by
    position, so a pandas index on either argument is ignored.

    Parameters
    ----------
    y : array-like
        Actual values.
    y_p : array-like
        Predicted values, with the same shape as ``y``.

    Returns
    -------
    float
        The error as a percentage.

    Raises
    ------
    ValueError
        If ``y`` and ``y_p`` differ in shape. The previous loop silently
        truncated to the shorter input while still dividing by ``len(y)``.
    ZeroDivisionError
        If ``y`` is empty or contains a zero, where the metric is undefined.
    """
    actual = np.asarray(y, dtype=float)
    forecast = np.asarray(y_p, dtype=float)
    if actual.shape != forecast.shape:
        raise ValueError(
            "y and y_p must have the same shape, got "
            f"{actual.shape} and {forecast.shape}"
        )
    if actual.size == 0:
        raise ZeroDivisionError("mape() needs at least one observation")
    if np.any(actual == 0):
        # Raise consistently instead of returning inf/nan for numpy input.
        raise ZeroDivisionError("mape() is undefined when y contains zeros")
    relative_error = np.abs(actual - forecast) / np.abs(actual)
    return float(np.mean(relative_error) * 100)

In [12]:
# Range of the training prices, used to normalise the absolute errors below.
diff = Y_train.max()-Y_train.min()
# Range-normalised MAE of the in-sample predictions (both arguments refer to
# the training split).
# NOTE(review): X_test / Y_test are never evaluated in the cells shown here.
print(mean_absolute_error(predicted, Y_train)/diff)

0.0003670550607262772


In [13]:
# Mean absolute percentage error of the same in-sample predictions.
print(mape(Y_train, predicted))

0.5435440148016111


In [14]:
# Range-normalised MAE of each base model on its own. nna.X_ holds the
# base-model predictions from the most recent transform call, compared here
# against the training prices.
for col in nna.X_.columns.tolist():
    print(col, mean_absolute_error(nna.X_[col], Y_train)/diff)

linear_regression 0.02500536506078671
svm 0.039888905378970205
decision_tree 0.00015499284858136315
gboost 0.020984838155970548
knn 0.023223449505619585
adaboost 0.0473705046061242
