In [1]:
import pandas as pd

from os import getcwd
from os.path import join
from sklearn.linear_model import (
    Ridge,
    LinearRegression,
    RidgeClassifier,
    LogisticRegression,
    SGDClassifier,
    SGDRegressor,
    PassiveAggressiveClassifier,
    PassiveAggressiveRegressor,
)
from sklearn.gaussian_process import (
    GaussianProcessClassifier,
    GaussianProcessRegressor,
)
from sklearn.neighbors import (
    KNeighborsClassifier,
    KNeighborsRegressor,
)
from sklearn.naive_bayes import (
    MultinomialNB,
)
from sklearn.metrics import (
    mean_absolute_error,
    f1_score,
    precision_score,
    accuracy_score,
    recall_score,
    r2_score,
)
from sklearn.svm import (
    SVC,
    SVR,
)
from sklearn.tree import (
    DecisionTreeClassifier,
    DecisionTreeRegressor,
)
from sklearn.ensemble import (
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    AdaBoostClassifier,
    AdaBoostRegressor,
    RandomForestRegressor,
    RandomForestClassifier,
)
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from timeit import timeit, default_timer
from sklearn.preprocessing import (
    MinMaxScaler,
    StandardScaler,
)
from sklearn.decomposition import PCA

In [2]:
# Resolve the CSV path relative to the current working directory and load it
# into `df`, the raw table that the later cells work from.
pwd = getcwd()
file = join(pwd, '../Datasets/kc_house_data.csv')
df = pd.read_csv(file)

In [3]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [4]:
# (rows, columns) of the raw table.
df.shape

(21613, 21)

In [5]:
def process(df):
    """Prepare the raw table for modelling and split it into train/test parts.

    The identifier, date, location and renovation-year columns are removed,
    the remaining rows are split 70/30 with a fixed seed, and ``price`` is
    separated out as the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table; must contain ``price`` and every column listed in
        ``unused_columns`` below.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test, Y_test)`` where the ``X_*`` items are
        feature frames without ``price`` and the ``Y_*`` items are the
        matching ``price`` columns.
    """
    unused_columns = ['id', 'date', 'zipcode', 'lat', 'long', 'yr_renovated']
    modelling_table = df.drop(columns=unused_columns)

    # Fixed random_state keeps the split identical between runs.
    train_part, test_part = train_test_split(
        modelling_table,
        test_size=.3,
        random_state=1,
    )

    return (
        train_part.drop(columns='price'),
        train_part['price'],
        test_part.drop(columns='price'),
        test_part['price'],
    )

In [6]:
class NNARegressor:
    """Two-stage (stacked) regressor: base regressors feed an MLP meta-model.

    ``fit`` trains every enabled base regressor on the input features, turns
    their predictions into a new table (one column per base model) and fits
    an ``MLPRegressor`` on that table.  ``predict`` rebuilds the table for new
    rows and queries the meta-model.

    NOTE(review): the meta-model is fitted on predictions that the base
    models make for the same rows they were trained on, so in-sample scores
    can be optimistic; out-of-fold predictions would avoid that.
    """

    # Algorithm names; not read by any method of this class.
    algorithms = [
        'linear_regression',
        'svm',
        'logistic',
        'decision_tree',
    ]

    # Template estimators grouped by task.  They are never fitted directly:
    # every instance works on its own clones (see __init__), so two
    # NNARegressor objects do not overwrite each other's fitted models.
    __models = {
        'classification': {
            'linear_regression': RidgeClassifier(),
            'svm': SVC(),
            'decision_tree': DecisionTreeClassifier(),
            'adaboost': AdaBoostClassifier(),
            'gboost': GradientBoostingClassifier(),
            'knn': KNeighborsClassifier(),
        },
        'regression': {
            # Disabled candidates are kept for reference; uncomment to add
            # them to the stack.
            #'linear_regression': Ridge(),
            #'svm': SVR(),
            'decision_tree': DecisionTreeRegressor(),
            #'gpr': GaussianProcessRegressor(),
            #'gboost': GradientBoostingRegressor(),
            #'knn': KNeighborsRegressor(),
            #'paa': PassiveAggressiveRegressor(),
            #'rf': RandomForestRegressor(),
            #'adaboost': AdaBoostRegressor(),
            #'mlp': MLPRegressor(solver='lbfgs', activation='relu', early_stopping=True,)
        },
    }
    __model = None  # fitted meta-model; None until fit() has run
    X_ = None       # base-model prediction table from the latest transform()

    # Preprocessing helpers; no method of this class calls them at present.
    scaler = MinMaxScaler()
    standardizer = StandardScaler()
    pca = PCA(n_components=3)

    def __init__(self):
        """Give the instance private, unfitted copies of the template estimators."""
        # Imported locally because the notebook's import cell does not bring
        # `clone` into scope.
        from sklearn.base import clone

        self.__models = {
            task: {name: clone(estimator) for name, estimator in group.items()}
            for task, group in type(self).__models.items()
        }
        self.__model = None
        self.X_ = None

    @staticmethod
    def _as_array(X):
        """Return X as a plain array: DataFrames are converted, others pass through."""
        return X.to_numpy() if hasattr(X, 'to_numpy') else X

    def train(self, X, Y):
        """Fit every base regressor on (X, Y) and print each one's fit time in seconds."""
        features = self._as_array(X)
        base_models = self.__models['regression']
        for name, model in base_models.items():
            start = default_timer()
            base_models[name] = model.fit(features, Y)
            end = default_timer()
            print(name, end - start)

    def transform(self, X):
        """Return a DataFrame with one column of predictions per base regressor.

        The table is also stored on ``self.X_`` so it can be inspected later.
        """
        # Same array conversion as in train(), so the base models see the
        # input in the form they were fitted on.
        features = self._as_array(X)
        stacked = pd.DataFrame({
            name: model.predict(features)
            for name, model in self.__models['regression'].items()
        })
        self.X_ = stacked
        return stacked

    def fit(self, X, Y):
        """Fit the base regressors, then the MLP meta-model on their predictions.

        Returns the fitted ``MLPRegressor``.
        """
        self.train(X, Y)
        stacked = self.transform(X)
        mlp = MLPRegressor(
            max_iter=500,
            random_state=1,
            # Second hidden layer is as wide as the number of input columns.
            hidden_layer_sizes=(100, X.shape[1]),
            solver='lbfgs',
            activation='relu',
        )
        self.__model = mlp.fit(stacked, Y)
        return self.__model

    def predict(self, X):
        """Predict targets for X by passing base-model predictions to the meta-model."""
        return self.__model.predict(self.transform(X))

    def fit_predict(self, X, Y):
        """Fit on (X, Y) and return the predictions for the same X."""
        self.fit(X, Y)
        return self.predict(X)

In [7]:
class Bagging:
    """Estimator tables for an ensemble.

    Only the tables are defined; the class has no fitting or prediction
    logic yet.
    """

    # Algorithm names; not read anywhere in this class.
    algorithms = [
        'linear_regression',
        'svm',
        'logistic',
        'decision_tree',
    ]

    # Estimators grouped by task, using the same task keys as
    # NNARegressor ('classification' / 'regression').
    __models = {
        'classification': {
            # LogisticRegression is a classifier and rejects continuous
            # targets, so it belongs here rather than under 'regression'.
            'logistic': LogisticRegression(),
        },
        'regression': {
            'linear_regression': Ridge(),
            'svm': SVR(),
            'decision_tree': DecisionTreeRegressor(),
        },
    }

In [8]:
# Create the stacked regressor defined above.
nna = NNARegressor()

In [9]:
# Drop the unused columns and split the raw table into train/test features and price targets.
X_train, Y_train, X_test, Y_test = process(df)

In [10]:
# Preview the training features left after preprocessing.
X_train.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,sqft_living15,sqft_lot15
2060,3,2.0,1540,6250,1.0,0,0,3,7,1540,0,1998,2070,6250
11759,2,1.0,810,7158,1.0,0,0,5,6,810,0,1944,1090,7158
8274,3,1.0,860,10426,1.0,0,0,3,6,860,0,1954,1140,11250
21272,4,2.5,1750,4779,2.0,0,0,3,7,1750,0,2009,1580,4687
14328,3,1.0,1410,9656,1.0,0,0,3,7,960,450,1953,2410,9384


In [11]:
# (rows, columns) of the training feature table.
print(X_train.shape)

(15129, 14)


In [12]:
# Fit the stacked model and predict on the same training rows, so `predicted`
# holds in-sample predictions (X_test / Y_test are not used here).
predicted = nna.fit_predict(X_train, Y_train)

decision_tree 0.12509710000000007


In [13]:
def mape(y, y_p):
    """Mean absolute percentage error between actual and predicted values.

    Parameters
    ----------
    y : sized iterable of numbers
        Actual values; must support ``len()``.
    y_p : iterable of numbers
        Predicted values, paired with ``y`` position by position.

    Returns
    -------
    number
        ``100 * mean(|y - y_p| / |y|)``.

    Raises
    ------
    ZeroDivisionError
        If ``y`` is empty.  A zero inside ``y`` is not guarded either: plain
        Python numbers raise this error, other numeric types may behave
        differently.
    ValueError
        If ``y_p`` yields fewer values than ``y`` contains.
    """
    n_obs = len(y)
    if n_obs == 0:
        raise ZeroDivisionError("mape() needs at least one observation")

    # Divide by the magnitude of the actual value so that negative actuals
    # still contribute a non-negative error.
    ratios = [abs(actual - guess) / abs(actual) for actual, guess in zip(y, y_p)]

    # zip() stops at the shorter input; averaging a truncated sum over
    # len(y) would silently understate the error.
    if len(ratios) != n_obs:
        raise ValueError(
            "mape() got %d predictions for %d actual values" % (len(ratios), n_obs)
        )

    return sum(ratios) / n_obs * 100

In [14]:
# Percentage error of the stacked model, measured on the training rows it was fitted on.
print(mape(Y_train, predicted))

0.278063572928572


In [15]:
# Percentage error of each base model on the training rows; nna.X_ holds the
# base-model predictions from the most recent transform() call.
for col in nna.X_.columns.tolist():
    print(col, mape(Y_train, nna.X_[col]))

decision_tree 0.2780093347555692
