In [18]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

import pandas.plotting as pd_plot
import pandas as pd
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

from sklearn.ensemble import RandomForestRegressor
import sklearn.model_selection as ModelSelection
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold, cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
def align_expand_and_fill(df_a, df_b):
    """Return a copy of ``df_a`` whose columns match ``df_b`` exactly.

    Columns of ``df_a`` that do not exist in ``df_b`` are dropped, columns of
    ``df_b`` that are missing from ``df_a`` are added and filled with ``0``,
    and the result is ordered like ``df_b.columns``. ``df_a`` itself is not
    modified.

    Args:
        df_a: Frame to reshape.
        df_b: Frame whose column labels and order define the target layout.

    Returns:
        A new DataFrame with the rows of ``df_a`` and the columns of ``df_b``.
    """
    # A single reindex replaces the former column-by-column assignment on a
    # slice of df_a, which raised SettingWithCopyWarning and fragmented the
    # frame when many columns were missing.
    return df_a.reindex(columns=df_b.columns, fill_value=0)
    
# Importing local modules is a pain, so the base code is located here.
class GenericTrainDataset:
    """Turn a raw DataFrame into a scaled, fully numeric feature matrix.

    The pipeline is: median-impute numeric columns, one-hot encode the
    remaining columns, then standardise every resulting column.
    """

    def __init__(self, df):
        # The wrapped frame; ``prepare`` derives new frames from it.
        self._df = df

    def _split_numeric(self, df):
        """Return ``(numeric, non_numeric)`` column subsets of ``df``."""
        numbers = df.select_dtypes(include=['number'])
        others = df.select_dtypes(exclude=['number'])
        return numbers, others

    def _impute_nums(self, df):
        """Replace missing numeric values with the column median.

        Non-numeric columns are passed through unchanged and come first in
        the returned frame.
        """
        numbers, others = self._split_numeric(df)
        filled = SimpleImputer(strategy="median").fit_transform(numbers)
        # The imputer returns a bare array, so labels are restored by hand.
        filled_frame = pd.DataFrame(
            filled, columns=numbers.columns, index=numbers.index
        )
        return pd.concat([others, filled_frame], axis=1)

    def _prepare_non_nums(self, df):
        """One-hot encode the non-numeric columns, keeping numeric ones first."""
        numbers, others = self._split_numeric(df)
        encoded = pd.get_dummies(others)
        return pd.concat([numbers, encoded], axis=1)

    def _scale_features(self, df):
        """Standardise every column with a scaler fitted on ``df`` itself."""
        scaled = StandardScaler().fit_transform(df.copy())
        return pd.DataFrame(scaled, columns=df.columns, index=df.index)

    def prepare(self, target, test):
        """Run the full preparation pipeline on the wrapped frame.

        Args:
            target: Name of the label column; only used when ``test`` is falsy.
            test: When truthy the frame is treated as unlabeled and only the
                scaled features are returned.

        Returns:
            ``features`` when ``test`` is truthy, otherwise the tuple
            ``(features, labels)`` where ``labels`` is the unscaled
            ``target`` column.
        """
        prepared = self._prepare_non_nums(self._impute_nums(self._df))
        if not test:
            labels = prepared[target]
            features = self._scale_features(prepared.drop(columns=[target]))
            return features, labels
        return self._scale_features(prepared)


class Data:
    """Load the train/test CSV files and expose exploration and model frames.

    Both files go through the same pre-processing: the ``Id`` column is
    dropped and ``YearBuilt`` is replaced by a derived ``Age`` column.
    """

    # Label column used by the correlation helpers and the frame builders.
    _TARGET = "SalePrice"

    def __init__(self, train_path="./data/train.csv", test_path="./data/test.csv"):
        """Read and pre-process both CSV files.

        Args:
            train_path: Path of the labeled training CSV.
            test_path: Path of the unlabeled CSV to predict on.
        """
        raw_data = pd.read_csv(train_path)
        self._final_test = self.__prepare(pd.read_csv(test_path))
        self._train = self.__prepare(raw_data)

    def __prepare(self, rawDf):
        """Return a pre-processed copy of ``rawDf``; the input is left intact."""
        df = rawDf.copy()
        df = df.drop(columns=["Id"])
        return self.__gen_data(df)

    def __gen_data(self, df):
        """Replace ``YearBuilt`` with ``Age``.

        ``Age`` is measured against the newest ``YearBuilt`` found in ``df``
        itself, so two frames processed separately may use different
        reference years. ``df`` gains the ``Age`` column in place; callers
        pass a copy.
        """
        df["Age"] = df["YearBuilt"].max() - df["YearBuilt"]
        return df.drop(columns=["YearBuilt"])

    def _abs_target_correlation(self):
        """Return |correlation| of each numeric column with the target, descending."""
        corr_matrix = self._train.corr(numeric_only=True)
        return corr_matrix[self._TARGET].abs().sort_values(ascending=False)

    def _info(self):
        """Print the structure and summary statistics of the training frame."""
        print(self._train.info())
        print(self._train.describe())

    def _show(self):
        """Plot a histogram for every numeric training column."""
        self._train.hist(bins=50, figsize=(20, 15))
        plt.show()

    def _correlation(self):
        """Print the absolute correlation of every numeric column with the target."""
        print("================correlation(abs)================\n\n")
        print(self._abs_target_correlation())
        print("================correlation(abs)================\n\n")

    def _get_most_correlated_attrs(self, top_n=10):
        """Return the ``top_n`` column names most correlated with the target.

        The target column itself is part of the ranking, so it is included
        in the result.
        """
        return self._abs_target_correlation().head(top_n).index.tolist()

    def _scatter(self):
        """Draw a scatter matrix of the most correlated columns."""
        print("================top 10 correlated(abs) features scatter pl================\n\n")
        attrs = self._get_most_correlated_attrs()
        pd_plot.scatter_matrix(self._train[attrs], figsize=(20, 10))

    def describe(self):
        """Print and plot the full exploratory report for the training frame."""
        self._info()
        self._show()
        self._correlation()
        self._scatter()

    def make_train_frame(self):
        """Return ``(features, labels)`` built from the training frame."""
        dataset = GenericTrainDataset(self._train)
        features, labels = dataset.prepare(self._TARGET, False)
        return features, labels

    def make_test_frame(self):
        """Return the prepared feature frame built from the test file."""
        dataset = GenericTrainDataset(self._final_test)
        return dataset.prepare(self._TARGET, True)


class Model:
    """Train, evaluate and run regressors on frames supplied by ``data``.

    ``data`` must provide ``make_train_frame()`` returning
    ``(features, labels)`` and ``make_test_frame()`` returning the features
    to predict on.
    """

    # Data should not contain NaN's
    def __init__(self, data):
        self._data = data

    def test_prediction(self, RegressorModel, regressor_params=None, first_id=1461):
        """Fit on the full training frame and predict the test frame.

        Args:
            RegressorModel: Estimator class with ``fit``/``predict``.
            regressor_params: Keyword arguments for the estimator. Defaults
                to ``{"l1_ratio": 0.2}``, the previously hard-coded
                ElasticNet setting; pass ``{}`` for estimators that do not
                accept ``l1_ratio``.
            first_id: Value written to ``Id`` for the first predicted row;
                following rows count up from it.

        Returns:
            DataFrame with the columns ``SalePrice`` and ``Id``.
        """
        if regressor_params is None:
            regressor_params = {"l1_ratio": 0.2}
        featured_df, labels = self._data.make_train_frame()
        reg = RegressorModel(**regressor_params)
        reg.fit(featured_df, labels)
        featured_test_df = self._data.make_test_frame()

        # One-hot encoding can yield different columns for train and test;
        # the test frame is forced into the training layout before predicting.
        ds = align_expand_and_fill(featured_test_df, featured_df).sort_index()
        submission = pd.DataFrame({"SalePrice": reg.predict(ds)})
        # Ids are rebuilt from the row position of each prediction.
        submission["Id"] = submission.index + first_id
        return submission

    def simple_train_and_cv(self, RegressorModel):
        """Fit ``RegressorModel`` with default settings and print its errors.

        Prints the RMSE measured on the same rows the model was fitted on,
        followed by the cross-validated errors.
        """
        featured_df, labels = self._data.make_train_frame()
        reg = RegressorModel()
        reg.fit(featured_df, labels)

        print("================{}================\n\n".format(RegressorModel))
        predictions = reg.predict(featured_df)
        rmse = np.sqrt(mean_squared_error(labels, predictions))
        # This score is computed on the fitting data, so it is labeled as a
        # training error rather than a validation error.
        print("RMSE on training set: {}".format(rmse))
        self._cross_validate(reg)
        print("\n\n================{}================\n\n".format(RegressorModel))

    @staticmethod
    def _neg_mse_to_rmse(scores):
        """Convert ``neg_mean_squared_error`` scores into RMSE values."""
        # The scorer reports -MSE, so the sign is flipped before the root.
        return np.sqrt(-np.asarray(scores, dtype=float))

    def _cross_validate(self, reg, n_splits=20, n_bins=10):
        """Print the cross-validated RMSE of ``reg``.

        The continuous labels are cut into ``n_bins`` equal-width bins so
        that the folds can be stratified on them.

        Args:
            reg: Estimator instance handed to ``cross_val_score``.
            n_splits: Number of folds.
            n_bins: Number of label bins used for stratification.
        """
        featured_df, labels = self._data.make_train_frame()
        category_bins = pd.cut(labels, bins=n_bins, labels=False)
        skf = StratifiedKFold(n_splits=n_splits)
        folds = skf.split(featured_df, category_bins)
        scores = cross_val_score(
            reg, featured_df, labels, scoring="neg_mean_squared_error", cv=folds
        )
        self._display_scores(self._neg_mse_to_rmse(scores))

    def _display_scores(self, scores):
        """Print the per-fold errors with their mean and standard deviation."""
        print("\nCV errors(lower is better):", scores)
        print("CV errors Mean(lower is better):", scores.mean())
        print("CV errors Standard deviation(lower is better): {} \n".format(scores.std()))
        
# End-to-end run: explore the data, evaluate three regressors, then write
# the ElasticNet predictions to result.csv.
data = Data()
data.describe()
model = Model(data)
for regressor_cls in (ElasticNet, DecisionTreeRegressor, RandomForestRegressor):
    model.simple_train_and_cv(regressor_cls)
print("start")
submission = model.test_prediction(ElasticNet)
submission.to_csv('result.csv', index=False)
print("ok")