In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the competition's train and test CSVs (paths are relative to the notebook).
trainRaw, testRaw = (
    pd.read_csv(csvPath)
    for csvPath in (
        "../input/house-price-prediction-challenge/train.csv",
        "../input/house-price-prediction-challenge/test.csv",
    )
)

In [None]:
# Peek at the first three rows of the raw training data.
trainRaw.head(3)

In [None]:
# Column dtypes of the raw training data.
trainRaw.dtypes

In [None]:
# Summary statistics of the raw training columns.
trainRaw.describe()

Initial thoughts on columns:
* posted_by -- categorical, Owner/Dealer/?. May be useful, to be encoded
* under_construction -- binary, no feature engineering needed
* rera -- binary, "rera approval", no feature engineering needed
* bhk_no -- room count, no feature engineering needed
* square_ft -- area, no feature engineering needed
* ready_to_move -- binary, no feature engineering needed
* resale -- binary, no feature engineering needed
* address -- text, probably of no use
* lon/lat -- could be useful but would probably require some feature engineering first? Visualisation desired
* target -- obviously the value to predict

In [None]:
from functools import reduce


def oheCol(colN, pref):
    """Build a pipe step that one-hot encodes column `colN`.

    The returned function takes a DataFrame and returns a new frame in which
    `colN` is removed and its `pd.get_dummies` indicator columns (prefixed
    with `pref`, or named after the bare category values when `pref` is None)
    are appended after the remaining columns.
    """
    def encode(origDf):
        remaining = origDf.drop(colN, axis='columns')
        dummies = pd.get_dummies(origDf[colN], prefix=pref)
        return pd.concat([remaining, dummies], axis='columns')
    return encode


def drpCol(colN):
    """Build a pipe step that returns the frame without column(s) `colN`."""
    def dropIt(origDf):
        return origDf.drop(colN, axis='columns')
    return dropIt


def renCol(renD):
    """Build a pipe step that renames columns according to the mapping `renD`."""
    def renameIt(origDf):
        return origDf.rename(columns=renD)
    return renameIt


def applyPipes(sourceDf, pipes):
    """Feed `sourceDf` through every step in `pipes`, in order, and return the result."""
    currentDf = sourceDf
    for step in pipes:
        currentDf = currentDf.pipe(step)
    return currentDf

In [None]:
# First preparation stage: one-hot encode the two categorical columns,
# drop the free-text address and give the price column a short name.
prepPipes = [
    oheCol('POSTED_BY', None),
    oheCol('BHK_OR_RK', None),
    drpCol('ADDRESS'),
    renCol({'TARGET(PRICE_IN_LACS)': 'target'}),
]
trainPrep = trainRaw.pipe(applyPipes, prepPipes)

In [None]:
# Pairwise correlations between all prepared columns.
corrM = trainPrep.corr()
# Correlation of every column with the price target, in ascending order.
print(corrM['target'].sort_values())

In [None]:
# Visualise the full correlation matrix computed above.
sns.heatmap(corrM)

The heatmap mostly shows internal consistency -- ready to move or resale negatively correlates with under construction, etc.

Taken in isolation, best features would be square feet, not-resale, room count, sold-by-builder, not-sold-by-owner

In [None]:
# Scatter the two coordinate columns against each other (LATITUDE on x, LONGITUDE on y).
sns.scatterplot(x=trainPrep['LATITUDE'], y=trainPrep['LONGITUDE'])

The outline of India is quite obvious in the plot. We may want to train a clustering model to derive location features from the coordinates.

In [None]:
from sklearn.cluster import KMeans

# Number of geographic clusters to derive from the coordinate columns.
geoClusters = 20
# Fixed seed: k-means++ initialisation is random, and the resulting cluster ids
# become one-hot features for every later model (and the submission), so the
# clustering has to be reproducible across Restart & Run All.
clusteringMod = KMeans(n_clusters=geoClusters, init='k-means++', random_state=0)
geoX = trainPrep.loc[:, ['LATITUDE', 'LONGITUDE']]
# Fit on the training coordinates and attach the cluster label for plotting.
trainPrepGc = trainPrep.assign(geoCluster=clusteringMod.fit_predict(geoX))
sns.scatterplot(x=trainPrep['LATITUDE'], y=trainPrep['LONGITUDE'], hue=trainPrepGc['geoCluster'])

In [None]:
def applyModel(model, targetCol, inputCols):
    """Build a pipe step that stores `model`'s predictions in column `targetCol`.

    The returned function takes a DataFrame, calls `model.predict` on its
    `inputCols` columns and returns a new frame with the predictions assigned
    to `targetCol`; the input frame is not modified.
    """
    def addPrediction(df):
        features = df.loc[:, inputCols]
        return df.assign(**{targetCol: model.predict(features)})
    return addPrediction
# Second preparation stage: label each row with its geo cluster (using the
# already-fitted clustering model), one-hot encode that label, then drop the
# raw coordinate columns.
# NOTE(review): get_dummies joins prefix and value with '_', so the prefix
# 'geoCluster_' produces column names like 'geoCluster__3' (double underscore).
# Presumably unintended but harmless -- confirm before renaming.
prepPipes2 = [applyModel(clusteringMod, 'geoCluster', ['LATITUDE', 'LONGITUDE']),
              oheCol('geoCluster', 'geoCluster_'),
              drpCol('LATITUDE'), drpCol('LONGITUDE')]
trainPrep2 = applyPipes(trainPrep, prepPipes2)

In [None]:
# Correlation of each geo-cluster indicator column with the price target.
geoColsCorr = [colName for colName in trainPrep2.columns if colName.startswith('geo')] + ['target']
trainPrep2[geoColsCorr].corr()['target']

In [None]:
# Model inputs: every prepared column except the price; model output: the price.
y = trainPrep2['target']
X = trainPrep2.drop(columns='target')

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score

# Scorer name shared by the cross-validation helper and the grid searches.
scoringStrategy = 'neg_mean_absolute_error'

def evalModel(model):
    """Cross-validate a bare estimator (no preprocessing step) and return its fold scores."""
    return evalModelPipeline(Pipeline([('model', model)]))

def evalModelPreproc(preproc, model):
    """Cross-validate `model` behind the `preproc` transformer and return the fold scores."""
    steps = [('preproc', preproc), ('model', model)]
    return evalModelPipeline(Pipeline(steps=steps))

def evalModelPipeline(pipeline):
    """Run 5-fold cross-validation of `pipeline` on the notebook-level X and y.

    Scoring uses `scoringStrategy`. The scores are multiplied by -1 so that,
    with a 'neg_*' scorer, the printed and returned values are positive errors.
    Returns the array of per-fold scores.
    """
    # TODO: also evaluate with 'neg_mean_squared_log_error', the function from the competition
    foldScores = cross_val_score(pipeline, X, y, cv=5, scoring=scoringStrategy)
    scores = -1 * foldScores
    print(scores)
    return scores

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler


# Standard-scale room count and area; every other column is passed through untouched.
stdScaler = StandardScaler()
scalerPreproc = ColumnTransformer(
    transformers=[('scaler', stdScaler, ['BHK_NO.', 'SQUARE_FT'])],
    remainder='passthrough',
)

# Same layout, but min-max scaling for the two numeric columns.
mmScaler = MinMaxScaler()
scalerMMPreproc = ColumnTransformer(
    transformers=[('scaler', mmScaler, ['BHK_NO.', 'SQUARE_FT'])],
    remainder='passthrough',
)

# No explicitly listed columns: the remainder (i.e. every column) is standard-scaled.
allScaler = StandardScaler()
scalerAllPreproc = ColumnTransformer(transformers=[], remainder=allScaler)

In [None]:
from sklearn.dummy import DummyRegressor
# Baseline: cross-validated error of a DummyRegressor with default settings.
evalModel(DummyRegressor())

In [None]:
from sklearn.linear_model import LinearRegression
# Plain linear regression on the unscaled features.
evalModel(LinearRegression())

In [None]:
# Linear regression with room count and area standard-scaled.
evalModelPreproc(scalerPreproc, LinearRegression())

In [None]:
# Linear regression with room count and area min-max scaled.
evalModelPreproc(scalerMMPreproc, LinearRegression())

In [None]:
from sklearn.compose import TransformedTargetRegressor
# Min-max scaled inputs again, this time with the target min-max scaled as well
# via TransformedTargetRegressor.
evalModelPreproc(scalerMMPreproc, TransformedTargetRegressor(regressor=LinearRegression(), transformer=MinMaxScaler()))

In [None]:
# HuberRegressor was used without ever being imported, which raises NameError
# on a fresh kernel; import it here next to its only use.
from sklearn.linear_model import HuberRegressor

# Outlier-robust linear model: all features standard-scaled, target min-max scaled.
evalModelPreproc(allScaler, TransformedTargetRegressor(regressor=HuberRegressor(), transformer = MinMaxScaler()))

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with default hyperparameters on the unscaled features.
evalModel(RandomForestRegressor())

In [None]:
# Random forest behind the standard-scaling preprocessor, for comparison with the unscaled run.
evalModelPreproc(scalerPreproc, RandomForestRegressor())

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid-search the forest size, scored with the notebook-wide scoring strategy.
randomForestParams = {'n_estimators': [40, 160, 320]}
gsRandomForest = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=randomForestParams,
    scoring=scoringStrategy,
)
gsRandomForest.fit(X, y)

# Mean cross-validated score per candidate, then the winning setting.
print(gsRandomForest.cv_results_['mean_test_score'])
print(gsRandomForest.best_params_)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting with default hyperparameters on the unscaled features.
evalModel(GradientBoostingRegressor())

In [None]:
# Gradient boosting with room count and area standard-scaled.
evalModelPreproc(scalerPreproc, GradientBoostingRegressor())

In [None]:
# Gradient boosting with room count and area min-max scaled.
evalModelPreproc(scalerMMPreproc, GradientBoostingRegressor())

In [None]:
# Grid-search ensemble size, learning rate and tree depth for gradient boosting.
gradientBoostingParams = {
    "n_estimators": [10, 100, 200],
    "learning_rate": [0.01, 0.1, 1.0],
    "max_depth": [1, 3, 10],
}
gsGradientBoosting = GridSearchCV(
    estimator=GradientBoostingRegressor(),
    param_grid=gradientBoostingParams,
    scoring=scoringStrategy,
)
gsGradientBoosting.fit(X, y)

# Mean cross-validated score per candidate, then the winning combination.
print(gsGradientBoosting.cv_results_['mean_test_score'])
print(gsGradientBoosting.best_params_)

In [None]:
# Fit a 320-tree random forest on the full prepared training set; this is the
# model whose predictions are written to the submission file.
# NOTE(review): n_estimators is hard-coded rather than read from
# gsRandomForest.best_params_ -- confirm 320 is the intended setting.
perhapsGoodModel = RandomForestRegressor(n_estimators = 320)
perhapsGoodModel.fit(X, y)

In [None]:
# Run the raw test set through both preparation stages used for training.
# get_dummies only creates indicator columns for the categories present in the
# frame it is given, so the test frame can end up with missing, extra or
# reordered columns relative to the training features. Align it to X's columns
# (absent indicators become 0) so predict() sees exactly the fitted layout.
testPrep2 = (
    applyPipes(testRaw, prepPipes+prepPipes2)
    .reindex(columns=X.columns, fill_value=0)
)
perhapsOutput = perhapsGoodModel.predict(testPrep2)

In [None]:
# Write the predictions as a single-column CSV (no index) under the original target column name.
pd.Series(perhapsOutput).to_csv("sub01-naive_randomForest.csv", header = ["TARGET(PRICE_IN_LACS)"], index = False)

In [None]:
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_graphviz
from sklearn.metrics import mean_absolute_error
import graphviz
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.model_selection import train_test_split
from IPython.core.display import display

def plotTree():
    """Fit a shallow regression tree on a 10% split of (X, y) and explain it.

    Prints the mean absolute error on the held-out 90%, displays eli5
    permutation importances computed on that held-out part, and returns a
    graphviz.Source rendering of the fitted tree.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.9)
    treeModel = DecisionTreeRegressor(max_depth = 5, min_samples_leaf = 4)
    treeModel.fit(X_train, y_train)

    heldOutMae = mean_absolute_error(y_test, treeModel.predict(X_test))
    print(heldOutMae)

    perm = PermutationImportance(treeModel).fit(X_test, y_test)
    display(eli5.show_weights(perm, feature_names = X.columns.tolist()))

    dotSource = export_graphviz(treeModel, out_file=None, feature_names=X.columns)
    return graphviz.Source(dotSource)
plotTree()

In [None]:
def getSplitSizeStats(test_size, model):
    """Fit `model` on one random split of (X, y) and report its errors.

    `test_size` is forwarded to train_test_split. The model is fitted in place
    on the training part. Returns `[test_size, train MAE, test MAE]`.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size)
    model.fit(X_train, y_train)
    trainMae = mean_absolute_error(y_train, model.predict(X_train))
    testMae = mean_absolute_error(y_test, model.predict(X_test))
    return [test_size, trainMae, testMae]
# Sweep the test fraction from 1% to 90% for a shallow decision tree.
[getSplitSizeStats(testFrac, DecisionTreeRegressor(min_samples_leaf = 4, max_depth = 5)) for testFrac in np.linspace(0.01, 0.9, 20)]

In [None]:
# Same test-fraction sweep for a plain linear regression.
[getSplitSizeStats(j, LinearRegression()) for j in np.linspace(0.01, 0.9, 20)]

In [None]:
# Repeat a 20% test split 40 times and summarise the spread of train/test MAE.
pd.DataFrame([getSplitSizeStats(j, LinearRegression()) for j in [0.2]*40]).describe()

In [None]:
# Test-fraction sweep for linear regression behind the standard-scaling preprocessor.
p = Pipeline(steps = [('preproc', scalerPreproc), ('model', LinearRegression())])
[getSplitSizeStats(j, p) for j in np.linspace(0.01, 0.9, 20)]

In [None]:
# Joint density of area vs. price, restricted to SQUARE_FT < 1500 and target < 400.
sns.kdeplot(data = trainPrep2.query('SQUARE_FT < 1500 & target < 400'), x='SQUARE_FT', y = 'target')

In [None]:
# Distribution of SQUARE_FT for rows below 3000.
sns.kdeplot(data = trainPrep2.query('SQUARE_FT < 3000'), x='SQUARE_FT')

In [None]:
# Price against area on a 10% sample of rows with SQUARE_FT < 3000 and target < 2000.
sns.lineplot(data = trainPrep2.query('SQUARE_FT < 3000 & target < 2000').sample(frac=0.1), x='SQUARE_FT', y = 'target')

In [None]:
# Same view for the 3000 < SQUARE_FT < 6000 band (10% sample).
sns.lineplot(data = trainPrep2.query('SQUARE_FT > 3000 & SQUARE_FT < 6000').sample(frac=0.1), x='SQUARE_FT', y = 'target')

In [None]:
# Rows with modest area (SQUARE_FT < 3000) but a very high price (target > 2000).
sns.lineplot(data = trainPrep2.query('SQUARE_FT < 3000 & target > 2000'), x='SQUARE_FT', y = 'target')

In [None]:
# Distribution of the target for rows below 600.
sns.kdeplot(data = trainPrep2.query('target < 600'), x='target')

In [None]:
# Distribution of the target on a log10 scale.
sns.kdeplot(data = np.log10(trainPrep2.target))

In [None]:
# Summary statistics for rows with target < 1000.
trainPrep2.query('target < 1000').describe()

In [None]:
# Summary statistics for rows with target >= 1000.
trainPrep2.query('target >= 1000').describe()

In [None]:
treeModel = RandomForestRegressor(n_estimators = 320)
# Train on a random 40% of the rows and evaluate on a 10%-sized sample drawn
# only from the rows NOT used for training. Two independent .sample() calls
# overlap, which leaks training rows into the evaluation and flatters both the
# MAE and the permutation importances.
# (Relies on trainPrep2 having unique index labels, as the default index
# produced by read_csv does.)
train = trainPrep2.sample(frac=0.4)
test = trainPrep2.drop(train.index).sample(n=round(0.1 * len(trainPrep2)))
X_train, y_train = train.drop('target', axis = 'columns'), train.target
X_test, y_test = test.drop('target', axis = 'columns'), test.target
treeModel.fit(X_train, y_train)
print(mean_absolute_error(y_test, treeModel.predict(X_test)))
perm = PermutationImportance(treeModel).fit(X_test, y_test)
# Name the features after the frame that was actually scored.
display(eli5.show_weights(perm, feature_names = X_test.columns.tolist()))

In [None]:
linModel = LinearRegression()
# Both sides of the split are enriched with expensive rows (target > 1000).
# Previously train and test were built from independent .sample() calls, so the
# same rows -- in particular about half of the expensive ones -- appeared on
# both sides and the test MAE was contaminated by training data.
# Here the expensive rows are first split into two disjoint halves, and the
# general 40% / 10% samples are drawn from rows not already on the other side,
# so train and test share no row while keeping the original sizes.
# (Relies on trainPrep2 having unique index labels.)
expensiveRows = trainPrep2.query('target > 1000')
expTrainRows = expensiveRows.sample(frac=0.5)
expTestRows = expensiveRows.drop(expTrainRows.index)
nRows = len(trainPrep2)
trainBase = trainPrep2.drop(expTestRows.index).sample(n=round(0.4 * nRows))
testBase = (
    trainPrep2
    .drop(trainBase.index.union(expTrainRows.index))
    .sample(n=round(0.1 * nRows))
)
train = pd.concat([trainBase, expTrainRows])
test = pd.concat([testBase, expTestRows])
X_train, y_train = train.drop('target', axis = 'columns'), train.target
X_test, y_test = test.drop('target', axis = 'columns'), test.target
linModel.fit(X_train, y_train)
# Test error first, then training error for comparison.
print(mean_absolute_error(y_test, linModel.predict(X_test)))
print(mean_absolute_error(y_train, linModel.predict(X_train)))

In [None]:
# Flag rows priced above 1000 as "expensive" (1) and all others as 0.
trainPrep3 = trainPrep2.assign(isExp=trainPrep2['target'].gt(1000).astype(int))
trainPrep3['isExp'].describe()

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz
from sklearn.metrics import f1_score
def plotTree2(data):
    """Fit a shallow tree classifier for the `isExp` flag and explain it.

    `data` must contain the columns `target` and `isExp`; all other columns
    are used as features. Both sides of the split are enriched with rows whose
    target exceeds 1000.

    Train and test are built to be disjoint: the expensive rows are split into
    two non-overlapping halves, and the general 40% / 10% samples are drawn
    from rows not already assigned to the other side. (Independent .sample()
    calls would put the same rows on both sides and inflate the test F1.)
    Relies on `data` having unique index labels.

    Prints the F1 score on test and on train, displays eli5 permutation
    importances computed on the test part, and returns a graphviz.Source
    rendering of the fitted tree.
    """
    treeModel = DecisionTreeClassifier(min_samples_leaf = 4, max_depth = 5)

    expensiveRows = data.query('target > 1000')
    expTrainRows = expensiveRows.sample(frac=0.5)
    expTestRows = expensiveRows.drop(expTrainRows.index)
    nRows = len(data)
    trainBase = data.drop(expTestRows.index).sample(n=round(0.4 * nRows))
    testBase = (
        data
        .drop(trainBase.index.union(expTrainRows.index))
        .sample(n=round(0.1 * nRows))
    )
    train = pd.concat([trainBase, expTrainRows])
    test = pd.concat([testBase, expTestRows])

    nonFeatureCols = ['target', 'isExp']
    X_train, y_train = train.drop(nonFeatureCols, axis = 'columns'), train.isExp
    X_test, y_test = test.drop(nonFeatureCols, axis = 'columns'), test.isExp

    treeModel.fit(X_train, y_train)
    print(f1_score(y_test, treeModel.predict(X_test)))
    print(f1_score(y_train, treeModel.predict(X_train)))

    perm = PermutationImportance(treeModel).fit(X_test, y_test)
    # Take the feature names from the frame the model was fitted on instead of
    # the notebook-level X, so the function works for any `data` passed in.
    featureNames = X_train.columns.tolist()
    display(eli5.show_weights(perm, feature_names = featureNames))

    treeGraph = export_graphviz(treeModel, out_file=None, feature_names=featureNames)
    return graphviz.Source(treeGraph)
plotTree2(trainPrep3)

In [None]:
# Random train/test membership masks, drawn separately for each price class.
# Expensive rows: fair coin flip, so roughly half of them land in training.
expensive = trainPrep3[trainPrep3.isExp == 1]
expensiveSelector = np.random.randint(2, size=len(expensive)).astype('bool')
# Cheap rows: True (training) with probability 0.75, False (test) with 0.25.
cheap = trainPrep3[trainPrep3.isExp == 0]
cheapSelector = np.random.choice(2, size=len(cheap), p=[0.25, 0.75]).astype('bool')

# Rows where the mask is True go to training, the complement to test.
trainExpensive, testExpensive = expensive[expensiveSelector], expensive[~expensiveSelector]
trainCheap, testCheap = cheap[cheapSelector], cheap[~cheapSelector]

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Stage 1: classifier that separates expensive from cheap houses.
splitter = RandomForestClassifier(n_estimators = 160)
trainAll = pd.concat([trainExpensive, trainCheap])
testAll = pd.concat([testExpensive, testCheap])
# Columns that must never be fed to a model as features.
nonFeatureCols = ['isExp', 'target']
splitter.fit(trainAll.drop(nonFeatureCols, axis = 'columns'), trainAll.isExp)
print(f1_score(testAll.isExp, splitter.predict(testAll.drop(nonFeatureCols, axis = 'columns'))))

# Stage 2: one linear regression per price segment, each scored on the
# hold-out rows of its own segment.
linRegExp = LinearRegression()
linRegChe = LinearRegression()
linRegExp.fit(trainExpensive.drop(nonFeatureCols, axis = 'columns'), trainExpensive.target)
print(mean_absolute_error(testExpensive.target, linRegExp.predict(testExpensive.drop(nonFeatureCols, axis = 'columns'))))
linRegChe.fit(trainCheap.drop(nonFeatureCols, axis = 'columns'), trainCheap.target)
# Fix: the cheap hold-out was previously scored with linRegExp (the model
# trained on expensive houses), which made the cheap-segment MAE meaningless.
print(mean_absolute_error(testCheap.target, linRegChe.predict(testCheap.drop(nonFeatureCols, axis = 'columns'))))


Time to give up.

I'm struggling with the fact that there are some houses in the dataset which have very high target value yet seem hard to separate.
Linear models in particular suffer from it, as sometimes the predicted values go really crazy.

The last attempt was to first train a decision tree to do that separation, and then fit a separate linear regression model on each of the subsets.
But that produced pretty embarrassing scores.

Overall, nothing seems to realistically beat just a Decision Tree.