# House Prices Prediction
References:
* https://www.kaggle.com/learn/machine-learning
* https://www.kaggle.com/helgejo/an-interactive-data-science-tutorial
* http://blog.kaggle.com/2016/07/21/approaching-almost-any-machine-learning-problem-abhishek-thakur/

In [None]:
######################################################################
# DEPENDENCIES
######################################################################

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# # Modelling Algorithms
from sklearn.tree import DecisionTreeRegressor
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.svm import SVC
# from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestRegressor
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import GradientBoostingClassifier

# # Modelling Helpers
from sklearn.metrics import mean_absolute_error
# from sklearn.preprocessing import Imputer
# from sklearn.preprocessing import Normalizer
# from sklearn.preprocessing import scale
# from sklearn.cross_validation import train_test_split
# from sklearn.cross_validation import StratifiedKFold
# from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure visualisations
# IPython magic (not plain Python): only valid when run inside a notebook/IPython session.
%matplotlib inline
# Select matplotlib's 'ggplot' style sheet, then seaborn's 'white' axes style.
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
# Default figure size, given as the tuple (8, 6).
pylab.rcParams[ 'figure.figsize' ] = 8 , 6

# Names of the columns treated as numeric candidate predictors.
# Kept as a whitespace-separated block; the resulting list order matches
# the order in which the names are written here.
numeric_attributes = """
    MSSubClass LotFrontage LotArea OverallQual OverallCond
    YearBuilt YearRemodAdd MasVnrArea BsmtFinSF2 BsmtUnfSF
    TotalBsmtSF 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea
    BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
    KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars
    GarageArea WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
    ScreenPorch PoolArea MiscVal MoSold YrSold
""".split()

In [None]:
######################################################################
# HELPER FUNCTIONS
######################################################################

def plot_histograms(df, variables, n_rows, n_cols):
    """Draw one 10-bin histogram per column name in ``variables``.

    Each histogram goes into its own subplot of an ``n_rows`` x ``n_cols``
    grid, filled in the order the names are given, and is titled with the
    column name. The figure is shown before returning.
    """
    figure = plt.figure()
    # Subplot positions are 1-based, hence start=1.
    for position, column in enumerate(variables, start=1):
        axis = figure.add_subplot(n_rows, n_cols, position)
        df[column].hist(bins=10, ax=axis)
        axis.set_title(column)
    figure.tight_layout()
    plt.show()

def plot_pairwise_relationships(df, x_vars, y_vars):
    """Draw a seaborn pair plot of ``y_vars`` against ``x_vars`` for ``df``.

    Parameters:
        df: DataFrame holding the columns to plot.
        x_vars: column names used for the columns of the plot grid.
        y_vars: column names used for the rows of the plot grid.
    """
    # Plot the frame that was passed in, not a module-level global.
    sns.pairplot(df, y_vars=y_vars, x_vars=x_vars)
    
def plot_triple_relationships(df, x, y, groupby, bars=True):
    """Plot ``y`` against ``x`` with one hue per ``groupby`` value.

    A bar plot is drawn when ``bars`` is truthy, a point plot otherwise.
    """
    draw = sns.barplot if bars else sns.pointplot
    draw(x=x, y=y, hue=groupby, data=df)
    
def plot_distribution(df, target, var, **kwargs):
    """Plot shaded KDE curves of ``target``, one hue per value of ``var``.

    Optional keyword arguments ``row`` and ``col`` are forwarded to the
    FacetGrid to split the plot into facets. The x axis runs from 0 to the
    maximum of ``df[target]``.
    """
    grid = sns.FacetGrid(
        df,
        hue=var,
        aspect=4,
        row=kwargs.get('row'),
        col=kwargs.get('col'),
    )
    grid.map(sns.kdeplot, target, shade=True)
    upper_limit = df[target].max()
    grid.set(xlim=(0, upper_limit))
    grid.add_legend()

def plot_categories(df, cat, target, **kwargs):
    """Draw bar plots of ``target`` per ``cat`` value on a FacetGrid.

    Optional keyword arguments ``row`` and ``col`` are forwarded to the
    FacetGrid to split the plot into facets.
    """
    grid = sns.FacetGrid(df, row=kwargs.get('row'), col=kwargs.get('col'))
    grid.map(sns.barplot, cat, target)
    grid.add_legend()

def plot_regression(df, x_var, y_var):
    """Draw a seaborn joint plot of ``y_var`` against ``x_var`` with kind "reg"."""
    joint_options = dict(x=x_var, y=y_var, data=df, kind="reg")
    sns.jointplot(**joint_options)
    
def plot_residuals(df, x_var, y_var):
    """Draw a seaborn residual plot of ``y_var`` against ``x_var``."""
    # Marker size forwarded to the underlying scatter call.
    marker_options = {"s": 80}
    sns.residplot(data=df, x=x_var, y=y_var, scatter_kws=marker_options)
    
def plot_prediction(X_test, y_test, y_prediction, y_variable, x_groupby_y):
    """Compare observed and predicted target values in a faceted point plot.

    Parameters:
        X_test: DataFrame of predictor columns.
        y_test: observed target values; its index is reused for the predictions.
        y_prediction: predicted target values, in the same row order as ``y_test``.
        y_variable: column name given to the predicted values.
        x_groupby_y: three column names ``[x, facet, y]``: the x axis column,
            the column used to split the plot into facets, and the plotted
            y column.
            NOTE(review): presumably the third name equals ``y_variable`` (and
            the name of ``y_test``) so it exists in both frames; confirm.

    Observed rows are labelled "data" and predicted rows "model" in a "kind"
    column, which is used as the hue.
    """
    observed = pd.concat([X_test, y_test], axis=1)
    predicted_target = pd.DataFrame(
        data=y_prediction, index=y_test.index, columns=[y_variable]
    )
    predicted = pd.concat([X_test, predicted_target], axis=1)
    # Stack both frames; the dict keys become the outer index level "kind",
    # which reset_index() then turns into a regular column.
    data_all = pd.concat(
        dict(data=observed[x_groupby_y], model=predicted[x_groupby_y]),
        names=["kind"],
    ).reset_index()

    x_col, facet_col, y_col = x_groupby_y[0], x_groupby_y[1], x_groupby_y[2]
    # factorplot was renamed to catplot and later removed from seaborn;
    # prefer catplot and only fall back to factorplot on old releases.
    draw = getattr(sns, "catplot", None) or sns.factorplot
    # Keyword arguments: recent seaborn no longer accepts x/y/hue positionally.
    draw(
        x=x_col,
        y=y_col,
        hue="kind",
        data=data_all,
        col=facet_col,
        kind="point",
        linestyles=["-", "--"],
        markers=["o", "D"],
    )

def plot_correlation_map(df, annot=True):
    """Draw a square heatmap of the correlation matrix of ``df``.

    ``annot`` is forwarded to seaborn's heatmap and controls whether the
    cells are annotated.
    """
    correlations = df.corr()
    _, axes = plt.subplots(figsize=(12, 10))
    palette = sns.diverging_palette(220, 10, as_cmap=True)
    heatmap_options = dict(
        cmap=palette,
        square=True,
        cbar_kws={'shrink': .9},
        ax=axes,
        annot=annot,
        annot_kws={'fontsize': 12},
    )
    sns.heatmap(correlations, **heatmap_options)

def plot_correlation_cluster(df):
    """Draw a seaborn cluster map of the correlation matrix of ``df``."""
    # Cluster the correlations of the frame that was passed in, not those of
    # a module-level global.
    corr = df.corr()
    sns.clustermap(corr)
    
def describe_more(df):
    """Summarise every column of ``df`` by distinct-value count and dtype.

    Returns:
        A DataFrame with one row per column of ``df`` and the columns
        'Variable' (column name), 'Levels' (number of distinct non-null
        values) and 'Datatype' (the column's dtype), sorted by ascending
        'Levels'.
    """
    columns = list(df)
    levels = pd.DataFrame({
        'Variable': columns,
        # Series.value_counts replaces the deprecated top-level
        # pd.value_counts; it skips missing values by default.
        'Levels': [len(df[name].value_counts()) for name in columns],
        'Datatype': [df[name].dtypes for name in columns],
    })
    return levels.sort_values(by='Levels')

def plot_variable_importance(X, y):
    """Fit a decision tree on ``(X, y)`` and plot its feature importances.

    Parameters:
        X: DataFrame of predictors (its column names label the plot).
        y: target values.
    """
    # The file imports only the regressor (the classifier import is commented
    # out), so use DecisionTreeRegressor rather than the undefined
    # DecisionTreeClassifier.
    tree = DecisionTreeRegressor(random_state=99)
    tree.fit(X, y)
    plot_model_var_imp(tree, X, y)
    
def plot_model_var_imp(model, X, y):
    """Plot the ten most important features of a fitted model and print its score.

    Parameters:
        model: fitted estimator exposing ``feature_importances_`` and ``score``.
        X: DataFrame of predictors; its columns label the importances.
        y: target values, passed to ``model.score`` together with ``X``.
    """
    imp = pd.DataFrame(
        model.feature_importances_,
        columns=['Importance'],
        index=X.columns,
    )
    imp = imp.sort_values(['Importance'], ascending=True)
    # After an ascending sort the most important features are at the end;
    # taking the head would plot the ten LEAST important ones instead.
    imp.tail(10).plot(kind='barh')
    print(model.score(X, y))
    
def get_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    """Return the validation mean absolute error of a decision tree regressor.

    The tree is built with the given ``max_leaf_nodes`` and ``random_state=0``,
    fitted on the training predictors/targets and scored on the validation
    predictors/targets.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(predictors_train, targ_train)
    return mean_absolute_error(targ_val, tree.predict(predictors_val))

def filter_numeric_predictors(data_1, y_variable, data_2=None, candidates=None):
    """Select the candidate predictors that survive NaN-column removal.

    Columns containing any missing value are dropped from ``data_1`` (and
    from ``data_2`` when given) IN PLACE. A candidate is kept when it is
    still a column of ``data_1`` (and of ``data_2`` when given) and is not
    the target ``y_variable``.

    Parameters:
        data_1: DataFrame to filter; modified in place.
        y_variable: name of the target column, never returned as a predictor.
        data_2: optional second DataFrame; modified in place. When given,
            predictors must be present in both frames.
        candidates: optional list of candidate column names. Defaults to the
            module-level ``numeric_attributes``.

    Returns:
        ``(data_1, predictors, data_2)`` where ``predictors`` is a new list in
        the order of ``candidates``.
    """
    if candidates is None:
        candidates = numeric_attributes

    data_1.dropna(axis=1, how='any', inplace=True)
    # Work on the frames passed in rather than on module-level globals.
    attributes = set(data_1.columns)
    if data_2 is not None:
        data_2.dropna(axis=1, how='any', inplace=True)
        attributes &= set(data_2.columns)
    # Exclude the target in both branches; discard() is a no-op when the
    # target is absent (e.g. already dropped or missing from the test frame).
    attributes.discard(y_variable)

    print('{} predictors'.format(len(candidates)))
    predictors = [p for p in candidates if p in attributes]
    print('{} valid predictors'.format(len(predictors)))
    return data_1, predictors, data_2

In [None]:
######################################################################
# LOADING DATA
######################################################################

# Relative path of the CSV file that is loaded below.
main_file_path = '../input/train.csv'
# Read the CSV into the module-level DataFrame `data`.
data = pd.read_csv(main_file_path)
# Marker line printed after the load.
print('hello world!')