In [245]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy import stats
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression

In [246]:
train = pd.read_csv('../data/house_train.csv')
test = pd.read_csv('../data/house_test.csv')

In [213]:
# Separating the categorical and continuous columns from the training dataset

def cat_con_df(train):
    
    cat = []
    con = []

    for i in train.columns:
        if (train[i].dtypes=='object'):
            cat.append(i)
        else:
            con.append(i)
    con.remove('Id')
    
    return cat, con

In [214]:
cat, con = cat_con_df(train)

In [216]:
# Check and fill the top 5 missing columns with '0' in both training and testing dataset

def fill_top_missing_values(train, test):

    miss1 = (train.isna().sum()/train.shape[0])*100
    miss1 = pd.DataFrame(miss1, columns=['count'])
    miss1 = miss1.sort_values(by='count', ascending=False)

    miss2 = (test.isna().sum()/test.shape[0])*100
    miss2 = pd.DataFrame(miss2, columns=['count'])
    miss2 = miss2.sort_values(by='count', ascending=False)

    print(f"Top 10 missing features {miss1[:10]} from training dataset")
    print(f"Top 10 missing features {miss2[:10]} from testing dataset")

    train_miss = (miss1[:6].index).values
    test_miss = (miss2[:6].index).values

    for i in train_miss:
        train[i].fillna("0", inplace=True)

    for i in test_miss:
        test[i].fillna("0", inplace=True)
        
    return train, test

In [217]:
# Removing the rest of the missing columns from both training and testing dataset

def fill_missing_values(train, test):

    si1 = SimpleImputer(strategy='mean')
    si2 = SimpleImputer(strategy='most_frequent')

    A = pd.DataFrame(si1.fit_transform(train[con]), columns=con)
    B = pd.DataFrame(si2.fit_transform(train[cat]), columns=cat)

    train_new = A.join(B)
    con.remove('SalePrice')

    A = pd.DataFrame(si1.fit_transform(test[con]), columns=con)
    B = pd.DataFrame(si2.fit_transform(test[cat]), columns=cat)

    test_new = A.join(B)
    con.append('SalePrice')
    
    return train_new, test_new

In [218]:
train, test = fill_top_missing_values(train, test)

Top 10 missing features                  count
PoolQC       99.520548
MiscFeature  96.301370
Alley        93.767123
Fence        80.753425
MasVnrType   59.726027
FireplaceQu  47.260274
LotFrontage  17.739726
GarageYrBlt   5.547945
GarageCond    5.547945
GarageType    5.547945 from training dataset
Top 10 missing features                   count
PoolQC        99.794380
MiscFeature   96.504455
Alley         92.666210
Fence         80.123372
MasVnrType    61.274846
FireplaceQu   50.034270
LotFrontage   15.558602
GarageYrBlt    5.346127
GarageQual     5.346127
GarageFinish   5.346127 from testing dataset


In [219]:
train_new, test_new = fill_missing_values(train, test)

In [220]:
def check_skew(data):
    
    con.remove('SalePrice')
    skewed = data[con].apply(lambda x: skew(x))
    skewed = skewed[skewed > 0.75]
    skewed = skewed.index

    data[skewed] = np.log1p(data[skewed])
    con.append('SalePrice')
    
    return data

In [221]:
train_new = check_skew(train_new)
test_new = check_skew(test_new)

In [222]:
def scaling(train_new, test_new):
    
    ss = StandardScaler()

    con.remove('SalePrice')
    train_new[con] = ss.fit_transform(train_new[con])
    test_new[con] = ss.transform(test_new[con])

    con.append('SalePrice')
    
    return train_new, test_new

In [223]:
train_new, test_new = scaling(train_new, test_new)

In [224]:
# Removing the outliers from the below columns as they have more number of outliers

def handle_outliers(train_new):
    
    val1 = ['BsmtUnfSF', 'TotalBsmtSF', 'KitchenAbvGr', 'ScreenPorch']

    for i in val1:
        Q1 = train_new[i].quantile(0.05)
        Q3 = train_new[i].quantile(0.95)
        IQR = Q3 - Q1
        train_new = train_new[(train_new[i] >= Q1 - 1.5*IQR) & (train_new[i] <= Q3 + 1.5*IQR)]
        
    return train_new

In [225]:
train_new = handle_outliers(train_new)

In [226]:
# perform one hot encoding so as to handle unseen values

def encode_train_test(train_new, test_new):
    
    le = LabelEncoder()

    for i in cat:
        train_new[i] = le.fit_transform(train_new[i])
        test_new[i] = le.fit_transform(test_new[i])
        
    return train_new, test_new

In [227]:
train_new, test_new = encode_train_test(train_new, test_new)

In [229]:
def train_val_split(train_new):

    X = train_new.drop(labels='SalePrice', axis=1)
    Y = train_new['SalePrice']

    xtrain, xval, ytrain, yval = train_test_split(X, Y, test_size=0.2, random_state=40)
    print(xtrain.shape)
    print(xval.shape)
    
    return xtrain, xval, ytrain, yval

In [230]:
xtrain, xval, ytrain, yval = train_val_split(train_new)

(1096, 79)
(275, 79)


In [232]:
def model_evaluate(xval, yval, ypred, model_type):
    
    print(f"model type is {model_type}")
    mean_cross_val_score = np.abs(np.mean(cross_val_score(dtr, X, Y, scoring='neg_mean_absolute_error', cv=5)))
    print(f"mean of cross validation score is {mean_cross_val_score}")
    print(f"mean absolute score for {model_type} is {mean_absolute_error(ypred, yval)}")
    print(f"R2 score is {r2_score(ypred, yval)}")
    r = r2_score(ypred, yval)
    n = xval.shape[0]
    p = xval.shape[1]
    adjr = 1-(1-r)*(n-1)/(n-p-1)
    print(f"Adjusted R2 score is {adjr}")

In [243]:
def model_train(xtrain, ytrain, xval, yval, model_type='RFR'):

    if model_type == "LIN":
        dtr = LinearRegression()
        model = lr.fit(xtrain, ytrain)
    elif model_type == "DTR":
        dtr = DecisionTreeRegressor(random_state=30, criterion='absolute_error', max_depth=10)
        model = dtr.fit(xtrain, ytrain)
    elif model_type == 'RFR':
        rfr = RandomForestRegressor(random_state=30, criterion='absolute_error', max_depth=10, n_estimators=20)
        model = rfr.fit(xtrain, ytrain)

    ypred = model.predict(xval)
    model_evaluate(xval, yval, ypred, model_type)

In [244]:
model_train(xtrain, ytrain, xval, yval)

model type is RFR
mean of cross validation score is 25977.14008095554
mean absolute score for RFR is 19455.055545454547
R2 score is 0.7710185772892303
Adjusted R2 score is 0.6782517444987133


In [238]:
def top_feature(model, X):
    
    imp = pd.DataFrame()
    imp['col'] = X.columns
    imp['importance'] = model.feature_importances_
    imp = imp.sort_values(by='importance', ascending=False)
    print(f"Top 10 important features are: {imp[:10]}")

In [239]:
top_feature(model, X)

Top 10 important features are:             col  importance
3   OverallQual    0.314746
15    GrLivArea    0.112345
26   GarageArea    0.068203
12     1stFlrSF    0.062964
11  TotalBsmtSF    0.058216
18     FullBath    0.042226
57     BsmtQual    0.027754
8    BsmtFinSF1    0.027463
5     YearBuilt    0.025574
25   GarageCars    0.019152


In [240]:
def add_predicted_col(model, test_new):
    
    ypred_main = model.predict(test_new)
    test_new['SalePrice'] = ypred_main
    
    return test_new

In [241]:
add_predicted_col(model, test_new)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition,SalePrice
0,-1.125202,240.867584,0.482944,-0.795151,0.381743,-0.340077,-1.156380,-0.815959,0.642211,2.342933,...,2,3,4,2,0,3,0,8,4,121957.300
1,-1.125202,244.045974,0.879380,-0.071836,0.381743,-0.439440,-1.301740,0.968010,0.868926,-0.355342,...,2,3,4,2,0,0,1,8,4,156333.750
2,0.424462,221.797249,0.819235,-0.795151,-0.517200,0.852269,0.636400,-0.815959,0.817388,-0.355342,...,0,3,4,2,0,3,0,8,4,183360.000
3,0.424462,234.510806,0.188077,-0.071836,0.381743,0.885390,0.636400,0.341775,0.726234,-0.355342,...,0,3,4,2,0,0,0,8,4,179850.825
4,1.419810,123.267180,-1.145753,1.374795,-0.517200,0.686666,0.345679,-0.815959,0.450086,-0.355342,...,1,3,4,2,0,0,0,8,4,213172.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1.834876,53.342616,-2.981622,-1.518467,1.280685,-0.041991,-0.720298,-0.815959,-1.414140,-0.355342,...,2,3,4,2,0,0,0,8,4,86030.000
1455,1.834876,53.342616,-3.024008,-1.518467,-0.517200,-0.041991,-0.720298,-0.815959,0.435856,-0.355342,...,2,3,4,2,0,0,0,8,0,91760.000
1456,-1.125202,495.138728,1.532451,-0.795151,1.280685,-0.373198,0.539493,-0.815959,0.963203,-0.355342,...,2,3,4,2,0,0,0,8,0,148936.250
1457,0.923612,183.656577,0.275768,-0.795151,-0.517200,0.686666,0.345679,-0.815959,0.532698,-0.355342,...,2,3,4,2,0,3,3,8,4,114187.500
