In [None]:
#Import the required packages

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the training set; it carries the SalePrice target that is split off further down.
data = pd.read_csv('../input/iowa-house-prices/train.csv')

In [None]:
# Load the test set; it is preprocessed and scored at the end of the notebook to build the submission file.
test = pd.read_csv('../input/iowa-house-prices/test.csv')

In [None]:
# Distribution of SalePrice, LotArea, GrLivArea and MasVnrArea, one panel per feature.

num_features = ["SalePrice", "LotArea", "GrLivArea", "MasVnrArea"]

# Create the 2x2 grid up front and draw on explicit Axes. Calling plt.subplots() without a
# grid created one full-size Axes that the later plt.subplot() calls drew on top of.
fig, axes = plt.subplots(2, 2, figsize = (18, 15))
for ax, feature in zip(axes.ravel(), num_features):
    # NOTE(review): distplot is deprecated in recent seaborn releases; move to
    # sns.histplot(..., kde=True) once the environment's seaborn version is confirmed.
    sns.distplot(data[feature], bins = 30, kde = True, color = "y", ax = ax)
    ax.set_title(feature)

plt.show()

In [None]:
# Boxplot of SalePrice grouped by SaleType.

sns.set(rc = {'figure.figsize' : (11, 13)})
# Pass x/y by keyword: seaborn >= 0.12 only accepts `data` positionally, so the
# positional form raises TypeError there. Keywords work on older versions too.
sns.boxplot(x = data["SaleType"], y = data["SalePrice"], palette = "husl");

In [None]:
#Find out how many columns have missing values more than 50%
#Also list out how of many are categorical and numerical columns 

def missing_value_info(data):
    """Print a short report on missing values in ``data``.

    Two lines are written to stdout:
      * the names of the columns where strictly more than 50% of rows are null;
      * how many columns have a non-zero share of nulls.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
    """
    # Percentage of null rows per column, indexed by column name.
    n_rows = data.shape[0]
    missing_percent = data.isnull().sum() / n_rows * 100

    mostly_missing = missing_percent[missing_percent > 50].index.tolist()
    print("The columns with more than 50% of missing values are ", mostly_missing)

    with_gaps = missing_percent[missing_percent != 0].index.tolist()
    print("The total number of columns with missing observations are", len(with_gaps))
    


In [None]:
# Report the mostly-empty columns and the number of columns with gaps in the training set.
missing_value_info(data)

In [None]:
# Remove the same three columns from both frames (presumably the mostly-empty ones reported
# by missing_value_info above; confirm against its output).
# Rebinding with errors="ignore" keeps this cell safe to re-run: the in-place drop raised
# KeyError on a second execution because the columns were already gone.
sparse_columns = ["Alley", "Fence", "MiscFeature"]
data = data.drop(columns = sparse_columns, errors = "ignore")
test = test.drop(columns = sparse_columns, errors = "ignore")

In [None]:
#column types and description
def Cat_col(data):
    """Print the value counts of every object-dtype column in ``data``.

    For each selected column a separator line, a header naming the column and the
    result of ``value_counts()`` are written to stdout. Only columns of ``object``
    dtype are selected.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
    """
    separator = "=" * 40
    header = "The unique values in {} are"

    for col in data.select_dtypes(include = object).columns:
        print(separator)
        print(header.format(col))
        # Frequency of each distinct value in this column.
        print(data[col].value_counts())


In [None]:
# Columns whose name begins with "Garage"; preprocessing() reuses this index later on.
is_garage_column = data.columns.str.startswith("Garage")
garage_columns = data.columns[is_garage_column]

# Fraction of training rows that are null in each garage column. Kept for inspection,
# to judge whether the nulls go beyond houses that simply have no garage.
n_rows = data.shape[0]
missing_obs_garage = data[garage_columns].isnull().sum() / n_rows

In [None]:
# For rows where BsmtCond is null, replace any nulls in the basement columns with 0.
# Other rows are left untouched.
bsmt_columns = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
                'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'BsmtFullBath',
                'BsmtHalfBath']
bsmt_cond_missing = data["BsmtCond"].isnull()
data.loc[bsmt_cond_missing, bsmt_columns] = data.loc[bsmt_cond_missing, bsmt_columns].fillna(0)

In [None]:
from collections import defaultdict

# Lookup tables that turn the quality / exposure / finish-type codes into integers.
# Each table is a defaultdict, so any key that is not listed maps to 0.

# Quality and condition codes.
quality_labels = defaultdict(int, {"Ex": 1, "Gd": 2, "TA": 3, "Fa": 4, "Po": 5})

# Basement exposure codes.
BsmtExposure_labels = defaultdict(int, {"No": 4, "Av": 2, "Gd": 1, "Mn": 3})

# Basement finish-type codes, shared by BsmtFinType1 and BsmtFinType2.
BsmtFinType1_2 = {
    "GLQ": 1,
    "ALQ": 2,
    "BLQ": 3,
    "Rec": 4,
    "LwQ": 5,
    "Unf": 6,
}
BsmtFinType_labels = defaultdict(int, BsmtFinType1_2)


In [None]:
# Replace the quality / exposure / finish-type codes with their integer labels.
#
# The mapping is applied to BOTH the training and the test frame. Previously only `data`
# was mapped, so the same columns in `test` reached the model with a different encoding
# from the one it was trained on.
#
# Columns that are already numeric are skipped. The lookup tables default to 0, so mapping
# an already-encoded column a second time (e.g. when re-running this cell) would silently
# turn every value into 0.
#
# NOTE(review): ExterQual and FireplaceQu are not mapped here and stay as loaded.
quality_columns = ["BsmtQual", "ExterCond", "BsmtCond", "HeatingQC",
                   "KitchenQual", "GarageQual", "GarageCond"]

ordinal_maps = {column: quality_labels for column in quality_columns}
ordinal_maps["BsmtExposure"] = BsmtExposure_labels
ordinal_maps["BsmtFinType1"] = BsmtFinType_labels
ordinal_maps["BsmtFinType2"] = BsmtFinType_labels

for frame in (data, test):
    for column, labels in ordinal_maps.items():
        if not pd.api.types.is_numeric_dtype(frame[column]):
            frame[column] = frame[column].map(labels)

In [None]:
# -------------------Encoding categorical features----------------------

def encode(data, features, methods = "LabelEncoding"):
    """Encode the categorical columns listed in ``features``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to encode.
    features : iterable of str
        Names of the columns to encode.
    methods : {"LabelEncoding", "OneHotEncoding"}, default "LabelEncoding"
        * "LabelEncoding": each column is cast to ``str`` and replaced by integer
          codes from a ``LabelEncoder`` fitted on that column. ``data`` is modified
          in place and also returned.
        * "OneHotEncoding": the listed columns are replaced by dummy columns with the
          first level dropped. A new frame is returned; ``data`` is left untouched.

    Returns
    -------
    pandas.DataFrame
        The encoded frame.

    Raises
    ------
    ValueError
        If ``methods`` is not one of the supported names.
    """
    if methods == "LabelEncoding":
        label_encoder = LabelEncoder()
        for feature in features:
            # astype(str) turns NaN into the string "nan", which then gets its own code.
            data[feature] = label_encoder.fit_transform(data[feature].astype(str))
    elif methods == "OneHotEncoding":
        # `features` must be passed as `columns`; the second positional argument of
        # get_dummies is `prefix`. get_dummies already returns the untouched columns
        # plus the dummies, so concatenating onto `data` again would duplicate columns.
        data = pd.get_dummies(data, columns = list(features), drop_first = True)
    else:
        raise ValueError(
            "Unknown encoding method {!r}; expected 'LabelEncoding' or 'OneHotEncoding'".format(methods)
        )
    return data
            


In [None]:
# Columns present in the training frame but absent from the test frame.
set(data.columns) - set(test.columns)

In [None]:
# Feature matrices: drop the identifier from both frames, and the target from the training frame.
train_data = data.drop(["Id", "SalePrice"], axis = 1)
# Build the test features from `test`. Building them from `data` produced a copy of the
# training frame that still contained the SalePrice target.
test_data = test.drop(["Id"], axis = 1)

In [None]:
# Target vector, followed by a shuffled train / hold-out split with a fixed seed.
y = data["SalePrice"].to_numpy()
x_train, x_test, y_train, y_test = train_test_split(
    train_data,
    y,
    shuffle = True,
    random_state = 3,
)

In [None]:
# Show the feature names that go into the model.
pd.DataFrame(x_train).columns

In [None]:
# Feature engineering: scaling, missing-value imputation and categorical encoding.

def preprocessing(x_train):
    """Scale, impute and label-encode one feature frame.

    Steps, in order:
      1. Standardize LotArea, GrLivArea and MasVnrArea with a scaler fitted on this frame.
      2. Fill missing values (see below).
      3. Label-encode every remaining object-dtype column via ``encode``.

    Imputation runs BEFORE encoding. ``encode`` casts columns to ``str``, which turns NaN
    into the literal "nan"; filling afterwards therefore never found anything to fill in
    the categorical columns.

    The function works on a copy, so the frame passed in is not modified.

    Relies on the notebook globals ``data`` (source of the most frequent values used for
    imputation) and ``garage_columns``.

    NOTE(review): the scaler and the label encoders are fitted on whatever frame is passed
    in, so calling this separately for train / hold-out / test does not guarantee identical
    scaling or codes across frames.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Feature frame containing the columns referenced below.

    Returns
    -------
    pandas.DataFrame
        A new, preprocessed frame.
    """
    x_train = pd.DataFrame(x_train).copy()

    # ------------------- Standardize the skewed numerical variables -----------------
    skewed_columns = ["LotArea", "GrLivArea", "MasVnrArea"]
    x_train[skewed_columns] = StandardScaler().fit_transform(x_train[skewed_columns])

    # ------------------- Missing value imputation ------------------------------------
    # Lot frontage: mean of the frame being processed.
    x_train["LotFrontage"] = x_train["LotFrontage"].fillna(x_train["LotFrontage"].mean())

    # Most frequent value taken from the global training frame `data`.
    # NOTE(review): a missing FireplaceQu presumably means "no fireplace"; filling with the
    # most frequent value follows the original intent -- confirm this is what is wanted.
    for column in ("BsmtExposure", "BsmtFinType2", "FireplaceQu"):
        x_train[column] = x_train[column].fillna(data[column].value_counts().index[0])

    # Missing garage information is filled with 0 ("no garage").
    x_train[garage_columns] = x_train[garage_columns].fillna(0)

    # Missing pool quality gets its own category.
    x_train["PoolQC"] = x_train["PoolQC"].fillna("NoPool")

    # ------------------- Encode the object-type columns ------------------------------
    features = x_train.select_dtypes(include = object).columns
    x_train = encode(data = x_train, features = features, methods = "LabelEncoding")

    return x_train


In [None]:
# Preprocess the training split, the hold-out split and the submission frame.
# NOTE(review): preprocessing() fits a fresh StandardScaler and fresh LabelEncoders on each
# frame it receives, so scaling and category codes are not guaranteed to match across
# these three frames.
x_train = preprocessing(x_train)
x_test = preprocessing(x_test)
test = preprocessing(test)

In [None]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized hyper-parameter search.
parameter_grid = {"n_estimators" : [300, 500, 800, 1000],
                  "num_leaves": list(range(20, 150)),
                  # The row-subsampling fraction must be > 0; the previous grid started
                  # at 0.0, which LightGBM rejects, so those candidates failed to fit.
                  "subsample" : np.linspace(0.5, 1.0, 6),
                  # LightGBM ignores `subsample` unless bagging is enabled with a
                  # non-zero frequency.
                  "subsample_freq": [1],
                  "colsample_bytree": [0.5, 0.6, 0.7],
                  "reg_alpha": [0.01, 0.05, 0.1, 0.5],
                  "reg_lambda": [0.01, 0.05, 0.1, 0.5]}

In [None]:
# Base estimator and 5-fold randomized search over `parameter_grid`.
clf = lgb.LGBMRegressor(learning_rate = 0.5, metric = "mean_squared_error")
# Fix the seed so the sampled parameter combinations (and therefore the reported best
# parameters) are the same on every Restart & Run All. The seed matches the one used
# for the train / hold-out split.
randomsearch_cv = RandomizedSearchCV(clf, parameter_grid, cv = 5, random_state = 3)

In [None]:
# Run the randomized search on the preprocessed training split.
randomsearch_cv.fit(x_train, y_train)

In [None]:
# Report the winning parameter combination, the refitted estimator and its mean CV score.
# NOTE(review): no `scoring` is passed to the search, so the score is presumably the
# estimator's default (R^2 for regressors) rather than an error metric -- confirm.
print("Best Parameters: ", randomsearch_cv.best_params_)
print("Best Estimators: ",randomsearch_cv.best_estimator_)
print("Best Score: ", randomsearch_cv.best_score_)

In [None]:

import lightgbm as lgb

# Final model with hard-coded hyper-parameters.
# NOTE(review): these values are typed in by hand rather than read from
# randomsearch_cv.best_params_, and the learning rate (0.4) differs from the 0.5 used
# for the search estimator -- confirm they are the intended settings.
gbm_params = {
    "colsample_bytree": 0.5,
    "learning_rate": 0.4,
    "metric": 'mean_squared_error',
    "n_estimators": 800,
    "num_leaves": 24,
    "reg_alpha": 0.01,
    "reg_lambda": 0.5,
}
gbm = lgb.LGBMRegressor(**gbm_params)
gbm.fit(x_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

# Score the final model on the hold-out split. Arguments are given in the conventional
# (y_true, y_pred) order; the absolute error is symmetric, so the value is unchanged.
y_pred = gbm.predict(x_test)
holdout_mae = mean_absolute_error(y_test, y_pred)
print("The mean absolute error is", holdout_mae)

In [None]:
# Importances of the fitted model, most important first, indexed by feature name.
feature_importance = (
    pd.DataFrame({
        "Features": gbm.feature_name_,
        "Feature_Importance": gbm.feature_importances_,
    })
    .sort_values(by = "Feature_Importance", ascending = False)
    .set_index("Features")
)

In [None]:
# Horizontal bar chart of the 15 most important features. The order is reversed so the
# most important feature ends up at the top of the chart.
top_features = feature_importance.head(15).iloc[::-1]

fig, ax = plt.subplots(figsize = (10, 6))
ax.barh(list(top_features.index), list(top_features["Feature_Importance"]))
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel("Feature importance")
ax.set_ylabel("Feature")
ax.set_title("Top 15 features of the LightGBM model")
plt.show()

In [None]:
# Build the submission file: Ids come from a fresh read of the raw test CSV, predictions
# from the preprocessed `test` frame with its Id column removed.
predict_test = pd.read_csv('../input/iowa-house-prices/test.csv')

submission_features = test.drop(["Id"], axis = 1)
SalePrice = gbm.predict(submission_features)

output = pd.DataFrame({"Id" : predict_test["Id"], "SalePrice":SalePrice})
output.to_csv("submission_final.csv", index = False)