In [None]:
## EXPLORATORY DATA ANALYSIS

In [None]:
# Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Show every column when a wide DataFrame is displayed (no column truncation).
# Call the documented top-level `pd.set_option` instead of reaching it through
# the undocumented `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

In [None]:
# Load the training CSV from the Kaggle input directory and report its shape.
dataset = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
)
print(dataset.shape)

In [None]:
# Preview the first rows of the freshly loaded training frame.
dataset.head()

In [None]:
# Finding the percentage of missing values (NA or NULL) in each column, rounded to 4 decimal places

In [None]:
# Columns containing at least one missing value. The threshold is `> 0` so that
# a column with a single NaN is reported as well.
features_with_na = [features for features in dataset.columns if dataset[features].isnull().sum() > 0]

# `.isnull().mean()` is a fraction between 0 and 1; multiply by 100 so the
# number printed next to "%" really is a percentage.
for feature in features_with_na:
    print(feature, np.round(dataset[feature].isnull().mean() * 100, 4), "% missing values")

In [None]:
# For each feature with missing values, bar plot of the median SalePrice for rows where the value is missing (1) versus present (0)
# This helps to see whether missingness in a feature is associated with a different sale price

In [None]:
# One bar chart per feature with missing values: median SalePrice for rows
# where the feature is missing (1) versus present (0).
for feature in features_with_na:
    data = dataset.copy()

    # Replace the column with a 0/1 missing-value flag on the working copy.
    data[feature] = np.where(data[feature].isnull(), 1, 0)

    median_price_by_flag = data.groupby(feature)['SalePrice'].median()
    median_price_by_flag.plot.bar()
    plt.title(feature)
    plt.show()

In [None]:
# Number of entries in the Id column.
id_count = len(dataset.Id)
print("ID of houses: {} ".format(id_count))

In [None]:
# Finding total number of numerical features

In [None]:
# Every column whose dtype is not 'object' is treated as numerical.
numerical_features = [
    column
    for column, column_dtype in dataset.dtypes.items()
    if column_dtype != 'object'
]

print("No of numerical variables: ", len(numerical_features))

In [None]:
# Preview only the columns classified as numerical.
dataset[numerical_features].head()

In [None]:
# In numerical features, finding year features

In [None]:
# Numerical columns whose name contains "Yr" or "Year".
year_feature = [
    column
    for column in numerical_features
    if any(token in column for token in ("Yr", "Year"))
]
year_feature

In [None]:
# List the distinct values found in each year column.
for feature in year_feature:
    distinct_values = dataset[feature].unique()
    print(feature, distinct_values)

In [None]:
# Graph of year sold vs median house price. This shows the plot of the median house price for each year in the year sold feature

In [None]:
# Line plot of the median SalePrice for each YrSold value.
median_price_by_year = dataset.groupby("YrSold")["SalePrice"].median()
median_price_by_year.plot()
plt.xlabel("Year Sold")
plt.ylabel("Median House Price")
plt.title("House Price vs Year Sold")

In [None]:
# We observe that with increasing year sold, the sale price decreases
# Scatter plots of SalePrice against the age of each other year feature at sale time (YrSold minus that feature)

In [None]:
# Scatter SalePrice against the age of each year feature at sale time
# (YrSold - feature). YrSold itself is skipped.
for feature in year_feature:
    if feature != "YrSold":
        data = dataset.copy()
        data[feature] = data["YrSold"] - data[feature]

        # The plotting calls sit inside the `if` so that every scatter uses the
        # frame prepared for this feature; outside it, the YrSold iteration
        # would draw from whatever `data` an earlier iteration left behind.
        plt.scatter(data[feature], data["SalePrice"])
        plt.xlabel(feature)
        plt.ylabel("SalePrice")
        plt.show()

In [None]:
# Calculating number of discrete features from the numerical features, excluding the ID and year feature

In [None]:
# Numerical columns with fewer than 25 distinct values (NaN counted as a value),
# leaving out the year columns and Id.
not_discrete_candidates = year_feature + ["Id"]
discrete_feature = [
    column
    for column in numerical_features
    if dataset[column].nunique(dropna=False) < 25 and column not in not_discrete_candidates
]
print("Discrete Variables Count: {} ".format(len(discrete_feature)))

In [None]:
# Display the names of the columns classified as discrete.
discrete_feature

In [None]:
# Preview the values of the discrete columns.
dataset[discrete_feature].head()

In [None]:
# Relationship between discrete features and dependent feature Sale Price

In [None]:
# Bar chart of the median SalePrice for every level of each discrete feature.
for feature in discrete_feature:
    data = dataset.copy()
    median_price_by_level = data.groupby(feature)["SalePrice"].median()
    median_price_by_level.plot.bar()
    plt.xlabel(feature)
    plt.ylabel("SalePrice")
    plt.title(feature)
    plt.show()

In [None]:
# Calculating number of continuous features from the numerical features, excluding the ID and year feature

In [None]:
# Numerical columns that are neither discrete, nor year columns, nor Id.
not_continuous = discrete_feature + year_feature + ["Id"]
continuous_feature = [
    column for column in numerical_features if column not in not_continuous
]
continuous_feature

In [None]:
# Report how many continuous features were found.
continuous_count = len(continuous_feature)
print("Continuous Feature Count: {}".format(continuous_count))

In [None]:
# Histogram plot of each continuous feature

In [None]:
# 25-bin histogram of every continuous feature.
for feature in continuous_feature:
    data = dataset.copy()
    feature_values = data[feature]
    feature_values.hist(bins=25)
    plt.xlabel(feature)
    plt.ylabel("Count")
    plt.title(feature)
    plt.show()

In [None]:
# Log transformation of only those features that contain no zero values

In [None]:
# Scatter log(feature) against log(SalePrice) for continuous features that
# contain no zero value.
for feature in continuous_feature:
    data = dataset.copy()

    # Skip features containing a zero.
    if 0 in data[feature].unique():
        continue

    # Order matters: the feature column is overwritten first, then SalePrice is
    # logged from the current state of the working copy.
    data[feature] = np.log(data[feature])
    data["SalePrice"] = np.log(data["SalePrice"])

    plt.scatter(data[feature], data["SalePrice"])
    plt.xlabel(feature)
    plt.ylabel("SalePrice")
    plt.title(feature)
    plt.show()

In [None]:
# Outliers

In [None]:
# Box plot of log(feature) for continuous features that contain no zero value,
# to make outliers visible.
for feature in continuous_feature:
    data = dataset.copy()

    # Skip features containing a zero.
    if 0 in data[feature].unique():
        continue

    data[feature] = np.log(data[feature])
    data.boxplot(column=feature)
    plt.ylabel(feature)
    plt.title(feature)
    plt.show()

In [None]:
# Categorical Features

In [None]:
# Columns stored with dtype "object" are treated as categorical.
# The dtype is read from `dataset` itself; the loop-local `data` copy left over
# from the plotting cells has a log-transformed column and only exists if those
# cells were run first.
categorical_features = [feature for feature in dataset.columns if dataset[feature].dtypes == "object"]
categorical_features

In [None]:
# Number of categorical columns.
len(categorical_features)

In [None]:
# Preview the values of the categorical columns.
dataset[categorical_features].head()

In [None]:
# Report how many distinct values (NaN included) each categorical column has.
for feature in categorical_features:
    category_count = dataset[feature].nunique(dropna=False)
    print("The feature is {} and the number of categories are {}".format(feature, category_count))

In [None]:
# Relationship between categorical features and dependent feature Sale Price

In [None]:
# Bar chart of the median SalePrice for every category of each categorical feature.
for feature in categorical_features:
    data = dataset.copy()
    median_price_by_category = data.groupby(feature)["SalePrice"].median()
    median_price_by_category.plot.bar()
    plt.xlabel(feature)
    plt.ylabel("SalePrice")
    plt.title(feature)
    plt.show()

In [None]:
## FEATURE ENGINEERING

In [None]:
# FEATURE ENGINEERING FOR TRAIN DATA

In [None]:
# To prevent data leakage, train-test split needs to be done. In our case we already have the data split into training and testing data
# code for train-test split
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(dataset,dataset["SalePrice"],test_size=0.1,random_state=0)

In [None]:
# Missing Values

# Missing Categorical Values

In [None]:
# Categorical (object-dtype) columns containing at least one missing value.
# The threshold is `> 0` so that a column with a single NaN is handled as well.
features_nan = [feature for feature in dataset.columns if dataset[feature].isnull().sum() > 0 and dataset[feature].dtypes == "object"]

# `.isnull().mean()` is a fraction; scale it by 100 to match the "%" in the message.
for feature in features_nan:
    print("{}: {}% missing values".format(feature, np.round(dataset[feature].isnull().mean() * 100, 4)))

In [None]:
# Replacing NAN values with new labels

In [None]:
def replace_cat_features(dataset, features_nan):
    """Return a copy of `dataset` with NaNs in the given columns set to "Missing".

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to read from; it is not modified.
    features_nan : list
        Names of the columns whose missing values are replaced.

    Returns
    -------
    pandas.DataFrame
        A new frame; columns not listed in `features_nan` are left as they are.
    """
    filled = dataset.copy()
    replacement_columns = filled[features_nan].fillna("Missing")
    filled[features_nan] = replacement_columns
    return filled

# Rebind `dataset` to the copy in which the listed categorical NaNs read "Missing".
dataset = replace_cat_features(dataset,features_nan)
# Remaining NaN count per listed column.
dataset[features_nan].isnull().sum()

In [None]:
# Spot-check the frame after the categorical NaNs were replaced.
dataset.head()

In [None]:
# Missing Numerical Values

In [None]:
# Non-object columns containing at least one missing value.
# The threshold is `> 0` so that a column with a single NaN is imputed as well.
numerical_with_nan = [feature for feature in dataset.columns if dataset[feature].isnull().sum() > 0 and dataset[feature].dtypes != "object"]

# `.isnull().mean()` is a fraction; scale it by 100 to match the "%" in the message.
for feature in numerical_with_nan:
    print("{}: {}% missing values".format(feature, np.round(dataset[feature].isnull().mean() * 100, 4)))

In [None]:
# Replacing NULL numerical values. As outliers exist, replacing NAN values with median

In [None]:
# Median-impute each numerical column that has missing values and keep a 0/1
# "<feature>nan" column recording which rows were originally missing.
for feature in numerical_with_nan:
    median_value = dataset[feature].median()

    # The flag must be built before the NaNs are filled.
    dataset[feature + "nan"] = np.where(dataset[feature].isnull(), 1, 0)
    # Assign the filled column back. `dataset[feature].fillna(..., inplace=True)`
    # acts on an intermediate Series, which pandas deprecates and which leaves
    # `dataset` unchanged when Copy-on-Write is active.
    dataset[feature] = dataset[feature].fillna(median_value)

# Remaining NaN count per imputed column.
dataset[numerical_with_nan].isnull().sum()

In [None]:
# Inspect the first 50 rows, including the newly added "<feature>nan" flag columns.
dataset.head(50)

In [None]:
# Temporal variables (date-time variables)
# Replace each year column with the number of years between it and the sale:
# YrSold minus the column.
# NOTE: the columns are overwritten, so running this cell a second time would
# subtract the already-converted values from YrSold again.
year_sold = dataset["YrSold"]
for feature in ("YearBuilt", "YearRemodAdd", "GarageYrBlt"):
    dataset[feature] = year_sold - dataset[feature]

dataset.head()

In [None]:
# Check the three converted year columns.
dataset[["YearBuilt","YearRemodAdd","GarageYrBlt"]].head()

In [None]:
# Numerical variables are skewed, so we apply a log transformation to them

In [None]:
import numpy as np

# Columns replaced by their natural logarithm (the target SalePrice included).
num_features = ["LotFrontage", "LotArea", "1stFlrSF", "GrLivArea", "SalePrice"]

# NOTE: the columns are overwritten, so running this cell twice logs them twice.
for feature in num_features:
    logged_column = np.log(dataset[feature])
    dataset[feature] = logged_column

dataset.head()

In [None]:
# Handling Rare Categorical Features
# Rare categories are those that account for at most 1% of the total observations; they are grouped under a single "Rare_var" label.

In [None]:
# Recompute the list of object-dtype (categorical) columns on the current frame.
categorical_features = [
    column
    for column, column_dtype in dataset.dtypes.items()
    if column_dtype == "object"
]
categorical_features

In [None]:
# Collapse infrequent categories into the single label "Rare_var".
for feature in categorical_features:
    # Share of all rows held by each category.
    category_share = dataset.groupby(feature)["SalePrice"].count() / len(dataset)
    # Categories covering more than 1% of the rows keep their own label.
    frequent_labels = category_share[category_share > 0.01].index
    keep_label = dataset[feature].isin(frequent_labels)
    dataset[feature] = np.where(keep_label, dataset[feature], "Rare_var")

In [None]:
# Inspect the first 100 rows after infrequent categories were relabelled "Rare_var".
dataset.head(100)

In [None]:
# Replace each category with an integer rank: categories are ordered by their
# mean SalePrice, lowest first, and numbered from 0.
for feature in categorical_features:
    mean_price_by_label = dataset.groupby(feature)["SalePrice"].mean()
    ranked_labels = mean_price_by_label.sort_values().index
    label_to_rank = {label: rank for rank, label in enumerate(ranked_labels)}
    dataset[feature] = dataset[feature].map(label_to_rank)

In [None]:
# Check that the categorical columns now hold integer ranks.
dataset.head(10)

In [None]:
## FEATURE SCALING FOR TRAIN DATA

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Every column except the identifier and the target gets scaled.
not_scaled = ["Id", "SalePrice"]
feature_scale = [column for column in dataset.columns if column not in not_scaled]

# Fit the scaler on the selected columns of the training frame.
scaler = MinMaxScaler()
scaler.fit(dataset[feature_scale])

In [None]:
# Display the scaled feature values (the result is not stored).
scaler.transform(dataset[feature_scale])

In [None]:
# Scale the training features and put the untouched Id and SalePrice columns in front.
id_and_target = dataset[['Id', 'SalePrice']].reset_index(drop=True)
scaled_features = pd.DataFrame(
    scaler.transform(dataset[feature_scale]),
    columns=feature_scale,
)
train_data = pd.concat([id_and_target, scaled_features], axis=1)

In [None]:
# Preview the assembled training frame (Id, SalePrice, scaled features).
train_data.head()

In [None]:
# train_data.to_csv('X_train.csv',index=False)

In [None]:
# FEATURE ENGINEERING FOR TEST DATA

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Show every column when a wide DataFrame is displayed.
# Call the documented top-level `pd.set_option` instead of going through `pd.pandas`.
pd.set_option('display.max_columns', None)

# Load the test CSV. NOTE: this rebinds `dataset`, so the processed training
# frame is only reachable through `train_data` from here on.
dataset = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
)
print(dataset.shape)

In [None]:
# Categorical (object-dtype) columns of the test frame containing at least one
# missing value. The threshold is `> 0` so that a column with a single NaN is
# handled as well.
features_nan = [feature for feature in dataset.columns if dataset[feature].isnull().sum() > 0 and dataset[feature].dtypes == "object"]

# `.isnull().mean()` is a fraction; scale it by 100 to match the "%" in the message.
for feature in features_nan:
    print("{}: {}% missing values".format(feature, np.round(dataset[feature].isnull().mean() * 100, 4)))

In [None]:
def replace_cat_features(dataset, features_nan):
    """Return a copy of `dataset` with NaNs in the given columns set to "Missing".

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to read from; it is not modified.
    features_nan : list
        Names of the columns whose missing values are replaced.

    Returns
    -------
    pandas.DataFrame
        A new frame; columns not listed in `features_nan` are left as they are.
    """
    filled = dataset.copy()
    replacement_columns = filled[features_nan].fillna("Missing")
    filled[features_nan] = replacement_columns
    return filled

# Rebind `dataset` to the copy in which the listed categorical NaNs read "Missing".
dataset = replace_cat_features(dataset,features_nan)
# Remaining NaN count per listed column.
dataset[features_nan].isnull().sum()

In [None]:
# Non-object columns of the test frame containing at least one missing value.
# The threshold is `> 0`: with `> 1`, a column holding exactly one NaN would
# be left out of the imputation step that consumes this list.
numerical_with_nan = [feature for feature in dataset.columns if dataset[feature].isnull().sum() > 0 and dataset[feature].dtypes != "object"]

# `.isnull().mean()` is a fraction; scale it by 100 to match the "%" in the message.
for feature in numerical_with_nan:
    print("{}: {}% missing values".format(feature, np.round(dataset[feature].isnull().mean() * 100, 4)))

In [None]:
# Median-impute each numerical column that has missing values and keep a 0/1
# "<feature>nan" column recording which rows were originally missing.
# NOTE(review): the medians are computed on the test frame itself, and the set
# of "<feature>nan" columns depends on which test columns contain NaNs, so both
# can differ from what was produced for the training frame.
for feature in numerical_with_nan:
    median_value = dataset[feature].median()

    # The flag must be built before the NaNs are filled.
    dataset[feature + "nan"] = np.where(dataset[feature].isnull(), 1, 0)
    # Assign the filled column back. `dataset[feature].fillna(..., inplace=True)`
    # acts on an intermediate Series, which pandas deprecates and which leaves
    # `dataset` unchanged when Copy-on-Write is active.
    dataset[feature] = dataset[feature].fillna(median_value)

# Remaining NaN count per imputed column.
dataset[numerical_with_nan].isnull().sum()

In [None]:
# Replace each year column with the number of years between it and the sale:
# YrSold minus the column.
# NOTE: the columns are overwritten, so running this cell a second time would
# subtract the already-converted values from YrSold again.
year_sold = dataset["YrSold"]
for feature in ("YearBuilt", "YearRemodAdd", "GarageYrBlt"):
    dataset[feature] = year_sold - dataset[feature]

dataset.head()

In [None]:
import numpy as np

# Columns replaced by their natural logarithm (the test frame has no SalePrice).
num_features = ["LotFrontage", "LotArea", "1stFlrSF", "GrLivArea"]

# NOTE: the columns are overwritten, so running this cell twice logs them twice.
for feature in num_features:
    logged_column = np.log(dataset[feature])
    dataset[feature] = logged_column

dataset.head()

In [None]:
# Object-dtype (categorical) columns of the test frame.
categorical_features = [
    column
    for column, column_dtype in dataset.dtypes.items()
    if column_dtype == "object"
]
categorical_features

In [None]:
# Collapse infrequent categories into the single label "Rare_var".
# NOTE(review): frequencies are measured on the test frame, so the categories
# treated as rare here need not be the ones treated as rare in training.
for feature in categorical_features:
    # Share of all rows held by each category (rows counted through the Id column).
    category_share = dataset.groupby(feature)["Id"].count() / len(dataset)
    # Categories covering more than 1% of the rows keep their own label.
    frequent_labels = category_share[category_share > 0.01].index
    keep_label = dataset[feature].isin(frequent_labels)
    dataset[feature] = np.where(keep_label, dataset[feature], "Rare_var")

In [None]:
# Inspect the test frame after infrequent categories were relabelled "Rare_var".
dataset.head(10)

In [None]:
# Replace each category with an integer rank, numbered from 0.
# NOTE(review): the training frame ranks categories by mean SalePrice, whereas
# this cell ranks them by mean Id. The integer given to a category here is
# therefore unrelated to the one it received during training; reusing the
# training mapping would keep the two encodings consistent.
for feature in categorical_features:
    mean_id_by_label = dataset.groupby(feature)["Id"].mean()
    ranked_labels = mean_id_by_label.sort_values().index
    label_to_rank = {label: rank for rank, label in enumerate(ranked_labels)}
    dataset[feature] = dataset[feature].map(label_to_rank)

In [None]:
# Check that the categorical columns of the test frame now hold integer ranks.
dataset.head(10)

In [None]:
# FEATURE SCALING FOR TEST DATA

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Every column except the identifier gets scaled.
not_scaled = ["Id"]
feature_scale = [column for column in dataset.columns if column not in not_scaled]

# NOTE(review): a new MinMaxScaler is fitted on the test frame and `scaler` is
# rebound to it, so the test features are scaled with the test frame's own
# min/max rather than with the ranges learned from the training frame.
scaler = MinMaxScaler()
scaler.fit(dataset[feature_scale])

In [None]:
# Display the scaled test feature values (the result is not stored).
scaler.transform(dataset[feature_scale])

In [None]:
# Scale the test features and put the untouched Id column in front.
id_column = dataset[['Id']].reset_index(drop=True)
scaled_features = pd.DataFrame(
    scaler.transform(dataset[feature_scale]),
    columns=feature_scale,
)
test_data = pd.concat([id_column, scaled_features], axis=1)

In [None]:
# Preview the assembled test frame (Id followed by the scaled features).
test_data.head()

In [None]:
# test_data.to_csv('X_test.csv',index=False)

In [None]:
# Look at the processed training frame once more before feature selection.
train_data.head()

In [None]:
# Look at the processed test frame once more before feature selection.
test_data.head()

In [None]:
## FEATURE SELECTION

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

## for feature selection

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Show all the columns when a dataframe is displayed.
# Call the documented top-level `pd.set_option` instead of going through `pd.pandas`.
pd.set_option('display.max_columns', None)

In [None]:
# Point `dataset` at the processed training frame (same object, not a copy).
dataset=train_data

In [None]:
# Confirm that `dataset` now shows the processed training frame.
dataset.head()

In [None]:
## Capture the dependent feature
# The double brackets keep y_train as a one-column DataFrame rather than a Series.
y_train=dataset[['SalePrice']]

In [None]:
## Feature matrix: everything except the identifier and the dependent feature
non_feature_columns = ['Id', 'SalePrice']
X_train = dataset.drop(columns=non_feature_columns)

In [None]:
### Apply Feature Selection
# A Lasso regression is used as the selection model; alpha is its penalty
# strength. The bigger the alpha, the fewer features are selected.
#
# SelectFromModel wraps the Lasso and keeps the features whose coefficients
# are non-zero.

# The random state is fixed so that repeated runs give the same selection.
lasso_estimator = Lasso(alpha=0.005, random_state=0)
feature_sel_model = SelectFromModel(lasso_estimator)
feature_sel_model.fit(X_train, y_train)

In [None]:
# Display the selector's support for the columns of X_train.
feature_sel_model.get_support()

In [None]:
# Collect the names of the selected features and print a short summary.

# Columns of X_train picked out by the selector's support mask.
support_mask = feature_sel_model.get_support()
selected_feat = X_train.columns[support_mask]

# Number of Lasso coefficients that are exactly zero.
zero_coefficient_count = np.sum(feature_sel_model.estimator_.coef_ == 0)

print('total features: {}'.format(X_train.shape[1]))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(zero_coefficient_count))

In [None]:
# Display the names of the selected features.
selected_feat

In [None]:
# Restrict the training matrix to the selected columns (rebinds X_train).
X_train=X_train[selected_feat]

In [None]:
# Display the reduced training matrix.
X_train

In [None]:
import xgboost
# NOTE: despite the variable name `classifier`, this is an XGBRegressor
# created with its default parameters.
classifier=xgboost.XGBRegressor()
# Fit on the selected training features against y_train.
classifier.fit(X_train,y_train)

In [None]:
# Take the same selected columns from the processed test frame.
X_test=test_data[selected_feat]
X_test

In [None]:
# Predictions on the rows the model was fitted on.
y_pred_train=classifier.predict(X_train)

In [None]:
# Display the training-set predictions.
y_pred_train

In [None]:
# Display the training targets for comparison with the predictions.
y_train

In [None]:
# Predictions for the test rows.
# NOTE(review): SalePrice was replaced by its natural log before the model was
# fitted, so these values are on the log scale; np.exp would be needed to turn
# them back into prices — confirm whether that happens later.
y_pred_test=classifier.predict(X_test)
y_pred_test