# Introduction

The aim of this notebook is to explain and apply different feature ranking methods using linear models, support vector machines, ensemble learning and, last but not least, gradient boosting.

The contents of this notebook are as follows:
1. Loading of Dataset : Here dataset will be loaded.
1. Exploratory Data Analysis : Here I will examine the correlation among the different columns and visualise all the columns with different plots, namely distribution plots, count plots and several pie charts.
1. Data Preprocessing: Here I will detect and correct skewness and outliers in the dataset.
1. Feature Selection: Here I will do feature selection with several methods.
1. Feature Ranking Matrix : A matrix of all the features along with their respective model scores, which we can use in our ranking, will be created and plotted.


In [None]:
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
plt.style.use('dark_background')
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm_notebook
import warnings
warnings.simplefilter(action='ignore', category=Warning)

# Loading Dataset

In [None]:
# Read the house sales CSV (kc_house_data.csv) from the relative ../input directory into a DataFrame.
dataset = pd.read_csv('../input/housesalesprediction/kc_house_data.csv')

# Exploratory Data Analysis


In [None]:
# Preview the first rows of the loaded data.
dataset.head()

In [None]:
# (number of rows, number of columns) of the loaded data.
dataset.shape

In [None]:
# Summary statistics, one row per column: the mean is drawn as an in-cell bar,
# the standard deviation and the median ('50%') are shaded with a colour gradient.
(
    dataset.describe().T.style
    .bar(subset=['mean'], color='#606ff2')
    .background_gradient(subset=['std'], cmap='PuBu')
    .background_gradient(subset=['50%'], cmap='PuBu')
)

In [None]:
# Print the per-column summary of the DataFrame.
dataset.info()

In [None]:
# True if at least one cell anywhere in the dataset is null.
dataset.isnull().values.any()

## Finding Correlation among the variables

In [None]:
# Correlate every column except the identifier, the date and the price.
corr_data = dataset.drop(['id','date','price'], axis = 1)
corr_matrix = corr_data.corr()
# Mask the upper triangle (diagonal included) so each pair is annotated only once.
matrix = np.triu(corr_matrix)
plt.figure(figsize=(20, 17))
sns.heatmap(corr_matrix, annot=True,
            linewidth=.8, mask=matrix, cmap="rocket");

## Visualising Numerical Data

In [None]:
# Columns to plot: everything except the identifier, the date and the price.
plot_data = dataset.drop(['id', 'date','price'], axis=1)

## Frequency Plots

In [None]:
# One 20-bin histogram per column of plot_data, laid out on a 5-column grid.
n_cols = 5
# add_subplot requires integer grid dimensions, while np.ceil returns a float.
n_rows = int(np.ceil(len(plot_data.columns) / n_cols))
fig = plt.figure(figsize=(20, 20))
for i in tqdm_notebook(range(len(plot_data.columns)), desc = 'Generating Frequency Plots'):
    # Subplot positions are 1-based, hence i+1.
    ax = fig.add_subplot(n_rows, n_cols, i+1)
    plot_data.iloc[:, i].hist(bins=20, ax=ax)
    ax.set_title(plot_data.columns[i])
# Lay the grid out once, after every axes exists, rather than on each iteration.
fig.tight_layout(pad=3.0)
plt.show();

## Distribution Plots

In [None]:
# Rebuild plot_data, this time keeping the price column (only id and date are dropped).
plot_data = dataset.drop(['id', 'date'], axis=1)

In [None]:
def distributionPlot(dataset):
    """
    Draw one distribution plot (with rug marks) per column of ``dataset``.

    The plots are placed on a 5-column grid inside a single 20x20 inch figure.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are plotted, in column order.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    n_cols = 5
    # add_subplot requires integer grid dimensions, while np.ceil returns a float.
    n_rows = int(np.ceil(len(dataset.columns) / n_cols))
    fig = plt.figure(figsize=(20, 20))
    for i in tqdm_notebook(range(len(dataset.columns)), desc = 'Generating Distribution Plots'):
        # Subplot positions are 1-based, hence i+1.
        ax = fig.add_subplot(n_rows, n_cols, i+1)
        # NOTE(review): distplot is deprecated in recent seaborn releases;
        # confirm the installed version before upgrading.
        sns.distplot(
            dataset.iloc[:, i], color="lightcoral", rug=True, ax=ax)
    # Lay the grid out once, after every axes exists, rather than on each iteration.
    fig.tight_layout(pad=3.0)

In [None]:
# Draw the distribution of every column currently held in plot_data.
distributionPlot(plot_data)

## Pie Charts

In [None]:
def pieChartPlotter(dataset, columnName):
    """
    Plot a pie chart of how often each distinct value occurs in one column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the column to summarise.
    columnName : str
        Name of the column whose value counts are plotted; also used as the title.

    Returns
    -------
    None
        The chart is shown with ``plt.show()``.
    """
    values = dataset[columnName].value_counts()
    # Take the labels from the counts themselves: value_counts() is ordered by
    # frequency, so labels from unique() (order of appearance) would be attached
    # to the wrong wedges.
    labels = list(values.index)
    pie, ax = plt.subplots(figsize=[10, 6])
    patches, texts, autotexts = ax.pie(
        values, labels=labels, autopct='%1.2f%%',
        shadow=True, pctdistance=.5,
        # One constant offset per wedge; multiplying the list by an array would
        # scale the offset by the category values instead.
        explode=[0.06] * len(values)
    )
    plt.legend(patches, labels, loc="best")
    plt.title(columnName, color='white', fontsize=14)
    plt.setp(texts, color='white', fontsize=20)
    # Applies to every percentage label, so no per-index colouring is needed
    # (indexing autotexts[1] would fail for a single-category column).
    plt.setp(autotexts, size=10, color='white')
    plt.axis('equal')
    plt.tight_layout()
    plt.show()

In [None]:
# Pie charts for the 'waterfront' and 'floors' columns.
pieChartPlotter(dataset,'waterfront')
pieChartPlotter(dataset, 'floors')

## Count Plots

In [None]:
def countPlotter(dataset):
    """
    Draw a count plot for every column of ``dataset`` except ``'price'``.

    Plots are placed on a 2-column grid inside a single 20x20 inch figure, and
    the bars of each plot are ordered from most to least frequent value.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose non-price columns are plotted, in column order.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    columns = [column for column in dataset.columns if column != 'price']
    # add_subplot requires integer grid dimensions, while np.ceil returns a float.
    n_rows = int(np.ceil(len(columns) / 2))
    fig = plt.figure(figsize=(20, 20))
    # Number the subplots 1..N over the plotted columns only, so the position
    # is valid wherever 'price' sits in the frame (the raw column index would
    # be 0 for a leading non-price column, which add_subplot rejects).
    for position, column in enumerate(
            tqdm_notebook(columns, desc = 'Generating Count Plots'), start=1):
        ax = fig.add_subplot(n_rows, 2, position)
        # Pass the Series as x= explicitly; the first positional parameter of
        # countplot is not x in every seaborn release.
        sns.countplot(x=dataset[column],
                      order=dataset[column].value_counts().index, ax=ax)
    # Lay the grid out once, after every axes exists, rather than on each iteration.
    fig.tight_layout(pad=3.0)


In [None]:
#Dividing data for plotting
# Dividing data for plotting: drop the identifier, the date, and the area,
# location and year columns, leaving the columns used by the count and group plots.
plot_data = dataset.drop(
    ['id', 'date', 'sqft_living15', 'sqft_lot15', 'lat', 'long', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode'], axis=1)

In [None]:
# Count plot for each remaining non-price column.
countPlotter(plot_data)

## Group Plots

In [None]:
def groupBarPlotter(dataset):
    """
    Draw, for every column except ``'price'``, a horizontal bar plot of the mean price per value.

    Plots are placed on a 2-column grid inside a single 20x20 inch figure.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``'price'`` column plus the columns to group by.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    columns = [column for column in dataset.columns if column != 'price']
    # add_subplot requires integer grid dimensions, while np.ceil returns a float.
    n_rows = int(np.ceil(len(columns) / 2))
    fig = plt.figure(figsize=(20, 20))
    # Number the subplots 1..N over the plotted columns only, so the position
    # is valid wherever 'price' sits in the frame (the raw column index would
    # be 0 for a leading non-price column, which add_subplot rejects).
    for position, column in enumerate(
            tqdm_notebook(columns, desc = 'Generating Group Plots'), start=1):
        groups = dataset.groupby(column)['price'].mean()
        ax = fig.add_subplot(n_rows, 2, position)
        # Draw on this subplot explicitly instead of relying on the current axes.
        groups.plot.barh(ax=ax)
        ax.set_xlabel('price')
    # Lay the grid out once, after every axes exists, rather than on each iteration.
    fig.tight_layout(pad=3.0)

In [None]:
# Mean price per value for each remaining non-price column.
groupBarPlotter(plot_data)

Here we can see different parameters in comparison with price.

# Data Preprocessing

In [None]:
# Feature table: every column except the identifier, the date and the price.
house = dataset.drop(['id', 'date','price'], axis=1)

In [None]:
# Feature matrix as a NumPy array; its columns follow the order of `colnames`.
X = house.values
# Select the target by name rather than by position, so it stays consistent with
# the name-based drop that builds `house`. The double brackets keep the
# two-dimensional (rows, 1) shape that the positional 2:3 slice produced.
y = dataset[['price']].values
colnames = house.columns

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target values.
y

In [None]:
# Dimensions of the feature matrix.
X.shape

In [None]:
# Dimensions of the target array.
y.shape

# Feature Selection 

In [None]:
ranks = {}
def ranking(coefficients, columnNames, order=1):
    """
    Min-max scale a set of per-feature scores and pair them with their column names.

    Parameters
    ----------
    coefficients : array-like
        Either a flat sequence of scores or a 2-D array whose first row holds them.
    columnNames : iterable
        Names to pair with the scores, in the same order.
    order : int, default 1
        Multiplier applied before scaling; -1 reverses the direction of the scores.

    Returns
    -------
    dict
        ``{column name: scaled score rounded to 2 decimals}``.
    """
    scores = np.asarray(coefficients)
    if scores.ndim == 1:
        scores = scores.reshape(1, -1)
    # The scaler works column-wise, so the scores are turned into a single
    # column, scaled, and then flattened back into one row.
    scaled = MinMaxScaler().fit_transform(order * scores.T).T[0]
    return {name: round(score, 2) for name, score in zip(columnNames, scaled)}


In [None]:
# Kept so the name exists at module level; featureRanker builds and returns its
# own dictionary and no longer mutates this one.
ranks = {}
def featureRanker(X, y, ranking, colnames):
    """
    Fit several regressors on ``(X, y)`` and collect a scaled score per feature for each.

    Parameters
    ----------
    X : array-like
        Independent variables, passed to every estimator's ``fit``.
    y : array-like
        Dependent variable, passed to every estimator's ``fit``.
    ranking : callable
        Called as ``ranking(scores, colnames)`` (with ``order=-1`` for the RFE
        ranks) and expected to return a ``{column name: score}`` mapping.
    colnames : sequence
        Feature names, in the column order of ``X``.

    Returns
    -------
    dict
        ``{method name: {column name: score}}`` with one entry per model and an
        additional ``"RFE"`` entry derived from the linear regression.

    Raises
    ------
    ValueError
        If a fitted estimator exposes neither ``coef_`` nor ``feature_importances_``.
    """
    # Name -> estimator. ``normalize=True`` is no longer passed to
    # LinearRegression: the parameter has been removed from scikit-learn, and
    # ordinary least squares reports its coefficients in the original feature
    # units either way.
    models = {
        'lr': LinearRegression(),
        'Ridge': Ridge(alpha= 7),
        'Lasso': Lasso(alpha= .05),
        'Elastic': ElasticNet(alpha= 0.0005, l1_ratio= .9, random_state= 0),
        'RF': RandomForestRegressor(n_jobs= -1,
                                    n_estimators= 100, random_state= 0),
        'GBR': GradientBoostingRegressor(n_estimators= 100, random_state= 0),
        'XGBR': XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                             colsample_bytree=1, max_depth=70, random_state = 0),
        'LGBM': LGBMRegressor(n_jobs= -1,
                              n_estimators= 100, random_state= 0),
    }
    # Local result, so repeated calls start clean instead of accumulating
    # entries in a shared module-level dictionary.
    model_ranks = {}
    for name, estimator in tqdm_notebook(models.items(), desc = 'Training models'):
        estimator.fit(X, y)
        if name == 'lr':
            # Recursive feature elimination down to one feature: ranking_ is 1
            # for the best feature, so order=-1 flips it before scaling.
            rfe = RFE(estimator, n_features_to_select=1)
            rfe.fit(X, y)
            model_ranks["RFE"] = ranking(list(map(float, rfe.ranking_)), colnames, order=-1)
        if hasattr(estimator, 'coef_'):
            # Linear models: the magnitude of each coefficient is the score.
            model_ranks[name] = ranking(np.abs(estimator.coef_), colnames)
        elif hasattr(estimator, 'feature_importances_'):
            # Tree ensembles expose their importances directly.
            model_ranks[name] = ranking(estimator.feature_importances_, colnames)
        else:
            # The previous fallback read ``svr.dual_coef_`` from an undefined
            # name; dual coefficients are also per support vector rather than
            # per feature, so they cannot be paired with ``colnames``.
            raise ValueError(
                "Estimator %r exposes neither coef_ nor feature_importances_; "
                "cannot derive per-feature scores." % name)
    return model_ranks


In [None]:
# Train every model and collect the scaled per-feature scores for each method.
ranks = featureRanker(X, y, ranking, colnames)

# Creating the Feature Ranking Matrix

We combine the scores from the various methods above and output them in matrix form:

In [None]:
# Methods to average over. "Mean" is excluded so that re-running this cell does
# not fold a previously stored average into the new one.
methods = sorted(method for method in ranks if method != "Mean")

# Mean score of each feature across the individual methods, rounded to 2 decimals.
r = {}
for name in colnames:
    r[name] = round(np.mean([ranks[method][name]
                             for method in methods]), 2)

ranks["Mean"] = r
methods.append("Mean")

# Tab-separated matrix: one header row of method names, then one row per feature.
print("\t%s" % "\t".join(methods))
for name in colnames:
    print("%s\t%s" % (name, "\t".join(map(str,
                                          [ranks[method][name] for method in methods]))))

Now, with the matrix above, the numbers and layout do not seem very easy or pleasant to the eye. So let's just visualise them.

In [None]:
# Put the mean scores into a pandas DataFrame (one row per feature) and sort it
# so the highest mean ranking comes first.
meanplot = (
    pd.DataFrame(list(r.items()), columns=['Feature', 'Mean Ranking'])
    .sort_values('Mean Ranking', ascending=False)
)

In [None]:
# Horizontal bar chart of the mean ranking per feature. catplot is the current
# name of the removed factorplot, and `height` replaces its `size` argument.
sns.catplot(x="Mean Ranking", y="Feature", data=meanplot,
            kind="bar", height=4, aspect=1.9, palette='coolwarm');

As you can see, the top 3 features are: 'lat', 'waterfront', 'grade'
and the bottom 3 features are: 'yr_renovated', 'sqft_lot15', 'sqft_lot'

# I hope you like this notebook. I will be more than happy to hear your feedback, and please consider upvoting if you like my work.