In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline

from sklearn.linear_model import (LinearRegression, Ridge, Lasso, RandomizedLasso)
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn import metrics

## Intro

I want to mention this first: this kernel is heavily based on Anisotropic's work on feature selection. I definitely recommend checking out his project if you want to learn more about feature selection. My kernel is basically a simpler version.

In [None]:

# Load the house-sales CSV from the input directory into the working frame.
data_path = "../input/kc_house_data.csv"
df = pd.read_csv(data_path)

In [None]:

# Names of the price column plus the numeric attributes kept for inspection.
numeric_column_names = [
    'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
    'floors', 'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
    'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15',
]
numerical_cols = df[numeric_column_names]

In [None]:
# Print each column's dtype and non-null count to spot missing values and mis-typed columns.
df.info()

In [None]:
# Show summary statistics (count, mean, std, quartiles, extremes) for the numeric columns.
df.describe()

In [None]:
# Pairwise correlation heatmap of the numeric columns.
# `date` is still a raw string column at this point, and pandas >= 2.0 no longer
# silently drops non-numeric columns in DataFrame.corr(), so restrict the frame
# to numeric dtypes explicitly (this also works on older pandas versions).
plt.figure(figsize=(16, 6))
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, fmt='.0g', cmap='coolwarm')

In [None]:
# Order the grade categories by their mean lot size (smallest first).
# Only `sqft_lot` is aggregated: averaging every column fails on pandas >= 2.0
# because the frame still holds the non-numeric `date` column at this point.
mean_lot_by_grade = df.groupby('grade')['sqft_lot'].mean()
order = mean_lot_by_grade.sort_values(ascending=True).index.values

sns.barplot(x='grade', y='sqft_lot', data=df, order=order)

In [None]:
# Combined footprint: living area plus lot area.
df['total_sqft'] = df['sqft_living'].add(df['sqft_lot'])

In [None]:
# Parse the sale date strings into proper datetimes so they can be plotted on a time axis.
sale_dates = pd.to_datetime(df['date'])
df['date'] = sale_dates

In [None]:
# Sale price over time on a wide canvas; tick labels are rotated so the dates stay legible.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(16, 4))
sns.lineplot(data=df, x='date', y='price', ax=ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Condition rating as a function of construction year.
fig, ax = plt.subplots(figsize=(16, 4))
sns.lineplot(data=df, x='yr_built', y='condition', ax=ax)

In [None]:
# Scatter plot with a fitted regression line: price against condition rating.
sns.lmplot(data=df, x='condition', y='price')

In [None]:
# Price distribution for each bedroom count.
fig, ax = plt.subplots(figsize=(16, 4))
sns.boxplot(data=df, x='bedrooms', y='price', ax=ax)

In [None]:
# Mean price split by the waterfront flag.
sns.barplot(data=df, x='waterfront', y='price')

In [None]:
# Mean price for each view rating.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=df, x='view', y='price', ax=ax)

In [None]:
# Boolean flag: True when the house has any basement area.
df['basement'] = df['sqft_basement'].gt(0)

In [None]:
# Condition rating by renovation year, using only rows with a positive yr_renovated.
fig, ax = plt.subplots(figsize=(16, 4))
renovated_homes = df[df['yr_renovated'] > 0]
sns.lineplot(data=renovated_homes, x='yr_renovated', y='condition', ax=ax)


In [None]:
# Sale price as a function of construction year.
fig, ax = plt.subplots(figsize=(16, 4))
sns.lineplot(data=df, x='yr_built', y='price', ax=ax)


In [None]:
# Mean price for houses with and without a basement.
sns.barplot(data=df, x='basement', y='price')

In [None]:
# Scatter plot with a fitted regression line: price against living area.
sns.lmplot(data=df, x='sqft_living', y='price')

In [None]:
# Scatter plot with a fitted regression line: price against latitude.
sns.lmplot(data=df, x='lat', y='price')

In [None]:
# Predictor columns fed to every model below (lat/long are not included).
feature_columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
    'floors', 'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
    'zipcode', 'sqft_living15', 'sqft_lot15',
]

# Target is the sale price; col_names labels the rows of the score tables built later.
y = df['price']
x = df[feature_columns]
col_names = x.columns.values

In [None]:
# Ordinary least squares baseline for the feature ranking.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2,
# where passing it raises a TypeError. It is omitted here: OLS coefficients are
# reported in the original feature units either way, so `lr_coef` is the same
# up to floating-point error.
lr = LinearRegression()
lr.fit(x, y)
lr_coef = lr.coef_

In [None]:


def scale_coef(coef, model_name, feature_names=None):
    """Min-max scale the absolute values of a model's per-feature scores.

    Parameters
    ----------
    coef : array-like, one value per feature
        Raw coefficients or importances from a fitted model.
    model_name : str
        Label used as the single column name of the returned frame.
    feature_names : sequence, optional
        Row labels for the returned frame. When omitted, the notebook-level
        ``col_names`` is used, which keeps existing two-argument calls working.

    Returns
    -------
    pandas.DataFrame
        One column named ``model_name``, indexed by feature, holding the
        absolute scores rescaled to the [0, 1] range and rounded to 2 decimals.
    """
    if feature_names is None:
        # Fall back to the global feature list for backward compatibility.
        feature_names = col_names

    # MinMaxScaler works column-wise, so present the scores as one column.
    magnitudes = np.abs(coef).reshape(-1, 1)
    scaled = MinMaxScaler().fit_transform(magnitudes).ravel()

    scores = pd.DataFrame(data=scaled, columns=[model_name], index=feature_names)
    return scores.round(2)

In [None]:
# Scaled score table for the linear-regression coefficients.
l = scale_coef(coef=lr_coef, model_name='lr')

In [None]:
# Display the scaled linear-regression scores returned by scale_coef.
l

In [None]:

# L2-regularised linear model; its coefficients are scaled the same way as the OLS ones.
ridge = Ridge(alpha=7)
ridge.fit(x, y)
ridge_coefficients = ridge.coef_
r = scale_coef(ridge_coefficients, 'ridge')

# L1-regularised linear model with a small penalty.
lasso = Lasso(alpha=.05)
lasso.fit(x, y)
lasso_coefficients = lasso.coef_
ls = scale_coef(lasso_coefficients, 'lasso')

In [None]:
# Fit a random forest and scale its feature importances like the linear coefficients.
# A fixed random_state makes the importances, and the ranking derived from them,
# reproducible across kernel restarts.
random_forest = RandomForestRegressor(n_jobs=-1, n_estimators=50, verbose=3, random_state=42)
random_forest.fit(x, y)
rf = scale_coef(random_forest.feature_importances_, 'random forest')

In [None]:
# Put the per-model score tables side by side, one column per model.
model_tables = [l, r, ls, rf]
fs = pd.concat(model_tables, axis='columns')

In [None]:
# Average the four model scores for each feature.
model_columns = ['lr', 'ridge', 'lasso', 'random forest']
fs['mean'] = sum(fs[name] for name in model_columns) / len(model_columns)

In [None]:
# Keep the averaged score at two decimals, matching the per-model columns.
fs['mean'] = fs['mean'].round(2)

In [None]:
# Feature names ranked from highest to lowest mean score.
ranked_features = fs.sort_values('mean', ascending=False)
order = ranked_features.index.values

In [None]:
# Display the combined table of scaled per-model scores and their mean.
fs

In [None]:
# Horizontal bars of the mean score per feature, highest first.
fig, ax = plt.subplots(figsize=(16, 4))
ranking_table = fs.reset_index()  # expose the feature names as an 'index' column
sns.barplot(data=ranking_table, x='mean', y='index', order=order, ax=ax)

## Thank you!