In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

In [None]:
# Load the housing CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('../input/boston-housing-dataset/HousingData.csv')

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Preview the first 10 rows, still under the file's original column headers.
data.head(10)

In [None]:
# Descriptive replacement headers; they are assigned to the frame positionally,
# so the order here must match the column order of the CSV.
col_names = [
    'CrimeRate',
    'ZonedRatio',
    'IndusRatio',
    'AlongRiver',
    'NO2Level',
    'RoomsPerHouse',
    'OldHomeRatio',
    'DisFromCenter',
    'RoadAccessIndex',
    'PropTaxRate',
    'PupilTeacherRatio',
    'Black',
    'LowestPopulation',
    'MedianHomeValue',
]

In [None]:
# Replace the headers positionally with the descriptive names; pandas raises a
# ValueError if the number of names differs from the number of columns.
data.columns = col_names
data.head(10)

In [None]:
# Print each column's dtype and non-null count.
data.info()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Turn literal 'Na' strings into NaN so they are counted as missing values.
# NOTE(review): pd.read_csv already parses tokens such as 'NA' and 'NaN' as
# missing by default, so this only has an effect if the file contains the exact
# string 'Na' -- confirm against the raw CSV.
data = data.replace('Na', np.nan)

In [None]:
# Missing-value count per column, worst-affected columns first.
data.isna().sum().sort_values(ascending=False)

In [None]:
# Fill every remaining NaN with the mean of its own column.
# NOTE(review): the means are computed over the whole dataset, i.e. before the
# train/test split that best_score performs later in the notebook.
data = data.fillna(data.mean())

In [None]:
# Re-count missing values per column after the mean imputation.
data.isna().sum()

In [None]:
# Summary statistics of the imputed frame.
data.describe()

**Selecting Features and Target**

In [None]:
# Split the frame into the target column and the remaining predictor columns.
target = data['MedianHomeValue']
features = data.drop(columns='MedianHomeValue')
features.shape, target.shape

#### Visualize the feature correlations

In [None]:
from yellowbrick.target import FeatureCorrelation

# Bar chart of each feature's correlation with the target.
# The keyword is `labels`; the original spelling `lables` never reached that
# parameter, so the requested labels were not applied.
vis = FeatureCorrelation(labels=list(features.columns))
vis.fit(features, target)
# `show()` is the current name of the rendering call; `poof()` is its
# deprecated alias.
vis.show()

### Importing Univariate feature selection class and methods

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [None]:
# Score every feature against the target with f_regression and keep the best 5.
select_univariate = SelectKBest(score_func=f_regression, k=5)
select_univariate.fit(features, target)

In [None]:
# Boolean mask aligned with features.columns: True for the columns SelectKBest kept.
feature_mask = select_univariate.get_support()
feature_mask

### 1. Univariate features

In [None]:
# Names of the columns kept by the univariate selector.
features.columns[feature_mask]

In [None]:
# f_regression score of every feature, in the same order as features.columns.
select_univariate.scores_

In [None]:
# Feature names next to their univariate scores, highest score first.
(
    pd.DataFrame({'FeatureName': features.columns,
                  'Score': select_univariate.scores_})
    .sort_values(by='Score', ascending=False)
)

In [None]:
# One-column frame of the univariate picks, used later in the side-by-side
# comparison of all selection methods.
uni_df = pd.DataFrame({'Univariate Method': features.columns[feature_mask]})
uni_df

### 2. Import RFE (Recursive Feature Elimination) and LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
# Estimator instance handed to RFE in the next cell.
linear_regression = LinearRegression()

In [None]:
# Recursive elimination: drop one feature per iteration until 5 remain.
rfe = RFE(
    estimator=linear_regression,
    n_features_to_select=5,
    step=1,
)

In [None]:
# Run the elimination on the full feature matrix and target.
rfe.fit(features,target)

### Selected features, according to RFE Ranking

In [None]:
# Column names RFE kept, taken from the fitted selector's boolean support mask.
rfe_features = features.columns[rfe.get_support()]
rfe_features

#### View rankings of all the features

In [None]:
# Feature names next to their RFE rank, best rank first.
(
    pd.DataFrame({'FeatureName': features.columns, 'Rank': rfe.ranking_})
    .sort_values(by='Rank')
)

In [None]:
# One-column frame of the RFE picks for the later side-by-side comparison.
rfe_df = pd.DataFrame({'RFE Method': rfe_features})
rfe_df

**Backward Selection**

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector

In [None]:
# Backward sequential selection: start from all features and remove them one by
# one until 5 remain, scoring each candidate set by 4-fold cross-validated
# negative MSE.
feature_selector = SequentialFeatureSelector(
    estimator=LinearRegression(),
    k_features=5,
    forward=False,
    scoring='neg_mean_squared_error',
    cv=4,
)

In [None]:
# Run the backward search on the full feature matrix and target.
feature_filtered = feature_selector.fit(features, target)

### Feature names selected by backward selection

In [None]:
# Column names kept by the backward search, as a plain list.
backward_features = list(feature_filtered.k_feature_names_)
backward_features

In [None]:
# One-column frame of the backward-selection picks for the later comparison.
back_df = pd.DataFrame({'Backward Method': backward_features})
back_df

### Forward feature selection

In [None]:
# Forward sequential selection: start from no features and add them one by one
# until 5 are chosen, scoring each candidate set by 4-fold cross-validated
# negative MSE. This rebinds `feature_selector`, which held the backward selector.
feature_selector = SequentialFeatureSelector(
    estimator=LinearRegression(),
    k_features=5,
    forward=True,
    scoring='neg_mean_squared_error',
    cv=4,
)

In [None]:
# Run the forward search. This rebinds `feature_filtered`, which previously held
# the backward result, so `backward_features` must be captured before this cell runs.
feature_filtered = feature_selector.fit(features, target)

In [None]:
# Column names chosen by the forward search, as a plain list.
forward_features = list(feature_filtered.k_feature_names_)
forward_features

In [None]:
# One-column frame of the forward-selection picks for the later comparison.
forw_df = pd.DataFrame({'Forward Method': forward_features})
forw_df

## Regularization technique for feature selection 

In [None]:
from sklearn.linear_model import  Lasso

In [None]:
# Fit an L1-regularised linear model on all features; its coefficients are used
# below to rank the features.
# NOTE(review): the features are not standardised before fitting, so coefficient
# magnitudes depend on each column's units -- confirm this is intended before
# comparing coefficients across features.
lasso = Lasso(alpha=1.0)
lasso.fit(features, target)

In [None]:
# Feature names next to their fitted Lasso coefficients, largest signed value first.
lasso_coef = (
    pd.DataFrame({'Feature': features.columns, 'LassoCoef': lasso.coef_})
    .sort_values(by='LassoCoef', ascending=False)
)
lasso_coef

In [None]:
# Keep the 5 features with the largest coefficient magnitude. A strongly negative
# weight matters as much as a strongly positive one, so ranking on the signed
# value (as before) would favour zeroed or small positive coefficients over
# influential negative ones.
lasso_df = lasso_coef.loc[
    lasso_coef['LassoCoef'].abs().sort_values(ascending=False).index
].head(5)
lasso_df = pd.DataFrame({'Lasso Method': lasso_df['Feature'].values})
lasso_df

In [None]:
# Put the five per-method selections side by side, one column per method.
comp_selected_col_df = [uni_df, rfe_df, back_df, forw_df, lasso_df]
final_df = pd.concat(comp_selected_col_df, axis='columns')
final_df

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

### Define a function that reports the training and test scores

In [None]:
result = []  # retained from the original cell; nothing in this section reads or writes it


def best_score(name, X, y, test_size=0.2, random_state=42):
    """Fit a linear regression on a train split and print its train and test R^2.

    Parameters
    ----------
    name : str
        Label printed before the scores.
    X : DataFrame or array-like
        Feature matrix restricted to the columns being evaluated.
    y : Series or array-like
        Target values aligned with ``X``.
    test_size : float, default 0.2
        Fraction of rows held out for the test score.
    random_state : int or None, default 42
        Seed for the split. With a fixed seed every feature subset is scored on
        the same train/test rows, so the printed scores are reproducible and
        comparable between methods. Pass ``None`` for an unseeded split.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # `normalize=True` is intentionally not passed: the argument was removed from
    # LinearRegression in scikit-learn 1.2 (TypeError there), and rescaling the
    # inputs does not change the predictions of an ordinary least-squares fit.
    linear_model = LinearRegression().fit(X_train, y_train)
    print(name)
    print('Training score:', linear_model.score(X_train, y_train))
    y_pred = linear_model.predict(X_test)
    print('r2_score:', r2_score(y_test, y_pred))

### Univariate 

In [None]:
# Score a linear model that uses only the univariate-selected columns.
best_score(
    name='Univariate',
    X=features[final_df['Univariate Method'].values],
    y=target,
)

### Recursive

In [None]:
# Score a linear model that uses only the RFE-selected columns.
best_score(
    name='Recursive',
    X=features[final_df['RFE Method'].values],
    y=target,
)

### Backward

In [None]:
# Score a linear model that uses only the backward-selected columns.
best_score(
    name='Backward',
    X=features[final_df['Backward Method'].values],
    y=target,
)

### Forward

In [None]:
# Score a linear model that uses only the forward-selected columns.
best_score(
    name='Forward',
    X=features[final_df['Forward Method'].values],
    y=target,
)

### Lasso

In [None]:
# Score a linear model that uses only the Lasso-selected columns.
best_score(
    name='Lasso',
    X=features[final_df['Lasso Method'].values],
    y=target,
)