# Feature selection 

# 1. Property data feature selection 

Selecting the features on the property that has signal and better predictor for our model. 

## Import Package 

In [1]:
import os
import zipfile
import requests
import pandas as pd

# Packages for feature selection and modeling 
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

## Fetch Data 

In [2]:
FM_1 =pd.read_csv("df_4.csv")
FM_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11361 entries, 0 to 11360
Columns: 118 entries, Unnamed: 0 to ListDate_all
dtypes: float64(64), int64(52), object(2)
memory usage: 10.2+ MB


## Load data to data frame

In [3]:
FM_1 = pd.DataFrame(FM_1)
#Dropping some features which are not predictive
FM_1=FM_1.drop(FM_1.columns[[0]], axis=1)
FM_1=FM_1.drop('CloseDate_dt_year', axis=1)
FM_1=FM_1.drop('CloseDate_dt_month', axis=1)
FM_1=FM_1.drop('CloseDate_dt_day', axis=1)


In [4]:
features= FM_1.drop('DOMP', axis=1)
labels   = FM_1['DOMP']

In [5]:
list(features)

['ML#',
 'ListPrice2',
 'ClosePrice2',
 'Bedrooms',
 'BathsFull',
 'BathsHalf',
 'Levels',
 'Fireplaces',
 'BasementY/N',
 'Acres',
 'YearBuilt',
 'TotalTaxes2',
 'TaxTotalLivingArea',
 'Zip',
 'SaleCount',
 'MedianValuePerSqft',
 'PctOfHomesDecreasingInValues',
 'PctOfHomesIncreasingInValues',
 'Turnover',
 'Zhvi',
 'PriceIndex',
 'FreddieMac15yr',
 'FreddieMac5yrARM',
 'mimi',
 'mimiStatus',
 'PropertyCrimes',
 'ViolentCrimes',
 'Pct16andOverEmployed_2010_14',
 'AvgFamilyIncAdj_2010_14',
 'Pct25andOverWoutHS_2010_14',
 'PctFamiliesOwnChildrenFH_2010_14',
 'PctForeignBorn_2010_14',
 'PctHshldCar_2010_14',
 'PctHshldPhone_2010_14',
 'PctOwnerOccupiedHsgUnits_2010_14',
 'PctPoorChildren_2010_14',
 'PctPoorElderly_2010_14',
 'PctPoorPersons_2010_14',
 'PctUnemployed_2010_14',
 'PctVacantHsgUnitsForRent_2010_14',
 'PropertyLatitude',
 'PropertyLongitude',
 'ES_AvgStandardScore',
 'ES_IsCharter',
 'ES_IsMagnet',
 'ES_IsTitleI',
 'ES_IsTitleI.1',
 'ES_IsVirtual',
 'ES_NumFTTeachers',
 'ES_N

In [6]:
model = Lasso()
model.fit(features, labels)
print(list(zip(features, model.coef_.tolist())))

ValueError: could not convert string to float: '2012-03-27'

## Ridge Regression (L2 Regularization)

In [7]:
model = Ridge()
model.fit(features, labels)
print(list(zip(features, model.coef_.tolist())))

ValueError: could not convert string to float: '2012-03-27'

## ElasticNet

In [8]:
model = ElasticNet(l1_ratio=0.10)
model.fit(features, labels)
print(list(zip(features, model.coef_.tolist())))

ValueError: could not convert string to float: '2012-03-27'

## Transformer methods

### SelectFromModel()

In [39]:
# Lasso 
model = Lasso()
sfm = SelectFromModel(model)
sfm.fit(features, labels)
print(list(features[sfm.get_support(indices=True)]))

['ListPrice2', 'ClosePrice2', 'Bedrooms', 'BathsFull', 'Levels', 'YearBuilt', 'TotalTaxes2', 'TaxTotalLivingArea', 'Zip', 'SaleCount', 'MedianValuePerSqft', 'PctOfHomesDecreasingInValues', 'PctOfHomesIncreasingInValues', 'PriceIndex', 'mimi', 'PropertyCrimes', 'ViolentCrimes', 'AvgFamilyIncAdj_2010_14', 'PctFamiliesOwnChildrenFH_2010_14', 'PctForeignBorn_2010_14', 'PctOwnerOccupiedHsgUnits_2010_14', 'PctPoorPersons_2010_14', 'PctUnemployed_2010_14', 'count_metro_bus_km', 'count_metro_station_km', 'count_public_school_elem_km', 'distance_cap_gain_school_km', 'distance_public_school_elem_specialized_km', 'distance_public_school_mid_km', 'distance_public_school_ye_km', 'ListDate_dt_year', 'ListDate_dt_month', 'ListDate_dt_day']




In [41]:
# Ridge
model = Ridge()
sfm = SelectFromModel(model)
sfm.fit(features, labels)
print(list(features[sfm.get_support(indices=True)]))

['BasementY/N', 'Acres', 'SaleCount', 'FreddieMac15yr', 'FreddieMac5yrARM', 'PropertyLatitude', 'PropertyLongitude', 'count_public_school_arts_center_km', 'count_public_school_high_specialized_km', 'distance_public_school_elem_specialized_km', 'distance_public_school_special_ed_km', 'ListDate_dt_year']


In [42]:
# Elastic 

model = ElasticNet()
sfm = SelectFromModel(model)
sfm.fit(features, labels)
print(list(features[sfm.get_support(indices=True)]))

['Bedrooms', 'BathsFull', 'Levels', 'Fireplaces', 'BasementY/N', 'SaleCount', 'PctOfHomesDecreasingInValues', 'PriceIndex', 'mimi', 'ViolentCrimes', 'PctPoorPersons_2010_14', 'PctUnemployed_2010_14', 'count_metro_station_km', 'count_public_school_elem_km', 'distance_cap_gain_school_km', 'distance_public_school_elem_specialized_km', 'distance_public_school_mid_km', 'distance_public_school_ye_km', 'ListDate_dt_year', 'ListDate_dt_month']




## Dimensionality Reduction 

### Principal component analysis (PCA)

In [43]:
pca = PCA(n_components=2)
new_features = pca.fit(features).transform(features)
print(new_features)

[[-273675.39908336   40232.73482896]
 [-388863.05111241 -126121.94809521]
 [-323138.35041859 -112002.53462925]
 ..., 
 [-304654.73416208    6656.0861429 ]
 [-255956.01626167   27716.39666583]
 [-153193.62673337   15430.84824632]]


### Linear discriminant analysis (LDA)

In [45]:
lda = LDA(n_components=2)
new_features = lda.fit(features, labels).transform(features)
print(new_features)

[[-0.33563833 -0.01084479]
 [ 0.14932709  1.23048455]
 [ 0.31535581  1.11595512]
 ..., 
 [-0.7226007   0.64076217]
 [-0.85372974  0.25453788]
 [-1.65684798  0.69793743]]
