In [1]:
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
from sklearn.linear_model import RandomizedLasso
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from time import time
from ggplot import *

In [2]:
# Load the raw dataset from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer="4031BostonData.csv")

In [3]:
# Show the column names of the loaded frame as a NumPy array.
df.columns.values

array(['TOWN', 'MEDV', 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
       'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT'], dtype=object)

In [5]:
# Pull each column used for modelling out of the raw frame as its own Series.
# `.loc` is the label-based indexer; it replaces `.ix`, which pandas has
# deprecated and later removed.
medv = df.loc[:, 'MEDV']
crim = df.loc[:, 'CRIM']
zn = df.loc[:, 'ZN']
ind = df.loc[:, 'INDUS']
nox = df.loc[:, 'NOX']
rm = df.loc[:, 'RM']
age = df.loc[:, 'AGE']
dis = df.loc[:, 'DIS']
tax = df.loc[:, 'TAX']
pt = df.loc[:, 'PTRATIO']
lstat = df.loc[:, 'LSTAT']

# CHAS is kept apart from the columns above; the preview rows show it as 0.
chas = df.loc[:, 'CHAS']

In [6]:
# One-hot encode RAD into indicator columns named with a `rad` prefix.
# `.loc` replaces the deprecated `.ix` indexer.
# NOTE(review): `rad` is not concatenated into `clean_data` in the cell that
# builds it -- confirm whether RAD is intentionally left out of the features.
rad = pd.get_dummies(df.loc[:, 'RAD'], prefix='rad')

In [8]:
# The response variable is the MEDV series extracted earlier.
target = medv

# Assemble the predictor frame by placing the individual Series side by side
# (axis=1), in the order listed here.
clean_data = pd.concat(
    [crim, zn, ind, nox, rm, age, dis, tax, pt, lstat, chas],
    axis=1
)

In [9]:
# Preview only the first rows instead of rendering the whole frame.
clean_data.head(10)

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,TAX,PTRATIO,LSTAT,CHAS
0,0.00632,18.0,2.31,0.538,6.575,65.2,4.0900,296,15.3,4.98,0
1,0.02731,0.0,7.07,0.469,6.421,78.9,4.9671,242,17.8,9.14,0
2,0.02729,0.0,7.07,0.469,7.185,61.1,4.9671,242,17.8,4.03,0
3,0.03237,0.0,2.18,0.458,6.998,45.8,6.0622,222,18.7,2.94,0
4,0.06905,0.0,2.18,0.458,7.147,54.2,6.0622,222,18.7,5.33,0
5,0.02985,0.0,2.18,0.458,6.430,58.7,6.0622,222,18.7,5.21,0
6,0.08829,12.5,7.87,0.524,6.012,66.6,5.5605,311,15.2,12.43,0
7,0.14455,12.5,7.87,0.524,6.172,96.1,5.9505,311,15.2,19.15,0
8,0.21124,12.5,7.87,0.524,5.631,100.0,6.0821,311,15.2,29.93,0
9,0.17004,12.5,7.87,0.524,6.004,85.9,6.5921,311,15.2,17.10,0


In [10]:
# Feature matrix, response vector and feature names used by the
# feature-selection cells that follow.
# `.loc[:, :]` selects every row and column; it replaces the deprecated `.ix`.
X = clean_data.loc[:, :]
Y = target
names = clean_data.columns.values

In [11]:
# Stability selection with RandomizedLasso: after fitting, `scores_` holds one
# selection score per feature, in the same order as `names`.
# The estimator is stochastic, so a fixed `random_state` is passed to make the
# scores repeatable on a fresh run; the exact numbers may differ slightly from
# output recorded before the seed was introduced.
rlasso = RandomizedLasso(alpha=0.01, random_state=0)
rlasso.fit(X, Y)
print("Features sorted by their score: (Stability Selection)")
# Pair each rounded score with its feature name; highest score first.
stable = sorted(
    ((round(score, 4), name) for score, name in zip(rlasso.scores_, names)),
    reverse=True
)
print(stable)

Features sorted by their score: (Stability Selection)
[(1.0, 'RM'), (1.0, 'PTRATIO'), (1.0, 'LSTAT'), (1.0, 'DIS'), (0.975, 'CHAS'), (0.885, 'NOX'), (0.84, 'CRIM'), (0.555, 'TAX'), (0.475, 'ZN'), (0.345, 'INDUS'), (0.04, 'AGE')]


In [12]:
# Recursive feature elimination around a plain linear regression.
# With n_features_to_select=1 the elimination runs down to a single feature,
# so `ranking_` assigns every feature a distinct rank (1 = kept longest).
lr = LinearRegression()
# The fitted estimator gets its own name so that `rfe` is not first an
# estimator and then a list within the same cell.
selector = RFE(lr, n_features_to_select=1)
selector.fit(X, Y)
print("Features sorted by their rank: (RFE)")
# Ranks are whole numbers, so they are kept as ints rather than rounded to
# floats; `rfe` is still the list of (rank, name) pairs sorted by rank.
rfe = sorted((int(rank), name) for rank, name in zip(selector.ranking_, names))
print(rfe)

Features sorted by their rank: (RFE)
[(1.0, 'NOX'), (2.0, 'RM'), (3.0, 'CHAS'), (4.0, 'PTRATIO'), (5.0, 'DIS'), (6.0, 'LSTAT'), (7.0, 'CRIM'), (8.0, 'INDUS'), (9.0, 'ZN'), (10.0, 'AGE'), (11.0, 'TAX')]


In [19]:
# Index label of the last row that belongs to the training set.
# `.loc` slices by label and includes both endpoints, so the training set runs
# up to and including this label and the test set starts at the next one.
# Assumes the default integer index produced by read_csv -- TODO confirm.
TRAIN_END_LABEL = 500

#training set
train_data = clean_data.loc[:TRAIN_END_LABEL, :]
train_nums = target.loc[:TRAIN_END_LABEL]

#testing set
test_data = clean_data.loc[TRAIN_END_LABEL + 1:, :]
test_target = target.loc[TRAIN_END_LABEL + 1:]

# Ordinary least squares fit on the training rows; the coefficients are
# printed in the column order of `train_data`.
llr = LinearRegression()
llr.fit(train_data, train_nums)
print(llr.coef_)

[ -8.07277840e-02   4.36282057e-02  -5.64346741e-02  -1.55644595e+01
   3.94061078e+00   7.00392225e-04  -1.54167995e+00  -7.48228288e-04
  -7.59627604e-01  -5.58476886e-01   3.12772566e+00]


In [28]:
# Show the feature columns in the order they were passed to `llr.fit`.
train_data.columns

Index([u'CRIM', u'ZN', u'INDUS', u'NOX', u'RM', u'AGE', u'DIS', u'TAX',
       u'PTRATIO', u'LSTAT', u'CHAS'],
      dtype='object')

In [29]:
# Predict for one hand-written sample. scikit-learn expects a 2-D input of
# shape (n_samples, n_features), so the single sample is wrapped in an outer
# list; passing a flat 1-D list is deprecated and rejected by later releases.
# Values follow the training column order:
# CRIM, ZN, INDUS, NOX, RM, AGE, DIS, TAX, PTRATIO, LSTAT, CHAS.
# NOTE(review): several values lie far outside the ranges visible in the data
# preview (e.g. TAX=8 vs. 222-311), which presumably explains the negative
# prediction -- confirm this sample is only a smoke test.
llr.predict([[12, 3, 4, 5, 4, 6, 7, 8, 9, 2, 0]])

array([-48.01166848])