In [2]:
import numpy as np
import pandas as pd
import time
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import *
from sklearn.linear_model import SGDRegressor
from sklearn import ensemble
from sklearn import svm
from sklearn import metrics
import os, sys
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston

In [3]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston is reportedly deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the pinned scikit-learn version before re-running.
boston = load_boston()

# Quick sanity check on the regression target: maximum, minimum and mean.
print(boston.target.max(), boston.target.min(), boston.target.mean())

50.0 5.0 22.532806324110677


In [4]:
# Wrap the raw feature matrix in a DataFrame for inspection; columns keep
# their default integer labels (0, 1, 2, ...).
df = pd.DataFrame(data=boston.data)

In [5]:
# (number of rows, number of columns) of the feature table.
df.shape

(506, 13)

In [6]:
# Preview the first five rows of the feature table.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [7]:
# Per-column dtype and non-null counts, to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
0     506 non-null float64
1     506 non-null float64
2     506 non-null float64
3     506 non-null float64
4     506 non-null float64
5     506 non-null float64
6     506 non-null float64
7     506 non-null float64
8     506 non-null float64
9     506 non-null float64
10    506 non-null float64
11    506 non-null float64
12    506 non-null float64
dtypes: float64(13)
memory usage: 51.5 KB


In [8]:
# Names of the features; used later to label the feature importances.
print(boston.feature_names)

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [2]:
# Hold out 25% of the samples for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, test_size=0.25, random_state=33
)

# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits.
scalerX = StandardScaler().fit(X_train)
X_train = scalerX.transform(X_train)
X_test = scalerX.transform(X_test)

# Standardise the target the same way. The 1-D target vectors are reshaped to
# a single column before being handed to the scaler.
scalery = StandardScaler().fit(y_train.reshape(-1, 1))
y_train = scalery.transform(y_train.reshape(-1, 1))
y_test = scalery.transform(y_test.reshape(-1, 1))

# Range and mean of the scaled training features and targets.
print(
    np.max(X_train), np.min(X_train), np.mean(X_train),
    np.max(y_train), np.min(y_train), np.mean(y_train),
)

# Collapse the single-column targets back into flat 1-D arrays.
y_train = y_train.flatten()
y_test = y_test.flatten()

10.2028980046 -4.66702040845 2.47038706385e-15 2.91774920367 -1.93147098641 3.58552238032e-16




In [3]:
def train_and_evaluate(clf, X_train, y_train):
    """Fit ``clf`` on the training data and print how well it fits.

    Two figures are printed: the value of ``clf.score`` on the training data
    itself, and the mean of the scores obtained from a shuffled 5-fold
    cross-validation over the same data.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit`` and ``score``; it is fitted by this call.
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training targets.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    clf.fit(X_train, y_train)

    print ("Coefficient of determination on training set:",clf.score(X_train, y_train))

    # KFold from sklearn.model_selection is configured with the number of
    # folds only; the sample count is taken from the data at split time.
    # The older ``KFold(n_samples, n_folds, ...)`` call form raises TypeError here.
    cv = KFold(n_splits=5, shuffle=True, random_state=33)
    scores = cross_val_score(clf, X_train, y_train, cv=cv)
    print ("Average coefficient of determination using 5-fold crossvalidation:",np.mean(scores))

In [4]:
# Baseline linear model trained by SGD with no regularisation penalty.
clf1 = SGDRegressor(
    loss='squared_loss',
    penalty=None,
    random_state=33,
)
train_and_evaluate(clf1, X_train, y_train)

# Learned weight for each (standardised) feature.
print(clf1.coef_)

Coefficient of determination on training set: 0.740281703689
Average coefficient of determination using 5-fold crossvalidation: 0.713630596255
[-0.07634694  0.06117706 -0.03404977  0.1076101  -0.06620428  0.35855438
 -0.0098127  -0.21344242  0.0921319  -0.03985987 -0.18753121  0.05267773
 -0.37137355]




In [5]:
# Same SGD model, this time with an L2 penalty.
clf2 = SGDRegressor(
    loss='squared_loss',
    penalty='l2',
    random_state=42,
)
train_and_evaluate(clf2, X_train, y_train)

Coefficient of determination on training set: 0.743616743208
Average coefficient of determination using 5-fold crossvalidation: 0.71081206667




In [6]:
# Same SGD model, this time with an L1 penalty.
clf3 = SGDRegressor(
    loss='squared_loss',
    penalty='l1',
    random_state=42,
)
train_and_evaluate(clf3, X_train, y_train)

Coefficient of determination on training set: 0.74358692291
Average coefficient of determination using 5-fold crossvalidation: 0.710763609874




In [7]:
# Support vector regression with a linear kernel.
svr1 = svm.SVR(kernel='linear')
train_and_evaluate(svr1, X_train, y_train)

Coefficient of determination on training set: 0.71886923342
Average coefficient of determination using 5-fold crossvalidation: 0.707838419194


In [8]:
# Support vector regression with a polynomial kernel (other settings left at
# the library defaults).
svr2 = svm.SVR(kernel='poly')
train_and_evaluate(svr2, X_train, y_train)

Coefficient of determination on training set: 0.904109273301
Average coefficient of determination using 5-fold crossvalidation: 0.779288545488


In [9]:
# Support vector regression with an RBF kernel.
svr3 = svm.SVR(kernel='rbf')
train_and_evaluate(svr3, X_train, y_train)

Coefficient of determination on training set: 0.900132065979
Average coefficient of determination using 5-fold crossvalidation: 0.833662221567


In [10]:
# Extra-trees ensemble of 10 estimators; seeded for repeatable results.
et1 = ensemble.ExtraTreesRegressor(
    n_estimators=10,
    random_state=42,
)
train_and_evaluate(et1, X_train, y_train)

Coefficient of determination on training set: 1.0
Average coefficient of determination using 5-fold crossvalidation: 0.861758978344


In [11]:
# Pair each feature's importance with its name and sort ascending by importance.
# The sorted list is stored (rather than the bare zip object) because a zip is
# a one-shot iterator: once sorted() had consumed it, `important` would be
# left empty for any later use.
important = sorted(zip(et1.feature_importances_, boston.feature_names))
print(important)

[(0.0050438532027558842, 'ZN'), (0.015142513715149682, 'B'), (0.017052578400506287, 'AGE'), (0.018941821085751577, 'RAD'), (0.023602561777571307, 'CHAS'), (0.025733049004581798, 'CRIM'), (0.031874162235100457, 'NOX'), (0.034405644939308928, 'INDUS'), (0.039713133345196064, 'DIS'), (0.046618521397262996, 'TAX'), (0.099511801492762245, 'PTRATIO'), (0.28421522796368465, 'LSTAT'), (0.35814513144036819, 'RM')]


In [12]:
def measure_performance(X,y,clf, show_accuracy=True, show_classification_report=True,
                        show_confusion_matrix=True, show_r2_score=False):
    """Predict on ``X`` with ``clf`` and print the requested metrics against ``y``.

    Each ``show_*`` flag switches one report on or off. The reports are
    printed in a fixed order: accuracy, classification report, confusion
    matrix, coefficient of determination.

    Parameters
    ----------
    X : array-like
        Samples passed to ``clf.predict``.
    y : array-like
        Reference targets the predictions are compared with.
    clf : estimator
        Object exposing ``predict``.
    show_accuracy : bool
        Print ``metrics.accuracy_score``.
    show_classification_report : bool
        Print ``metrics.classification_report``.
    show_confusion_matrix : bool
        Print ``metrics.confusion_matrix``.
    show_r2_score : bool
        Print ``metrics.r2_score``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    predictions = clf.predict(X)

    if show_accuracy:
        accuracy = metrics.accuracy_score(y, predictions)
        print("Accuracy:{0:.3f}".format(accuracy), "\n")

    if show_classification_report:
        print("Classification report")
        report = metrics.classification_report(y, predictions)
        print(report, "\n")

    if show_confusion_matrix:
        print("Confusion matrix")
        matrix = metrics.confusion_matrix(y, predictions)
        print(matrix, "\n")

    if show_r2_score:
        r_squared = metrics.r2_score(y, predictions)
        print("Coefficient of determination:{0:.3f}".format(r_squared), "\n")

        
# Score the extra-trees model on the held-out test split. Only R^2 is
# requested; the three classification reports are switched off.
measure_performance(
    X_test,
    y_test,
    et1,
    show_accuracy=False,
    show_classification_report=False,
    show_confusion_matrix=False,
    show_r2_score=True,
)

Coefficient of determination:0.802 

