In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

In [2]:
DATA_DIR = '../MIS 776'
FILE_NAME = 'Housing.csv'
data_path = os.path.join(DATA_DIR, FILE_NAME)
datHousing = pd.read_csv(data_path)

In [3]:
datHousing = pd.concat([datHousing, pd.get_dummies(datHousing['RIVER'], prefix='river', drop_first=True)], axis=1)
datHousing.drop(['RIVER'], inplace=True, axis=1)
datHousing.head()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,RAD,TAX,PRATIO,LSTAT,MEDV,river_Yes
0,3.32105,0.0,19.58,0.871,5.403,100.0,1.3216,5,403,14.7,26.82,13.4,1
1,1.12658,0.0,19.58,0.871,5.012,88.0,1.6102,5,403,14.7,12.12,15.3,1
2,1.41385,0.0,19.58,0.871,6.129,96.0,1.7494,5,403,14.7,15.12,17.0,1
3,3.53501,0.0,19.58,0.871,6.152,82.6,1.7455,5,403,14.7,15.02,15.6,1
4,1.27346,0.0,19.58,0.605,6.25,92.6,1.7984,5,403,14.7,5.5,27.0,1


In [4]:
np.mean(datHousing['river_Yes'])

0.0691699604743083

In [5]:
X = datHousing.drop(['MEDV'], axis=1)
X.head()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,RAD,TAX,PRATIO,LSTAT,river_Yes
0,3.32105,0.0,19.58,0.871,5.403,100.0,1.3216,5,403,14.7,26.82,1
1,1.12658,0.0,19.58,0.871,5.012,88.0,1.6102,5,403,14.7,12.12,1
2,1.41385,0.0,19.58,0.871,6.129,96.0,1.7494,5,403,14.7,15.12,1
3,3.53501,0.0,19.58,0.871,6.152,82.6,1.7455,5,403,14.7,15.02,1
4,1.27346,0.0,19.58,0.605,6.25,92.6,1.7984,5,403,14.7,5.5,1


In [6]:
y=datHousing['MEDV']
y.head()

0    13.4
1    15.3
2    17.0
3    15.6
4    27.0
Name: MEDV, dtype: float64

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

In [8]:
# 1. Import the Estimator class you will use
from sklearn.linear_model import LinearRegression
# 2. Create an instance of the class
ml_reg = LinearRegression()
# 3. Use the fit method of the instance
ml_reg.fit(X_train, y_train)
# 4. Use the predict method to get the predictions
y_pred_ml_reg = ml_reg.predict(X_train)

In [9]:
pd.Series(ml_reg.coef_, index=X.columns).sort_values(ascending=False).round(3)

RM            3.614
river_Yes     3.252
RAD           0.246
ZN            0.048
AGE           0.007
INDUS        -0.009
TAX          -0.011
CRIM         -0.133
LSTAT        -0.566
PRATIO       -0.936
DIS          -1.425
NOX         -15.199
dtype: float64

In [10]:
ml_reg.intercept_.round(3)

39.438

In [11]:
from sklearn.metrics import mean_squared_error, r2_score
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_train, y_pred_ml_reg))

Mean squared error: 21.03


In [12]:
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_train, y_pred_ml_reg))

Variance score: 0.74


In [13]:
datHousingSub = datHousing.drop(['MEDV'], axis=1)

In [14]:
datHousingSub.head()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,RAD,TAX,PRATIO,LSTAT,river_Yes
0,3.32105,0.0,19.58,0.871,5.403,100.0,1.3216,5,403,14.7,26.82,1
1,1.12658,0.0,19.58,0.871,5.012,88.0,1.6102,5,403,14.7,12.12,1
2,1.41385,0.0,19.58,0.871,6.129,96.0,1.7494,5,403,14.7,15.12,1
3,3.53501,0.0,19.58,0.871,6.152,82.6,1.7455,5,403,14.7,15.02,1
4,1.27346,0.0,19.58,0.605,6.25,92.6,1.7984,5,403,14.7,5.5,1


In [15]:
# Using scikit-learn to perform K-Means clustering
from sklearn.cluster import KMeans
    
# Specify the number of clusters (2) and fit the data datHousingSub
kmeans = KMeans(n_clusters=2, random_state=0).fit(datHousingSub)

In [16]:
centroids = (kmeans.cluster_centers_)
print(centroids)

[[3.88774444e-01 1.55826558e+01 8.42089431e+00 5.11847425e-01
  6.38800542e+00 6.06322493e+01 4.44127154e+00 4.45528455e+00
  3.11926829e+02 1.78092141e+01 1.04174526e+01 7.31707317e-02]
 [1.22991617e+01 3.01980663e-14 1.84518248e+01 6.70102190e-01
  6.00621168e+00 8.99678832e+01 2.05447007e+00 2.32700730e+01
  6.67642336e+02 2.01963504e+01 1.86745255e+01 5.83941606e-02]]


In [17]:
labels = (kmeans.labels_)
print(labels)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 

In [18]:
# Calculate silhouette_score
from sklearn.metrics import silhouette_score

print(silhouette_score(datHousingSub, kmeans.labels_))

0.7717962604192585


In [19]:
# Specify the number of clusters (3) and fit the data datHousingSub
kmeans = KMeans(n_clusters=3, random_state=0).fit(datHousingSub)

In [20]:
centroids = (kmeans.cluster_centers_)
print(centroids)

[[2.44205703e-01 1.73764259e+01 6.70262357e+00 4.84713688e-01
  6.47416350e+00 5.61661597e+01 4.83579772e+00 4.32699620e+00
  2.75212928e+02 1.78733840e+01 9.55292776e+00 7.60456274e-02]
 [1.22991617e+01 3.01980663e-14 1.84518248e+01 6.70102190e-01
  6.00621168e+00 8.99678832e+01 2.05447007e+00 2.32700730e+01
  6.67642336e+02 2.01963504e+01 1.86745255e+01 5.83941606e-02]
 [7.47468585e-01 1.11320755e+01 1.26841509e+01 5.79169811e-01
  6.17423585e+00 7.17132075e+01 3.46240000e+00 4.77358491e+00
  4.03018868e+02 1.76500000e+01 1.25624528e+01 6.60377358e-02]]


In [21]:
labels = (kmeans.labels_)
print(labels)

[2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 0 0 0 0 2 2
 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 0
 0 0 0 0 0 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 2 2 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 0 0 0 2 2 0 0 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 

In [22]:
# Calculate silhouette_score
from sklearn.metrics import silhouette_score

print(silhouette_score(datHousingSub, kmeans.labels_))

0.6217529916045139


In [23]:
# Specify the number of clusters (2) and fit the data datHousingSub
kmeans = KMeans(n_clusters=2, random_state=0).fit(datHousingSub)

In [24]:
centroids = (kmeans.cluster_centers_)
print(centroids)

[[3.88774444e-01 1.55826558e+01 8.42089431e+00 5.11847425e-01
  6.38800542e+00 6.06322493e+01 4.44127154e+00 4.45528455e+00
  3.11926829e+02 1.78092141e+01 1.04174526e+01 7.31707317e-02]
 [1.22991617e+01 3.01980663e-14 1.84518248e+01 6.70102190e-01
  6.00621168e+00 8.99678832e+01 2.05447007e+00 2.32700730e+01
  6.67642336e+02 2.01963504e+01 1.86745255e+01 5.83941606e-02]]


In [25]:
labels = (kmeans.labels_)
df_labels = pd.DataFrame(labels)
df_labels.rename(columns={0:'Cluster'}, inplace=True)
df_labels.head()

Unnamed: 0,Cluster
0,0
1,0
2,0
3,0
4,0


In [26]:
datHousing_Clust = pd.concat([datHousing, df_labels], axis=1)

In [27]:
datHousing_Clust.head()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,RAD,TAX,PRATIO,LSTAT,MEDV,river_Yes,Cluster
0,3.32105,0.0,19.58,0.871,5.403,100.0,1.3216,5,403,14.7,26.82,13.4,1,0
1,1.12658,0.0,19.58,0.871,5.012,88.0,1.6102,5,403,14.7,12.12,15.3,1,0
2,1.41385,0.0,19.58,0.871,6.129,96.0,1.7494,5,403,14.7,15.12,17.0,1,0
3,3.53501,0.0,19.58,0.871,6.152,82.6,1.7455,5,403,14.7,15.02,15.6,1,0
4,1.27346,0.0,19.58,0.605,6.25,92.6,1.7984,5,403,14.7,5.5,27.0,1,0


In [28]:
datHousingC1 = datHousing_Clust.loc[datHousing_Clust['Cluster']==0]
datHousingC2 = datHousing_Clust.loc[datHousing_Clust['Cluster']==1]

In [29]:
datHousingC1.head()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,RAD,TAX,PRATIO,LSTAT,MEDV,river_Yes,Cluster
0,3.32105,0.0,19.58,0.871,5.403,100.0,1.3216,5,403,14.7,26.82,13.4,1,0
1,1.12658,0.0,19.58,0.871,5.012,88.0,1.6102,5,403,14.7,12.12,15.3,1,0
2,1.41385,0.0,19.58,0.871,6.129,96.0,1.7494,5,403,14.7,15.12,17.0,1,0
3,3.53501,0.0,19.58,0.871,6.152,82.6,1.7455,5,403,14.7,15.02,15.6,1,0
4,1.27346,0.0,19.58,0.605,6.25,92.6,1.7984,5,403,14.7,5.5,27.0,1,0


In [30]:
datHousingC2.head()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,RAD,TAX,PRATIO,LSTAT,MEDV,river_Yes,Cluster
27,8.98296,0.0,18.1,0.77,6.212,97.4,2.1222,24,666,20.2,17.6,17.8,1,1
28,3.8497,0.0,18.1,0.77,6.395,91.0,2.5052,24,666,20.2,13.27,21.7,1,1
29,5.20177,0.0,18.1,0.77,6.127,83.4,2.7227,24,666,20.2,11.48,22.7,1,1
30,4.22239,0.0,18.1,0.77,5.803,89.0,1.9047,24,666,20.2,14.64,16.8,1,1
31,3.47428,0.0,18.1,0.718,8.78,82.9,1.9047,24,666,20.2,5.29,21.9,1,1


In [31]:
X = datHousingC1.drop(['MEDV'], axis=1)
X.head()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,RAD,TAX,PRATIO,LSTAT,river_Yes,Cluster
0,3.32105,0.0,19.58,0.871,5.403,100.0,1.3216,5,403,14.7,26.82,1,0
1,1.12658,0.0,19.58,0.871,5.012,88.0,1.6102,5,403,14.7,12.12,1,0
2,1.41385,0.0,19.58,0.871,6.129,96.0,1.7494,5,403,14.7,15.12,1,0
3,3.53501,0.0,19.58,0.871,6.152,82.6,1.7455,5,403,14.7,15.02,1,0
4,1.27346,0.0,19.58,0.605,6.25,92.6,1.7984,5,403,14.7,5.5,1,0


In [32]:
y=datHousingC1['MEDV']
y.head()

0    13.4
1    15.3
2    17.0
3    15.6
4    27.0
Name: MEDV, dtype: float64

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

In [34]:
# 1. Import the Estimator class you will use
from sklearn.linear_model import LinearRegression
# 2. Create an instance of the class
ml_reg = LinearRegression()
# 3. Use the fit method of the instance
ml_reg.fit(X_train, y_train)
# 4. Use the predict method to get the predictions
y_pred_ml_reg = ml_reg.predict(X_train)

In [35]:
pd.Series(ml_reg.coef_, index=X.columns).sort_values(ascending=False).round(3)

RM           9.168
river_Yes    1.045
CRIM         1.033
RAD          0.166
ZN           0.023
Cluster      0.000
INDUS       -0.001
TAX         -0.015
AGE         -0.045
LSTAT       -0.084
PRATIO      -0.530
DIS         -0.836
NOX         -6.519
dtype: float64

In [36]:
ml_reg.intercept_.round(3)

-10.54

In [37]:
from sklearn.metrics import mean_squared_error, r2_score
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_train, y_pred_ml_reg))

Mean squared error: 9.95


In [38]:
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_train, y_pred_ml_reg))

Variance score: 0.86


In [39]:
X = datHousingC2.drop(['MEDV'], axis=1)
X.head()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,RAD,TAX,PRATIO,LSTAT,river_Yes,Cluster
27,8.98296,0.0,18.1,0.77,6.212,97.4,2.1222,24,666,20.2,17.6,1,1
28,3.8497,0.0,18.1,0.77,6.395,91.0,2.5052,24,666,20.2,13.27,1,1
29,5.20177,0.0,18.1,0.77,6.127,83.4,2.7227,24,666,20.2,11.48,1,1
30,4.22239,0.0,18.1,0.77,5.803,89.0,1.9047,24,666,20.2,14.64,1,1
31,3.47428,0.0,18.1,0.718,8.78,82.9,1.9047,24,666,20.2,5.29,1,1


In [40]:
y=datHousingC2['MEDV']
y.head()

27    17.8
28    21.7
29    22.7
30    16.8
31    21.9
Name: MEDV, dtype: float64

In [41]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

In [42]:
# 1. Import the Estimator class you will use
from sklearn.linear_model import LinearRegression
# 2. Create an instance of the class
ml_reg = LinearRegression()
# 3. Use the fit method of the instance
ml_reg.fit(X_train, y_train)
# 4. Use the predict method to get the predictions
y_pred_ml_reg = ml_reg.predict(X_train)

In [43]:
pd.Series(ml_reg.coef_, index=X.columns).sort_values(ascending=False).round(3)

RAD          1.073362e+11
TAX          7.370158e+10
river_Yes    1.032100e+01
AGE          1.400000e-02
Cluster      0.000000e+00
CRIM        -1.380000e-01
LSTAT       -8.440000e-01
RM          -9.750000e-01
DIS         -3.321000e+00
NOX         -3.621100e+01
ZN          -7.619852e+07
PRATIO      -6.187353e+09
INDUS       -1.214176e+11
dtype: float64

In [44]:
ml_reg.intercept_.round(3)

-49338680212718.555

In [45]:
from sklearn.metrics import mean_squared_error, r2_score
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_train, y_pred_ml_reg))

Mean squared error: 20.13


In [46]:
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_train, y_pred_ml_reg))

Variance score: 0.73
