In [1]:
import pandas as pd
import requests
import matplotlib.pyplot as plt
import numpy as np

In [63]:
import os

# **Data Gathering**

In [3]:
# Gathering the Boston data from the Stat CMU website
response= requests.get('http://lib.stat.cmu.edu/datasets/boston')
response

<Response [200]>

In [89]:
# Storing the data in the local harddrive
my_folder = r'C:\Users\srini\OneDrive\Documents\Udacity'
file_name = 'boston_dataset.txt'

with open(os.path.join(my_folder,file_name), 'wb' ) as file:
    file.write(response.content)

In [94]:
# Estimating the number of rows in the dataset which will be useful for inputting as a dataframe
my_text= str(response.content)
my_text.count('\\n')

1034

**The columns in the dataset** <br>
CRIM     per capita crime rate by town <br> 
ZN       proportion of residential land zoned for lots over 25,000 sq.ft. <br> 
INDUS    proportion of non-retail business acres per town <br> 
CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) <br> 
NOX      nitric oxides concentration (parts per 10 million) <br> 
RM       average number of rooms per dwelling<br> 
AGE      proportion of owner-occupied units built prior to 1940<br> 
DIS      weighted distances to five Boston employment centres<br> 
RAD      index of accessibility to radial highways<br> 
TAX      full-value property-tax rate per 10000 <br> 
PTRATIO  pupil-teacher ratio by town <br> 
B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town <br> 
LSTAT    percent lower status of the population <br> 
MEDV     Median value of owner-occupied homes in 1000's<br> 

In [46]:
# Setting up the column name
my_col_name = ['crime', 'zone', 'industrial', 'chas_river', 'nox', 'avg_room', 'age_house', 'distance', 'radial', 'tax',\
               'pupil_teacher', 'blacks', 'lstat', 'value_home' ]

In [102]:
boston1= pd.read_csv(os.path.join(my_folder,file_name), sep= '\t')
boston1.head(15)
# The first 20 lines are the introductory note in the dataset and the the description for the columns

boston1.head(25).tail(5)
# It seems that there are 11 columns in 1 row and the next 3 columns in the subsequent row.

Unnamed: 0,"The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic"
20,396.90 4.98 24.00
21,0.02731 0.00 7.070 0 0.4690 6.4210 78...
22,396.90 9.14 21.60
23,0.02729 0.00 7.070 0 0.4690 7.1850 61...
24,392.83 4.03 34.70


In [98]:
# Reading the locally stored file. Since the data has 11 columns in 1 row and the next 3 columns in the subsequent row,
# we need to extract data as 2 seperate dataframes and merge them later

boston1= pd.read_csv(os.path.join(my_folder,file_name),delim_whitespace= True, index_col= False, \
            skiprows= list(np.arange(21))+list(np.arange(21,1031,2)),\
                     engine= 'python',header= None, names = my_col_name[:-3])

boston2= pd.read_csv(os.path.join(my_folder,file_name),delim_whitespace= True, index_col= False, \
            skiprows= list(np.arange(22))+list(np.arange(22,1031,2)),\
                     engine= 'python',header= None, names = my_col_name[-3:])

print(boston1.head(2))
print(boston2.head(2))

     crime  zone  industrial  chas_river    nox  avg_room  age_house  \
0  0.00632  18.0        2.31         0.0  0.538     6.575       65.2   
1  0.02731   0.0        7.07         0.0  0.469     6.421       78.9   

   distance  radial    tax  pupil_teacher  
0    4.0900     1.0  296.0           15.3  
1    4.9671     2.0  242.0           17.8  
   blacks  lstat  value_home
0   396.9   4.98        24.0
1   396.9   9.14        21.6


In [62]:
# Joining both the datasets together to form a tidy dataframe
boston = boston1.join(boston2, )
boston.head()

Unnamed: 0,crime,zone,industrial,chas_river,nox,avg_room,age_house,distance,radial,tax,pupil_teacher,blacks,lstat,value_home
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


# **Data Wrangling**

In [103]:
boston.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 508 entries, 0 to 507
Data columns (total 14 columns):
crime            508 non-null float64
zone             508 non-null float64
industrial       508 non-null float64
chas_river       506 non-null float64
nox              506 non-null float64
avg_room         506 non-null float64
age_house        506 non-null float64
distance         506 non-null float64
radial           506 non-null float64
tax              506 non-null float64
pupil_teacher    506 non-null float64
blacks           507 non-null float64
lstat            507 non-null float64
value_home       507 non-null float64
dtypes: float64(14)
memory usage: 55.7 KB


In [104]:
boston.describe()

Unnamed: 0,crime,zone,industrial,chas_river,nox,avg_room,age_house,distance,radial,tax,pupil_teacher,blacks,lstat,value_home
count,508.0,508.0,508.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,507.0,507.0,507.0
mean,5.155104,11.347165,11.159665,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,355.970626,12.628107,22.511893
std,26.002682,23.277925,6.86383,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,92.569615,7.156099,9.20007
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.04741,0.0,5.0
25%,0.082155,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.27,6.925,16.9
50%,0.26042,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.43,11.34,21.2
75%,3.69407,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.22,16.95,25.0
max,396.9,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


 **Missing data**
There are just 1 or 2 missing values in the dataset for each column. We can ignore these values as such

In [107]:
# Changing the River data type to categorical values
boston['chas_river']= boston['chas_river'].astype('category')