In [1]:
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import pandas as pd
import numpy as np
import patsy
from statsmodels.graphics.correlation import plot_corr
from sklearn.model_selection import train_test_split
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and 3.8 dropped the old 'seaborn' alias, so use whichever name this
# installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Forward slashes resolve on Windows, macOS and Linux alike. The previous
# backslash-separated literal relied on invalid string escapes (backslash-D,
# backslash-B) and could only be found on Windows.
rawBostonData = pd.read_csv('./Dataset/Boston.csv')
rawBostonData.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2


In [2]:
# Discard rows containing missing values, then exact duplicate rows.
# The result is stored back under `rawBostonData` because the rename cell
# below reads that name.
rawBostonData = rawBostonData.dropna().drop_duplicates()

# Show the column labels exactly as read from the file (including any
# stray whitespace) so the rename keys can be matched against them.
rawBostonData.columns.tolist()

['CRIM',
 ' ZN ',
 'INDUS ',
 'CHAS',
 'NOX',
 'RM',
 'AGE',
 'DIS',
 'RAD',
 'TAX',
 'PTRATIO',
 'LSTAT',
 'MEDV']

In [3]:
# Map the terse source headers to descriptive labels. Keys must match the
# file's headers character-for-character, which is why ' ZN ' carries its
# surrounding spaces. Headers that are not listed here (e.g. 'INDUS ') keep
# their original name.
# NOTE(review): in the standard Boston housing data dictionary ZN is the
# share of residential land zoned for large lots, while INDUS is the share
# of non-retail business acres. The 'non-retailLandProptn' label therefore
# looks like it belongs to 'INDUS ' rather than ' ZN ' -- confirm before
# interpreting this column.
bostonColumnLabels = {
    'CRIM':    'crimeRatePerCapita',
    ' ZN ':    'non-retailLandProptn',
    'CHAS':    'riverDummy',
    'NOX':     'nitrixOxide_pp10m',
    'RM':      'AvgNo.RoomsPerDwelling',
    'AGE':     'ProptnOwnerOccupied',
    'DIS':     'weightedDist',
    'RAD':     'radialHighwaysAccess',
    'TAX':     'propTaxRate_per10K',
    'PTRATIO': 'pupilTeacherRatio',
    'LSTAT':   'pctLowerStatus',
    'MEDV':    'medianValue_Ks',
}

# rename() returns a new frame; rawBostonData itself is left untouched.
renamedBostonData = rawBostonData.rename(columns=bostonColumnLabels)

renamedBostonData.head()

Unnamed: 0,crimeRatePerCapita,non-retailLandProptn,INDUS,riverDummy,nitrixOxide_pp10m,AvgNo.RoomsPerDwelling,ProptnOwnerOccupied,weightedDist,radialHighwaysAccess,propTaxRate_per10K,pupilTeacherRatio,pctLowerStatus,medianValue_Ks
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2


In [4]:
# Print a structural summary of the renamed frame: index range, per-column
# non-null counts and dtypes, and memory usage.
renamedBostonData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   crimeRatePerCapita      506 non-null    float64
 1   non-retailLandProptn    506 non-null    float64
 2   INDUS                   506 non-null    float64
 3   riverDummy              506 non-null    int64  
 4   nitrixOxide_pp10m       506 non-null    float64
 5   AvgNo.RoomsPerDwelling  506 non-null    float64
 6   ProptnOwnerOccupied     506 non-null    float64
 7   weightedDist            506 non-null    float64
 8   radialHighwaysAccess    506 non-null    int64  
 9   propTaxRate_per10K      506 non-null    int64  
 10  pupilTeacherRatio       506 non-null    float64
 11  pctLowerStatus          506 non-null    float64
 12  medianValue_Ks          506 non-null    float64
dtypes: float64(10), int64(3)
memory usage: 75.3 KB


In [5]:
# Summary statistics for the numeric columns, transposed so that each
# variable is a row and each statistic (count, mean, std, ...) a column.
renamedBostonData.describe(include=[np.number]).transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
crimeRatePerCapita,506.0,3.613524,8.601545,0.00632,0.082045,0.25651,3.677082,88.9762
non-retailLandProptn,506.0,11.363636,23.322453,0.0,0.0,0.0,12.5,100.0
INDUS,506.0,11.136779,6.860353,0.46,5.19,9.69,18.1,27.74
riverDummy,506.0,0.06917,0.253994,0.0,0.0,0.0,0.0,1.0
nitrixOxide_pp10m,506.0,0.554695,0.115878,0.385,0.449,0.538,0.624,0.871
AvgNo.RoomsPerDwelling,506.0,6.284634,0.702617,3.561,5.8855,6.2085,6.6235,8.78
ProptnOwnerOccupied,506.0,68.574901,28.148861,2.9,45.025,77.5,94.075,100.0
weightedDist,506.0,3.795043,2.10571,1.1296,2.100175,3.20745,5.188425,12.1265
radialHighwaysAccess,506.0,9.549407,8.707259,1.0,4.0,5.0,24.0,24.0
propTaxRate_per10K,506.0,408.237154,168.537116,187.0,279.0,330.0,666.0,711.0
