In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
import warnings #to remove warning from the notebook
warnings.filterwarnings(action='ignore')

In [None]:
#loading dataset
# Column names are supplied explicitly through `names=`.
name= ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# sep=r'\s+' is the documented equivalent of delim_whitespace=True, which is
# deprecated as of pandas 2.2.
df = pd.read_csv('../input/boston-house-prices/housing.csv', sep=r'\s+', names=name)
df.head()

# Review Boston House prices dataset

In [None]:
nRow, nCol = df.shape
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
df.info()

- There are no missing values present in this dataset
- All the columns have numerical values, hence we don't have to encode categorical values in order to perform Linear Regression

In [None]:
df.isnull().sum()

- There are no null values 

In [None]:
df.corr()

Observations:
- INDUS, RM, TAX, PTRATIO and LSTAT show fairly good correlation with MEDV

In [None]:
# Annotated heatmap of the pairwise correlations, rounded to two decimals.
corr_rounded = df.corr().round(2)
plt.figure(figsize=(12,9))
sns.heatmap(
    data=corr_rounded,
    annot=True,
    linewidths=0.2,
    square=True,
)
plt.show()

Observations:
- NOX shows good correlation with INDUS and AGE
- INDUS shows good correlation with LSTAT and DIS
- DIS shows strong correlation with INDUS, RM and AGE

Hence Multicollinearity exists in this dataset

# Choose 2 features to predict the target
Here we choose RM and LSTAT as the 2 features

In [None]:
df1 = df[['RM','LSTAT','MEDV']]
df1.head()

In [None]:
sns.pairplot(data=df1)

Observations:
- RM is normally distributed, as its histogram is a bell-shaped curve, but there are very few outliers towards both ends
- LSTAT shows a positively (right) skewed graph, with its long tail towards high values
- MEDV has a normally distributed graph with outliers present between the range 40-50
- Positive linear correlation is present between RM and MEDV. There are a few outliers present near 50
- RM and LSTAT, and MEDV and LSTAT, have a negative linear relationship between them, along with the presence of a few outliers

In [None]:
#description about this data
df1.describe().round(2)

Observations:
- We can see count of entries for each variable is
same i.e. 506.
- Maximum value in MEDV and LSTAT are much higher than 75% of data points

For RM & MEDV:
- The difference between the min and 50% quartile, and between 50% and max value, is almost equal
- The mean and 50% value is approximately same
- Hence RM and MEDV have Normal Distribution for their graphs

For LSTAT:
- The difference between the min and 50% quartile, and between 50% and max value is unequal
- There is a significant difference between the mean and 50% quartile value
- Hence LSTAT does not have normal distribution

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for Dependent variable MEDV
# Data vectors are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally, so positional x/y either fails (scatterplot) or is
# reinterpreted as wide-form `data` (boxplot).
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.MEDV)
plt.title('Box Plot of MEDV')

plt.subplot(1,3,2)
# NOTE(review): distplot is deprecated since seaborn 0.11; consider
# sns.histplot(..., kde=True) once the installed seaborn version is confirmed.
sns.distplot(a=df1.MEDV)
plt.title('Distribution Plot of MEDV')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.MEDV, y=df1.MEDV)
plt.title('Scatter Plot of MEDV vs MEDV')
plt.show()

- MEDV is normally distributed
- It contains some extreme values which could be potential outliers, especially near 50

Hence we have to clean this data by removing outliers


## REMOVING OUTLIERS

In [None]:
# Keep every row whose MEDV differs from 50.
medv_not_capped = df1['MEDV'] != 50
df2 = df1[medv_not_capped]
df2

In [None]:
print(f'The maximum values of the dataset are:\n{df2.max()}')
print(f'Shape of dataset before removing Outliers: {df1.shape}')
print(f'Shape of dataset after removing Outliers: {df2.shape}')

- Now the maximum value of MEDV column is 48.80
- Hence we have deleted 16 (506-490) rows from our dataset having MEDV value as 50

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for Independent variable RM
# Data vectors are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.RM)
plt.title('Box Plot of RM')

plt.subplot(1,3,2)
# NOTE(review): distplot is deprecated since seaborn 0.11; consider
# sns.histplot(..., kde=True) once the installed seaborn version is confirmed.
sns.distplot(a=df1.RM)
plt.title('Distribution Plot of RM')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.RM, y=df1.MEDV)
plt.title('Scatter Plot of RM vs MEDV')
plt.show()

Observations:
- Graph of RM is normally distributed
- There are some outliers present lower and higher end of RM values in the dataset
- Scatter plot of RM vs MEDV show good Positive Linear Relationship.

In [None]:
# Build the mask from df2 itself: df2 has already lost rows, so a mask taken
# from df1 has a different index and pandas must reindex it (with a warning).
temp_df = df2[df2['RM']>7.7]
temp_df.shape

In [None]:
# Mask comes from df2 so its index matches the frame being filtered.
temp_df1 = df2[df2['RM']<4.7]
temp_df1.shape

In [None]:
# Drop rows with RM above 7.7; the mask is built from df2 so its index
# matches the frame being filtered (no implicit reindexing).
df2 = df2[~(df2['RM']>7.7)]
df2

In [None]:
# Drop rows with RM below 4.7; the mask is built from df2 so its index
# matches the frame being filtered (no implicit reindexing).
df2 = df2[~(df2['RM']<4.7)]
df2

In [None]:
print(f'The maximum values of the dataset are:\n{df2.max()}')
print(f'Shape of dataset before removing Outliers: {df1.shape}')
print(f'Shape of dataset after removing Outliers: {df2.shape}')

- Now the maximum value of RM column is 7.69
- Hence we have deleted 36 (506-470) rows in total from our dataset, including those having RM value > 7.7 or < 4.7

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for Independent variable LSTAT
# Data vectors are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.LSTAT)
plt.title('Box Plot of LSTAT')

plt.subplot(1,3,2)
# NOTE(review): distplot is deprecated since seaborn 0.11; consider
# sns.histplot(..., kde=True) once the installed seaborn version is confirmed.
sns.distplot(a=df1.LSTAT)
plt.title('Distribution Plot of LSTAT')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.LSTAT, y=df1.MEDV)
plt.title('Scatter Plot of LSTAT vs MEDV')
plt.show()

In [None]:
# Mask comes from df2 so its index matches the frame being filtered.
temp_df1 = df2[df2['LSTAT']>31]
temp_df1.shape

In [None]:
# Drop rows with LSTAT above 31; the mask is built from df2 so its index
# matches the frame being filtered (no implicit reindexing).
df2 = df2[~(df2['LSTAT']>31)]
df2

In [None]:
print(f'The maximum values of the dataset are:\n{df2.max()}')
print(f'Shape of dataset before removing Outliers: {df1.shape}')
print(f'Shape of dataset after removing Outliers: {df2.shape}')

- Now the maximum value of LSTAT column is 30.81
- Hence we have deleted 40 (506-466) rows in total from our dataset, including those having LSTAT value > 31

## SPLITTING THE DATASET

In [None]:
# Separate predictors from the target.
# The first two columns are the features; the last column is the target.
# Selecting the target with a list keeps y as a 2-D column array.
feature_frame = df2.iloc[:, :2]
target_frame = df2.iloc[:, [-1]]
x = feature_frame.to_numpy()
y = target_frame.to_numpy()

In [None]:
print("Shape of Independent variable, x :",x.shape)
print("Shape of Dependent variable, y :",y.shape)

### FEATURE SCALING using sklearn

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the standardiser on the full feature matrix x.
# NOTE(review): the scaler is fitted before the train/test split performed
# later in the notebook, so test rows contribute to the scaling statistics.
# Confirm this is acceptable, or fit on the training rows only.
scaler = StandardScaler()
scaler.fit(x)

In [None]:
x

In [None]:
x_scaled = scaler.transform(x)
x_scaled

In [None]:
# Prepend a column of ones (bias term) to the scaled features.
m, n = x_scaled.shape
ones_column = np.ones((m, 1))
x_scaled = np.concatenate((ones_column, x_scaled), axis=1)
x_scaled

- Since we need to add a variable for Bias, we add a new column of 1's in X as the first column.


In [None]:
x=x_scaled

### TEST & TRAIN DATASET (80-20)
> We split the data into Training Set (80% of total data) and Test Set (20% of total data)







In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
print(
    f"Shape of x_train = {x_train.shape}",
    f"Shape of x_test = {x_test.shape}",
    f"Shape of y_train = {y_train.shape}",
    f"Shape of y_test = {y_test.shape}",
    sep="\n",
)

In [None]:
from sklearn import datasets, linear_model #Import datasets and linear_model from Sklearn
from sklearn.metrics import mean_squared_error, r2_score #Import metrics to evaluate the model

In [None]:
# Fit a linear regression on the training rows. fit() returns the estimator,
# so construction and fitting are chained; the bare name displays it.
lin_reg_mod = linear_model.LinearRegression().fit(x_train, y_train)
lin_reg_mod

In [None]:
# Score the model on the held-out test rows.
pred = lin_reg_mod.predict(x_test)
mse = mean_squared_error(y_test, pred)
test_set_rmse_11 = np.sqrt(mse)  # root of the mean squared error
test_set_r2_12 = r2_score(y_test, pred)
print("RMSE value:",test_set_rmse_11)
print("R^2 value: ",test_set_r2_12)

- We know, the higher the R-squared value, the more accurately the regression equation models your data
- Also, RMSE measures how accurately the model predicts the response, hence it's an important criterion for fit if the main purpose of the model is prediction.
- This is a good regression model as the R square score is near 1.0, and the RMSE error is not very large.

### Scatter plot of predicted vs actual test house prices along with the Regression Line

In [None]:
# Predicted vs. actual prices on the test split. The grey diagonal is the
# y = x reference (a perfect prediction lies on it), not a fitted line.
plt.scatter(y_test,pred)
plt.plot([0,55], [0,55], ls="-", c=".3")
plt.xlabel('Actual MEDV')
plt.ylabel('Predicted MEDV')
plt.title('Predicted vs Actual MEDV (test set)')
plt.show()

- From the graph, we infer that there's positive linear relationship with the regression fit line
- Looking at how the regression line fits in with the scatter plot, some of the actual value points are above the line, and some are below
- But overall, this model fits the data well, as there is fairly small difference between majority of the datapoints and the best fit line

### TEST & TRAIN DATASET (60-40)
> We split the data into Training Set (60% of total data) and Test Set (40% of total data)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=42
)
print(
    f"Shape of x_train = {x_train.shape}",
    f"Shape of x_test = {x_test.shape}",
    f"Shape of y_train = {y_train.shape}",
    f"Shape of y_test = {y_test.shape}",
    sep="\n",
)

In [None]:
from sklearn import datasets, linear_model #Import datasets and linear_model from Sklearn
from sklearn.metrics import mean_squared_error, r2_score #Import metrics to evaluate the model

In [None]:
# Fit a linear regression on the training rows. fit() returns the estimator,
# so construction and fitting are chained; the bare name displays it.
lin_reg_mod = linear_model.LinearRegression().fit(x_train, y_train)
lin_reg_mod

In [None]:
# Score the model on the held-out test rows.
pred = lin_reg_mod.predict(x_test)
mse = mean_squared_error(y_test, pred)
test_set_rmse_13 = np.sqrt(mse)  # root of the mean squared error
test_set_r2_14 = r2_score(y_test, pred)
print("RMSE value:",test_set_rmse_13)
print("R^2 value: ",test_set_r2_14)

- This is a fairly good regression model as the R square score is near 1.0, and the RMSE error is not very large.

**Scatter plot of predicted vs actual test house prices along with the Regression Line**

In [None]:
# Predicted vs. actual prices on the test split. The grey diagonal is the
# y = x reference (a perfect prediction lies on it), not a fitted line.
plt.scatter(y_test,pred)
plt.plot([0,55], [0,55], ls="-", c=".3")
plt.xlabel('Actual MEDV')
plt.ylabel('Predicted MEDV')
plt.title('Predicted vs Actual MEDV (test set)')
plt.show()

- From the graph, we infer that there's positive linear relationship with the regression fit line
- Looking at how the regression line fits in with the scatter plot, some of the actual value points are above the line, and some are below
- This model does not really fit the data well, as there are differences between many of the datapoints and the best fit line

# Choose 5 sets of features to predict

```
- 1st set : RM and TAX
- 2nd set : LSTAT and TAX
- 3rd set : RM and PTRATIO
- 4th set : LSTAT, TAX and PTRATIO
- 5th set : RM, LSTAT and PTRATIO
```



## 1st set : RM and TAX

In [None]:
df1 = df[['RM','TAX','MEDV']]
df1.head()

In [None]:
sns.pairplot(data=df1)

Observations:
- RM is normally distributed as it's histogram is a bell shaped curve, but there are very few outliers towards both the ends
- TAX does not show normal distribution
- MEDV has a normally distributed graph with outliers present between the range 40-50
- Positive Linear correlation is present between RM and MEDV. There are a few outliers present near 50
- There is no relation between MEDV and TAX

In [None]:
df1.describe().round(2)

Observations:
- We can see count of entries for each variable is
same i.e. 506.
- Maximum value in MEDV is much higher than 75% of data points

For RM & MEDV:
- The difference between the min and 50% quartile, and between 50% and max value, is almost equal
- The mean and 50% value is approximately same
- Hence RM and MEDV have Normal Distribution for their graphs

For TAX:
- The difference between the min and 50% quartile, and between 50% and max value is unequal
- There is a significant difference between the mean and 50% quartile value
- Hence TAX does not have normal distribution

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for Dependent variable MEDV
# Data vectors are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.MEDV)
plt.title('Box Plot of MEDV')

plt.subplot(1,3,2)
# NOTE(review): distplot is deprecated since seaborn 0.11; consider
# sns.histplot(..., kde=True) once the installed seaborn version is confirmed.
sns.distplot(a=df1.MEDV)
plt.title('Distribution Plot of MEDV')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.MEDV, y=df1.MEDV)
plt.title('Scatter Plot of MEDV vs MEDV')
plt.show()

- MEDV is normally distributed
- It contains some extreme values which could be potential outliers, especially near 50

Hence we have to clean this data by removing outliers

In [None]:
# Keep every row whose MEDV differs from 50.
medv_not_capped = df1['MEDV'] != 50
df2 = df1[medv_not_capped]
df2

In [None]:
print(f'The maximum values of the dataset are:\n{df2.max()}')
print(f'Shape of dataset before removing Outliers: {df1.shape}')
print(f'Shape of dataset after removing Outliers: {df2.shape}')

- Now the maximum value of MEDV column is 48.80
- Hence we have deleted 16 (506-490) rows from our dataset having MEDV value as 50

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for Independent variable RM
# Data vectors are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.RM)
plt.title('Box Plot of RM')

plt.subplot(1,3,2)
# NOTE(review): distplot is deprecated since seaborn 0.11; consider
# sns.histplot(..., kde=True) once the installed seaborn version is confirmed.
sns.distplot(a=df1.RM)
plt.title('Distribution Plot of RM')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.RM, y=df1.MEDV)
plt.title('Scatter Plot of RM vs MEDV')
plt.show()

Observations:
- Graph of RM is normally distributed
- There are some outliers present lower and higher end of RM values in the dataset
- Scatter plot of RM vs MEDV show fairly good Positive Linear Relationship.

In [None]:
# Drop rows with RM above 7.7; the mask is built from df2 so its index
# matches the frame being filtered (no implicit reindexing).
df2 = df2[~(df2['RM']>7.7)]

In [None]:
# Drop rows with RM below 4.7; the mask is built from df2 so its index
# matches the frame being filtered (no implicit reindexing).
df2 = df2[~(df2['RM']<4.7)]

In [None]:
print(f'The maximum values of the dataset are:\n{df2.max()}')
print(f'Shape of dataset before removing Outliers: {df1.shape}')
print(f'Shape of dataset after removing Outliers: {df2.shape}')

- Now the maximum value of RM column is 7.69
- Hence we have deleted 36 (506-470) rows in total from our dataset, including those having RM value > 7.7 or < 4.7

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for TAX
# Data vectors are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.TAX)
plt.title('Box Plot of TAX')

plt.subplot(1,3,2)
# NOTE(review): distplot is deprecated since seaborn 0.11; consider
# sns.histplot(..., kde=True) once the installed seaborn version is confirmed.
sns.distplot(a=df1.TAX)
plt.title('Distribution Plot of TAX')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.TAX, y=df1.MEDV)
plt.title('Scatter Plot of TAX vs MEDV')
plt.show()

Observations:
- Graph of TAX is NOT normally distributed
- Though Boxplot does not show any outlier but there are some extreme TAX values in the dataset
- From the scatter plot we can observe that for these extreme TAX values, and MEDV ranges from low to high.


In [None]:
# Mask comes from df2 so its index matches the frame being filtered.
temp_df = df2[df2['TAX']>600]
temp_df.shape

- There are total 123 entries in TAX mostly having value 666


In [None]:
temp_df.describe()

Observations:
- RM for these entries lies between 4.88 to 7.39
- MEDV for these entries lies between 14.93 to 29.80.
- It seems impossible to have such high TAX values for all these houses.
- These values are most likely missing values which were imputed casually by someone

Hence, we cannot remove so many values from our dataset

In [None]:
# Separate predictors from the target.
# The first two columns are the features; the last column is the target.
# Selecting the target with a list keeps y as a 2-D column array.
feature_frame = df2.iloc[:, :2]
target_frame = df2.iloc[:, [-1]]
x = feature_frame.to_numpy()
y = target_frame.to_numpy()

In [None]:
print("Shape of Independent variable, x :",x.shape)
print("Shape of Dependent variable, y :",y.shape)

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the standardiser on the full feature matrix x.
# NOTE(review): the scaler is fitted before the train/test split performed
# later in the notebook, so test rows contribute to the scaling statistics.
# Confirm this is acceptable, or fit on the training rows only.
scaler = StandardScaler()
scaler.fit(x)

In [None]:
x

In [None]:
x_scaled = scaler.transform(x)
x_scaled

In [None]:
# Prepend a column of ones (bias term) to the scaled features.
m, n = x_scaled.shape
ones_column = np.ones((m, 1))
x_scaled = np.concatenate((ones_column, x_scaled), axis=1)
x_scaled

In [None]:
x=x_scaled

### 80-20 Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
print(
    f"Shape of x_train = {x_train.shape}",
    f"Shape of x_test = {x_test.shape}",
    f"Shape of y_train = {y_train.shape}",
    f"Shape of y_test = {y_test.shape}",
    sep="\n",
)

In [None]:
from sklearn import datasets, linear_model #Import datasets and linear_model from Sklearn
from sklearn.metrics import mean_squared_error, r2_score #Import metrics to evaluate the model
# Fit a linear regression on the training rows. fit() returns the estimator,
# so construction and fitting are chained; the bare name displays it.
lin_reg_mod = linear_model.LinearRegression().fit(x_train, y_train)
lin_reg_mod

In [None]:
# Score the model on the held-out test rows.
pred = lin_reg_mod.predict(x_test)
mse = mean_squared_error(y_test, pred)
test_set_rmse_21 = np.sqrt(mse)  # root of the mean squared error
test_set_r2_22 = r2_score(y_test, pred)
print("RMSE value:",test_set_rmse_21)
print("R^2 value: ",test_set_r2_22)

- This is a fairly good regression model as the R square score is near 1.0, and the RMSE error is not very large.

In [None]:
# Predicted vs. actual prices on the test split. The grey diagonal is the
# y = x reference (a perfect prediction lies on it), not a fitted line.
plt.scatter(y_test,pred)
plt.plot([0,55], [0,55], ls="-", c=".3")
plt.xlabel('Actual MEDV')
plt.ylabel('Predicted MEDV')
plt.title('Predicted vs Actual MEDV (test set)')
plt.show()

- From the graph, we infer that there's positive linear relationship with the regression fit line
- Looking at how the regression line fits in with the scatter plot, some of the actual value points are above the line, and some are below
- This model does not really fit the data well, as there are difference present between many of the datapoints and the best fit line

### 60-40 Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=42
)
print(
    f"Shape of x_train = {x_train.shape}",
    f"Shape of x_test = {x_test.shape}",
    f"Shape of y_train = {y_train.shape}",
    f"Shape of y_test = {y_test.shape}",
    sep="\n",
)

In [None]:
from sklearn import datasets, linear_model #Import datasets and linear_model from Sklearn
from sklearn.metrics import mean_squared_error, r2_score #Import metrics to evaluate the model
# Fit a linear regression on the training rows. fit() returns the estimator,
# so construction and fitting are chained; the bare name displays it.
lin_reg_mod = linear_model.LinearRegression().fit(x_train, y_train)
lin_reg_mod

In [None]:
# Score the model on the held-out test rows.
pred = lin_reg_mod.predict(x_test)
mse = mean_squared_error(y_test, pred)
test_set_rmse_23 = np.sqrt(mse)  # root of the mean squared error
test_set_r2_24 = r2_score(y_test, pred)
print("RMSE value:",test_set_rmse_23)
print("R^2 value: ",test_set_r2_24)

- This is a fairly good regression model as the R square score is near 1.0, and the RMSE error is not very large.
- But the results of 80-20 split are better than this to train the dataset

In [None]:
# Predicted vs. actual prices on the test split. The grey diagonal is the
# y = x reference (a perfect prediction lies on it), not a fitted line.
plt.scatter(y_test,pred)
plt.plot([0,55], [0,55], ls="-", c=".3")
plt.xlabel('Actual MEDV')
plt.ylabel('Predicted MEDV')
plt.title('Predicted vs Actual MEDV (test set)')
plt.show()

- From the graph, we infer that there's positive linear relationship with the regression fit line
- Looking at how the regression line fits in with the scatter plot, some of the actual value points are above the line, and some are below
- This model does not really fit the data well, as there are difference present between many of the datapoints and the best fit line

## 2nd set : LSTAT and TAX

In [None]:
df1 = df[['LSTAT','TAX','MEDV']]
df1.head()

In [None]:
sns.pairplot(data=df1)

Observations:
- LSTAT shows a positively (right) skewed graph, with its long tail towards high values
- MEDV has a normally distributed graph with outliers present between the range 40-50
- Negative Linear correlation is present between LSTAT and MEDV. There are a few outliers present near 50
- There is no relation between MEDV and TAX
- Normal Distribution is not present in the graph of TAX

In [None]:
df1.describe().round(2)

Observations:
- We can see count of entries for each variable is
same i.e. 506.
- Maximum value in MEDV is much higher than 75% of data points

For LSTAT & MEDV:
- The difference between the min and 50% quartile, and between 50% and max value is unequal
- The mean and 50% value is nearly equal

For TAX:
- There is a significant difference between the mean and 50% quartile value
- The difference between the min and 50% quartile, and between 50% and max value is almost equal
- Hence TAX does not have normal distribution

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for Dependent variable MEDV
# Data vectors are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.MEDV)
plt.title('Box Plot of MEDV')

plt.subplot(1,3,2)
# NOTE(review): distplot is deprecated since seaborn 0.11; consider
# sns.histplot(..., kde=True) once the installed seaborn version is confirmed.
sns.distplot(a=df1.MEDV)
plt.title('Distribution Plot of MEDV')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.MEDV, y=df1.MEDV)
plt.title('Scatter Plot of MEDV vs MEDV')
plt.show()

- MEDV is normally distributed
- It contains some extreme values which could be potential outliers, especially near 50

Hence we have to clean this data by removing outliers

In [None]:
# Keep every row whose MEDV differs from 50.
medv_not_capped = df1['MEDV'] != 50
df2 = df1[medv_not_capped]
df2

In [None]:
print(f'The maximum values of the dataset are:\n{df2.max()}')
print(f'Shape of dataset before removing Outliers: {df1.shape}')
print(f'Shape of dataset after removing Outliers: {df2.shape}')

- Now the maximum value of MEDV column is 48.80
- Hence we have deleted 16 (506-490) rows from our dataset having MEDV value as 50

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for Independent variable LSTAT
# Data vectors are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.LSTAT)
plt.title('Box Plot of LSTAT')

plt.subplot(1,3,2)
# NOTE(review): distplot is deprecated since seaborn 0.11; consider
# sns.histplot(..., kde=True) once the installed seaborn version is confirmed.
sns.distplot(a=df1.LSTAT)
plt.title('Distribution Plot of LSTAT')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.LSTAT, y=df1.MEDV)
plt.title('Scatter Plot of LSTAT vs MEDV')
plt.show()

- We observe that the graph of LSTAT is positively (right) skewed, with outliers present above a value of approximately 31

In [None]:
# Drop rows with LSTAT above 31; the mask is built from df2 so its index
# matches the frame being filtered (no implicit reindexing).
df2 = df2[~(df2['LSTAT']>31)]
df2

In [None]:
print(f'The maximum values of the dataset are:\n{df2.max()}')
print(f'Shape of dataset before removing Outliers: {df1.shape}')
print(f'Shape of dataset after removing Outliers: {df2.shape}')

- Now the maximum value of LSTAT column is 30.81
- Hence we have deleted 23 (506-483) rows in total from our dataset, including those having LSTAT value > 31

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for TAX
# Data vectors are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.TAX)
plt.title('Box Plot of TAX')

plt.subplot(1,3,2)
# NOTE(review): distplot is deprecated since seaborn 0.11; consider
# sns.histplot(..., kde=True) once the installed seaborn version is confirmed.
sns.distplot(a=df1.TAX)
plt.title('Distribution Plot of TAX')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.TAX, y=df1.MEDV)
plt.title('Scatter Plot of TAX vs MEDV')
plt.show()

Observations:
- Graph of TAX is NOT normally distributed
- Though Boxplot does not show any outlier but there are some extreme TAX values in the dataset
- From the scatter plot we can observe that for these extreme TAX values, and MEDV ranges from low to high.

In [None]:
# Mask comes from df2 so its index matches the frame being filtered.
temp_df = df2[df2['TAX']>600]
temp_df.shape

- There are total 126 entries in TAX mostly having value 666

In [None]:
temp_df.describe()

Observations:
- LSTAT for these entries lies between 5.29 and 30.81
- MEDV for these entries lies between 5.00 to 29.80.
- It seems impossible to have such high TAX values for all these houses.
- These values are most likely missing values which were imputed casually by someone

Hence, we cannot remove so many values from our dataset

In [None]:
# Separate predictors from the target.
# The first two columns are the features; the last column is the target.
# Selecting the target with a list keeps y as a 2-D column array.
feature_frame = df2.iloc[:, :2]
target_frame = df2.iloc[:, [-1]]
x = feature_frame.to_numpy()
y = target_frame.to_numpy()

In [None]:
print("Shape of Independent variable, x :",x.shape)
print("Shape of Dependent variable, y :",y.shape)

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the standardiser on the full feature matrix x.
# NOTE(review): the scaler is fitted before the train/test split performed
# later in the notebook, so test rows contribute to the scaling statistics.
# Confirm this is acceptable, or fit on the training rows only.
scaler = StandardScaler()
scaler.fit(x)

In [None]:
x

In [None]:
x_scaled = scaler.transform(x)
x_scaled

In [None]:
# Prepend a column of ones (bias term) to the scaled features.
m, n = x_scaled.shape
ones_column = np.ones((m, 1))
x_scaled = np.concatenate((ones_column, x_scaled), axis=1)
x_scaled

In [None]:
x= x_scaled

### 80-20 Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
print(
    f"Shape of x_train = {x_train.shape}",
    f"Shape of x_test = {x_test.shape}",
    f"Shape of y_train = {y_train.shape}",
    f"Shape of y_test = {y_test.shape}",
    sep="\n",
)

In [None]:
from sklearn import datasets, linear_model #Import datasets and linear_model from Sklearn
from sklearn.metrics import mean_squared_error, r2_score #Import metrics to evaluate the model
# Fit a linear regression on the training rows. fit() returns the estimator,
# so construction and fitting are chained; the bare name displays it.
lin_reg_mod = linear_model.LinearRegression().fit(x_train, y_train)
lin_reg_mod

In [None]:
# Score the model on the held-out test rows.
pred = lin_reg_mod.predict(x_test)
mse = mean_squared_error(y_test, pred)
test_set_rmse_31 = np.sqrt(mse)  # root of the mean squared error
test_set_r2_32 = r2_score(y_test, pred)
print("RMSE value:",test_set_rmse_31)
print("R^2 value: ",test_set_r2_32)

- The R square score is not near 1.0
- The RMSE error is quite large
- Hence this is not a great regression model

In [None]:
# Predicted vs. actual prices on the test split. The grey diagonal is the
# y = x reference (a perfect prediction lies on it), not a fitted line.
plt.scatter(y_test,pred)
plt.plot([0,55], [0,55], ls="-", c=".3")
plt.xlabel('Actual MEDV')
plt.ylabel('Predicted MEDV')
plt.title('Predicted vs Actual MEDV (test set)')
plt.show()

- From the graph, we infer that there's positive linear relationship with the regression fit line
- Looking at how the regression line fits in with the scatter plot, some of the actual value points are above the line, and some are below
- This model does not really fit the data well, as there are difference present between many of the datapoints and the best fit line

### 60-40 Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=42
)
print(
    f"Shape of x_train = {x_train.shape}",
    f"Shape of x_test = {x_test.shape}",
    f"Shape of y_train = {y_train.shape}",
    f"Shape of y_test = {y_test.shape}",
    sep="\n",
)

In [None]:
from sklearn import datasets, linear_model #Import datasets and linear_model from Sklearn
from sklearn.metrics import mean_squared_error, r2_score #Import metrics to evaluate the model
# Fit a linear regression on the training rows. fit() returns the estimator,
# so construction and fitting are chained; the bare name displays it.
lin_reg_mod = linear_model.LinearRegression().fit(x_train, y_train)
lin_reg_mod

In [None]:
# Score the model on the held-out test rows.
pred = lin_reg_mod.predict(x_test)
mse = mean_squared_error(y_test, pred)
test_set_rmse_33 = np.sqrt(mse)  # root of the mean squared error
test_set_r2_34 = r2_score(y_test, pred)
print("RMSE value:",test_set_rmse_33)
print("R^2 value: ",test_set_r2_34)

- The R square score is not near 1.0
- The RMSE error is quite large
- Hence this is not a great regression model
- We also infer that 80-20 split of the dataset showed better results for this set

In [None]:
# Predicted vs. actual prices on the test split. The grey diagonal is the
# y = x reference (a perfect prediction lies on it), not a fitted line.
plt.scatter(y_test,pred)
plt.plot([0,55], [0,55], ls="-", c=".3")
plt.xlabel('Actual MEDV')
plt.ylabel('Predicted MEDV')
plt.title('Predicted vs Actual MEDV (test set)')
plt.show()

- From the graph, we infer that there's positive linear relationship with the regression fit line
- Looking at how the regression line fits in with the scatter plot, some of the actual value points are above the line, and some are below
- This model does not really fit the data well, as there are difference present between many of the datapoints and the best fit line

## 3rd set : RM and PTRATIO

In [None]:
df1 = df[['RM','PTRATIO','MEDV']]
df1.head()

In [None]:
sns.pairplot(data=df1)

Observations:
- RM is normally distributed as it's histogram is a bell shaped curve, but there are very few outliers towards both the ends
- PTRATIO does not show normal distribution. Graph of PTRATIO is negatively (left) skewed, with a few outliers at the low end
- MEDV has a normally distributed graph with outliers present between the range 40-50
- Positive Linear correlation is present between RM and MEDV. There are a few outliers present near 50
- There is no relation between MEDV and PTRATIO

In [None]:
df1.describe().round(2)

Observations:
- We can see count of entries for each variable is
same i.e. 506.
- Maximum value in MEDV is much higher than 75% of data points

For RM & MEDV:
- The difference between the min and 50% quartile, and between 50% and max value, is almost equal
- The mean and 50% value is approximately same
- Hence RM and MEDV have Normal Distribution for their graphs

For PTRATIO:
- The difference between the min and 50% quartile, and between 50% and max value is unequal
- The difference between the mean and 50% quartile value is small
- Hence PTRATIO does not have normal distribution

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for Dependent variable MEDV
# Data vectors are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.MEDV)
plt.title('Box Plot of MEDV')

plt.subplot(1,3,2)
# NOTE(review): distplot is deprecated since seaborn 0.11; consider
# sns.histplot(..., kde=True) once the installed seaborn version is confirmed.
sns.distplot(a=df1.MEDV)
plt.title('Distribution Plot of MEDV')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.MEDV, y=df1.MEDV)
plt.title('Scatter Plot of MEDV vs MEDV')
plt.show()

- MEDV is normally distributed
- It contains some extreme values which could be potential outliers, especially near 50

Hence we have to clean this data by removing outliers

In [None]:
# Keep every row whose MEDV differs from 50.
medv_not_capped = df1['MEDV'] != 50
df2 = df1[medv_not_capped]
df2

In [None]:
print(f'The maximum values of the dataset are:\n{df2.max()}')
print(f'Shape of dataset before removing Outliers: {df1.shape}')
print(f'Shape of dataset after removing Outliers: {df2.shape}')

- Now the maximum value of MEDV column is 48.80
- Hence we have deleted 16 (506-490) rows from our dataset having MEDV value as 50

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for Independent variable RM
# Data vectors are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.RM)
plt.title('Box Plot of RM')

plt.subplot(1,3,2)
# NOTE(review): distplot is deprecated since seaborn 0.11; consider
# sns.histplot(..., kde=True) once the installed seaborn version is confirmed.
sns.distplot(a=df1.RM)
plt.title('Distribution Plot of RM')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.RM, y=df1.MEDV)
plt.title('Scatter Plot of RM vs MEDV')
plt.show()

Observations:
- Graph of RM is normally distributed
- There are some outliers present lower and higher end of RM values in the dataset
- Scatter plot of RM vs MEDV show fairly good Positive Linear Relationship.

In [None]:
# Drop rows with RM above 7.7; the mask is built from df2 so its index
# matches the frame being filtered (no implicit reindexing).
df2 = df2[~(df2['RM']>7.7)]

In [None]:
# Drop rows with RM below 4.7; the mask is built from df2 so its index
# matches the frame being filtered (no implicit reindexing).
df2 = df2[~(df2['RM']<4.7)]

In [None]:
print(f'The maximum values of the dataset are:\n{df2.max()}')
print(f'Shape of dataset before removing Outliers: {df1.shape}')
print(f'Shape of dataset after removing Outliers: {df2.shape}')

- Now the maximum value of RM column is 7.69
- Hence we have deleted 36 (506-470) rows in total from our dataset, including those having RM value > 7.7 or < 4.7

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for PTRATIO
# Data vectors are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.PTRATIO)
plt.title('Box Plot of PTRATIO')

plt.subplot(1,3,2)
# NOTE(review): distplot is deprecated since seaborn 0.11; consider
# sns.histplot(..., kde=True) once the installed seaborn version is confirmed.
sns.distplot(a=df1.PTRATIO)
plt.title('Distribution Plot of PTRATIO')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.PTRATIO, y=df1.MEDV)
plt.title('Scatter Plot of PTRATIO vs MEDV')
plt.show()

In [None]:
# Mask comes from df2 so its index matches the frame being filtered.
temp_df = df2[df2['PTRATIO']<13]
temp_df.shape

In [None]:
# Drop rows with PTRATIO below 13; the mask is built from df2 so its index
# matches the frame being filtered (no implicit reindexing).
df2 = df2[~(df2['PTRATIO']<13)]

In [None]:
print(f'The maximum values of the dataset are:\n{df2.max()}')
print(f'Shape of dataset before removing Outliers: {df1.shape}')
print(f'Shape of dataset after removing Outliers: {df2.shape}')

- Now the maximum value of PTRATIO column is 22
- Hence we have deleted 39 (506-467) rows in total from our dataset, including those having PTRATIO value < 13

In [None]:
# Separate predictors from the target.
# The first two columns are the features; the last column is the target.
# Selecting the target with a list keeps y as a 2-D column array.
feature_frame = df2.iloc[:, :2]
target_frame = df2.iloc[:, [-1]]
x = feature_frame.to_numpy()
y = target_frame.to_numpy()

In [None]:
print("Shape of Independent variable, x :",x.shape)
print("Shape of Dependent variable, y :",y.shape)

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the standardiser on the full feature matrix x.
# NOTE(review): the scaler is fitted before the train/test split performed
# later in the notebook, so test rows contribute to the scaling statistics.
# Confirm this is acceptable, or fit on the training rows only.
scaler = StandardScaler()
scaler.fit(x)

In [None]:
x

In [None]:
x_scaled = scaler.transform(x)
x_scaled

In [None]:
# Prepend a column of ones (bias term) to the scaled features.
m, n = x_scaled.shape
ones_column = np.ones((m, 1))
x_scaled = np.concatenate((ones_column, x_scaled), axis=1)
x_scaled

In [None]:
x=x_scaled

### 80-20 Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
print(
    f"Shape of x_train = {x_train.shape}",
    f"Shape of x_test = {x_test.shape}",
    f"Shape of y_train = {y_train.shape}",
    f"Shape of y_test = {y_test.shape}",
    sep="\n",
)

In [None]:
from sklearn import datasets, linear_model #Import datasets and linear_model from Sklearn
from sklearn.metrics import mean_squared_error, r2_score #Import metrics to evaluate the model
# Fit a linear regression on the training rows. fit() returns the estimator,
# so construction and fitting are chained; the bare name displays it.
lin_reg_mod = linear_model.LinearRegression().fit(x_train, y_train)
lin_reg_mod

In [None]:
# Score the model on the held-out test rows.
pred = lin_reg_mod.predict(x_test)
mse = mean_squared_error(y_test, pred)
test_set_rmse_41 = np.sqrt(mse)  # root of the mean squared error
test_set_r2_42 = r2_score(y_test, pred)
print("RMSE value:",test_set_rmse_41)
print("R^2 value: ",test_set_r2_42)

- The R square score is approx. 0.5 and it's not near 1.0
- The RMSE error is large
- Hence this is not a good regression model

In [None]:
# Predicted vs. actual prices on the test split. The grey diagonal is the
# y = x reference (a perfect prediction lies on it), not a fitted line.
plt.scatter(y_test,pred)
plt.plot([0,55], [0,55], ls="-", c=".3")
plt.xlabel('Actual MEDV')
plt.ylabel('Predicted MEDV')
plt.title('Predicted vs Actual MEDV (test set)')
plt.show()

- We infer that there's positive linear relationship with the regression fit line
- From the graph, some of the actual value points are above the line, and some are below the regression best fit line.
- This model does not fit the data well, as there are difference present between many of the datapoints and the best fit line

### 60-40 Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(x, y, test_size=0.4, random_state=42)
x_train, x_test, y_train, y_test = split_parts
print(
    f"Shape of x_train = {x_train.shape}",
    f"Shape of x_test = {x_test.shape}",
    f"Shape of y_train = {y_train.shape}",
    f"Shape of y_test = {y_test.shape}",
    sep="\n",
)

In [None]:
from sklearn import datasets, linear_model #Import datasets and linear_model from Sklearn
from sklearn.metrics import mean_squared_error, r2_score #Import metrics to evaluate the model
# Create the LinearRegression estimator and train it in one step; fit()
# returns the estimator, which is then displayed as the cell output.
lin_reg_mod = linear_model.LinearRegression().fit(x_train, y_train)
lin_reg_mod

In [None]:
# Score the fitted model on the held-out test rows.
pred = lin_reg_mod.predict(x_test)
test_mse = mean_squared_error(y_test, pred)
test_set_rmse_43 = np.sqrt(test_mse)  # root of the mean squared error
test_set_r2_44 = r2_score(y_test, pred)
print("RMSE value:", test_set_rmse_43)
print("R^2 value: ", test_set_r2_44)

- The R square score is approx. 0.5 and it's not near 1.0
- The RMSE error is large
- Hence this is not a good regression model

In [None]:
# Predicted vs. actual target values for the test set.
plt.scatter(y_test,pred)
# Reference diagonal y = x: a point on this line is predicted exactly.
plt.plot([0,55], [0,55], ls="-", c=".3")
# Label the figure so it can be read on its own, and end with show() so the
# Line2D repr is not echoed as cell output.
plt.xlabel('Actual MEDV')
plt.ylabel('Predicted MEDV')
plt.title('Predicted vs Actual MEDV (test set)')
plt.show()

- We infer that there's positive linear relationship with the regression fit line
- From the graph, some of the actual value points are above the line, and some are below the regression best fit line.
- This model does not fit the data well, as there are differences between many of the datapoints and the best fit line

## 4th set : TAX, LSTAT and PTRATIO

In [None]:
# Keep only the three predictors of this feature set plus the target MEDV.
set_columns = ['TAX','LSTAT','PTRATIO','MEDV']
df1 = df.loc[:, set_columns]
df1.head()

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal) for df1.
sns.pairplot(data=df1)

Observations:
- Normal Distribution is not present in the graph of TAX
- LSTAT shows quite a positively skewed graph (long tail towards high values)
- Graph of PTRATIO is negatively skewed (long tail towards low values) with presence of few outliers
- MEDV has a normally distributed graph with outliers present between the range 40-50
- Negative Linear correlation is present between LSTAT and MEDV. There are a few outliers present near 50
- There is no strong relationship between any other pair of features


In [None]:
# Summary statistics of df1, rounded to two decimals for readability.
df1.describe().round(2)

Observations:
- We can see count of entries for each variable is
same i.e. 506.
- Maximum value in MEDV is much higher than 75% of data points
- The mean and 50% value is nearly equal in value for LSTAT, PTRATIO & MEDV

For LSTAT & MEDV:
- The difference between the min and 50% quartile, and between 50% and max value is unequal

For TAX:
- There is a significant difference between the mean and 50% quartile value
- The difference between the min and 50% quartile, and between 50% and max value is almost equal
- Hence TAX does not have normal distribution

For PTRATIO:
- The difference between the min and the 50% quartile is greater than the difference between the 50% quartile and the max value.
- Hence, the graph of PTRATIO is negatively skewed (its longer tail is on the low side)

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for Dependent variable MEDV
# Data vectors are passed as x=/y= keywords: seaborn >= 0.12 no longer accepts
# them positionally (scatterplot(a, b) raises TypeError there).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; switch to
# histplot(..., kde=True) once the environment's seaborn version is confirmed.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.MEDV)
plt.title('Box Plot of MEDV')

plt.subplot(1,3,2)
sns.distplot(a=df1.MEDV)
plt.title('Distribution Plot of MEDV')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.MEDV, y=df1.MEDV)
plt.title('Scatter Plot of MEDV vs MEDV')
plt.show()

- MEDV is normally distributed
- It contains some extreme values which could be potential outliers, especially near 50

Hence we have to clean this data by removing outliers

In [None]:
# Keep every row whose MEDV is not exactly 50, then display the result.
medv_not_capped = df1['MEDV'] != 50
df2 = df1[medv_not_capped]
df2

In [None]:
# Per-column maxima of the filtered frame, followed by the frame dimensions
# before (df1) and after (df2) filtering.
print(
    f'The maximum values of the dataset are:\n{df2.max()}',
    f'Shape of dataset before removing Outliers: {df1.shape}',
    f'Shape of dataset after removing Outliers: {df2.shape}',
    sep='\n',
)

- Now the maximum value of MEDV column is 48.80
- Hence we have deleted 16 (506-490) rows from our dataset having MEDV value as 50

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for Independent variable LSTAT
# Data vectors are passed as x=/y= keywords: seaborn >= 0.12 no longer accepts
# them positionally (scatterplot(a, b) raises TypeError there).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; switch to
# histplot(..., kde=True) once the environment's seaborn version is confirmed.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.LSTAT)
plt.title('Box Plot of LSTAT')

plt.subplot(1,3,2)
sns.distplot(a=df1.LSTAT)
plt.title('Distribution Plot of LSTAT')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.LSTAT, y=df1.MEDV)
plt.title('Scatter Plot of LSTAT vs MEDV')
plt.show()

- We observe that the graph of LSTAT is positively skewed (long right tail), with outliers present above a value of approximately 31

In [None]:
# Drop rows with LSTAT above 31.
# The mask is built from df2 itself: a mask taken from df1 still carries the
# rows already removed from df2, so pandas would have to reindex it (and emits
# a "Boolean Series key will be reindexed" UserWarning).
df2 = df2[~(df2['LSTAT']>31)]
df2

In [None]:
# Per-column maxima of the filtered frame, followed by the frame dimensions
# before (df1) and after (df2) filtering.
print(
    f'The maximum values of the dataset are:\n{df2.max()}',
    f'Shape of dataset before removing Outliers: {df1.shape}',
    f'Shape of dataset after removing Outliers: {df2.shape}',
    sep='\n',
)

- Now the maximum value of LSTAT column is 30.81
- Hence we have deleted 23 (506-483) rows from our dataset, which now excludes LSTAT values > 31

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for TAX
# Data vectors are passed as x=/y= keywords: seaborn >= 0.12 no longer accepts
# them positionally (scatterplot(a, b) raises TypeError there).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; switch to
# histplot(..., kde=True) once the environment's seaborn version is confirmed.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.TAX)
plt.title('Box Plot of TAX')

plt.subplot(1,3,2)
sns.distplot(a=df1.TAX)
plt.title('Distribution Plot of TAX')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.TAX, y=df1.MEDV)
plt.title('Scatter Plot of TAX vs MEDV')
plt.show()

Observations:
- Graph of TAX is NOT normally distributed
- Although the boxplot does not show any outliers, there are some extreme TAX values in the dataset
- From the scatter plot we can observe that, for these extreme TAX values, MEDV ranges from low to high.

In [None]:
# Rows of the filtered frame with TAX above 600, and how many there are.
# The mask comes from df2 (not df1) so it lines up with the frame being
# indexed and pandas does not have to reindex it with a UserWarning.
temp_df = df2[df2['TAX']>600]
temp_df.shape

In [None]:
# Summary statistics for the rows selected into temp_df.
temp_df.describe()

Observations:
- Almost 126 rows have a TAX value of 666
- LSTAT for these entries lies between 5.29 and 30.81
- MEDV for these entries lies between 5.00 and 29.80
- It seems impossible to have such high TAX values for all these houses
- These values are most likely missing values which were imputed casually by someone

Hence, we cannot remove so many values from our dataset

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for PTRATIO
# Data vectors are passed as x=/y= keywords: seaborn >= 0.12 no longer accepts
# them positionally (scatterplot(a, b) raises TypeError there).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; switch to
# histplot(..., kde=True) once the environment's seaborn version is confirmed.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.PTRATIO)
plt.title('Box Plot of PTRATIO')

plt.subplot(1,3,2)
sns.distplot(a=df1.PTRATIO)
plt.title('Distribution Plot of PTRATIO')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.PTRATIO, y=df1.MEDV)
plt.title('Scatter Plot of PTRATIO vs MEDV')
plt.show()

In [None]:
# Rows of the filtered frame with PTRATIO below 13, and how many there are.
# The mask comes from df2 (not df1) so it lines up with the frame being
# indexed and pandas does not have to reindex it with a UserWarning.
temp_df = df2[df2['PTRATIO']<13]
temp_df.shape

In [None]:
# Drop rows with PTRATIO below 13. The mask is built from df2 itself so its
# index matches the frame being filtered (a df1-based mask forces pandas to
# reindex it and emits a UserWarning).
df2 = df2[~(df2['PTRATIO']<13)]

In [None]:
# Per-column maxima of the filtered frame, followed by the frame dimensions
# before (df1) and after (df2) filtering.
print(
    f'The maximum values of the dataset are:\n{df2.max()}',
    f'Shape of dataset before removing Outliers: {df1.shape}',
    f'Shape of dataset after removing Outliers: {df2.shape}',
    sep='\n',
)

- Now the maximum value of PTRATIO column is 22
- Hence we have deleted 39 (506-467) rows from our dataset, which now excludes PTRATIO values < 13

In [None]:
# Separate the filtered frame into predictors and target:
# the first three columns become x, the last column (kept 2-D) becomes y.
feature_part = df2.iloc[:, :3]
target_part = df2.iloc[:, -1:]
x = feature_part.values
y = target_part.values

In [None]:
# Report the dimensions of the feature matrix and of the target vector.
for shape_label, shape_value in (
    ("Shape of Independent variable, x :", x.shape),
    ("Shape of Dependent variable, y :", y.shape),
):
    print(shape_label, shape_value)

In [None]:
from sklearn.preprocessing import StandardScaler
# fit() returns the scaler itself, so construction and fitting are chained;
# the bare name on the last line shows the same repr the fit() call did.
scaler = StandardScaler().fit(x)
scaler

In [None]:
# Show the feature matrix the scaler was just fitted on.
x

In [None]:
# Apply the fitted scaler to x and display the standardised result.
x_scaled = scaler.transform(x)
x_scaled

In [None]:
# Prepend a column of ones to the scaled features.
# NOTE(review): re-running this cell prepends another column each time, and
# LinearRegression is used with its defaults later on - confirm the extra
# constant column is actually wanted.
m, n = x_scaled.shape
bias_column = np.ones((m, 1))
x_scaled = np.append(bias_column, x_scaled, axis=1)
x_scaled

In [None]:
# From here on `x` refers to the scaled matrix with the leading ones column.
x = x_scaled

### 80-20 Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts
print(
    f"Shape of x_train = {x_train.shape}",
    f"Shape of x_test = {x_test.shape}",
    f"Shape of y_train = {y_train.shape}",
    f"Shape of y_test = {y_test.shape}",
    sep="\n",
)

In [None]:
from sklearn import datasets, linear_model #Import datasets and linear_model from Sklearn
from sklearn.metrics import mean_squared_error, r2_score #Import metrics to evaluate the model
# Create the LinearRegression estimator and train it in one step; fit()
# returns the estimator, which is then displayed as the cell output.
lin_reg_mod = linear_model.LinearRegression().fit(x_train, y_train)
lin_reg_mod

In [None]:
# Score the fitted model on the held-out test rows.
pred = lin_reg_mod.predict(x_test)
test_mse = mean_squared_error(y_test, pred)
test_set_rmse_51 = np.sqrt(test_mse)  # root of the mean squared error
test_set_r2_52 = r2_score(y_test, pred)
print("RMSE value:", test_set_rmse_51)
print("R^2 value: ", test_set_r2_52)

- This is not a great regression model as the R square score is near 1.0, and the RMSE error is not quite large.

In [None]:
# Predicted vs. actual target values for the test set.
plt.scatter(y_test,pred)
# Reference diagonal y = x: a point on this line is predicted exactly.
plt.plot([0,55], [0,55], ls="-", c=".3")
# Label the figure so it can be read on its own, and end with show() so the
# Line2D repr is not echoed as cell output.
plt.xlabel('Actual MEDV')
plt.ylabel('Predicted MEDV')
plt.title('Predicted vs Actual MEDV (test set)')
plt.show()

- We infer that there's positive linear relationship with the regression fit line
- From the graph, some of the actual value points are above the line, and some are below the regression best fit line.
- This model does not fit the data really well, as there are small differences present between many of the datapoints and the best fit line

### 60-40 Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(x, y, test_size=0.4, random_state=42)
x_train, x_test, y_train, y_test = split_parts
print(
    f"Shape of x_train = {x_train.shape}",
    f"Shape of x_test = {x_test.shape}",
    f"Shape of y_train = {y_train.shape}",
    f"Shape of y_test = {y_test.shape}",
    sep="\n",
)

In [None]:
from sklearn import datasets, linear_model #Import datasets and linear_model from Sklearn
from sklearn.metrics import mean_squared_error, r2_score #Import metrics to evaluate the model
# Create the LinearRegression estimator and train it in one step; fit()
# returns the estimator, which is then displayed as the cell output.
lin_reg_mod = linear_model.LinearRegression().fit(x_train, y_train)
lin_reg_mod

In [None]:
# Score the fitted model on the held-out test rows.
pred = lin_reg_mod.predict(x_test)
test_mse = mean_squared_error(y_test, pred)
test_set_rmse_53 = np.sqrt(test_mse)  # root of the mean squared error
test_set_r2_54 = r2_score(y_test, pred)
print("RMSE value:", test_set_rmse_53)
print("R^2 value: ", test_set_r2_54)

- This is not a great regression model as the R square score is near 1.0, and the RMSE error is not quite large.
- But the results of 80-20 split are better than this to train the dataset

In [None]:
# Predicted vs. actual target values for the test set.
plt.scatter(y_test,pred)
# Reference diagonal y = x: a point on this line is predicted exactly.
plt.plot([0,55], [0,55], ls="-", c=".3")
# Label the figure so it can be read on its own, and end with show() so the
# Line2D repr is not echoed as cell output.
plt.xlabel('Actual MEDV')
plt.ylabel('Predicted MEDV')
plt.title('Predicted vs Actual MEDV (test set)')
plt.show()

## 5th set : RM, LSTAT and PTRATIO

In [None]:
# Keep only the three predictors of this feature set plus the target MEDV.
set_columns = ['RM','LSTAT','PTRATIO','MEDV']
df1 = df.loc[:, set_columns]
df1.head()

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal) for df1.
sns.pairplot(data=df1)

Observations:
- RM is normally distributed as its histogram is a bell shaped curve, but there are very few outliers towards both the ends
- LSTAT shows quite a positively skewed graph (long tail towards high values)
- PTRATIO has a negatively skewed graph (long tail towards low values) with few outliers present towards the ends
- MEDV has a normally distributed graph with outliers present between the range 40-50
- Positive linear correlation is present between RM and MEDV. There are a few outliers present near 50
- RM and LSTAT, and MEDV and LSTAT have negative linear relationship between them, along with the presence of few outliers
- Graph of PTRATIO does not show any relationship with RM, LSTAT or MEDV

In [None]:
# Summary statistics of df1, rounded to two decimals for readability.
df1.describe().round(2)

Observations:
- We can see count of entries for each variable is
same i.e. 506.
- Maximum value in MEDV and LSTAT are much higher than 75% of data points

For RM & MEDV:
- The difference between the min and 50% quartile, and between 50% and max value is almost equal
- The mean and 50% value is approximately same
- Hence RM and MEDV have Normal Distribution for their graphs

For LSTAT:
- The difference between the min and 50% quartile, and between 50% and max value is unequal
- There is a significant difference between the mean and 50% quartile value
- Hence LSTAT does not have normal distribution

For PTRATIO:
- The difference between the min and the 50% quartile is greater than the difference between the 50% quartile and the max value
- Hence it has a negatively skewed graph (its longer tail is on the low side)
- Here, the mean and 50% values are also approximately same

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for Dependent variable MEDV
# Data vectors are passed as x=/y= keywords: seaborn >= 0.12 no longer accepts
# them positionally (scatterplot(a, b) raises TypeError there).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; switch to
# histplot(..., kde=True) once the environment's seaborn version is confirmed.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.MEDV)
plt.title('Box Plot of MEDV')

plt.subplot(1,3,2)
sns.distplot(a=df1.MEDV)
plt.title('Distribution Plot of MEDV')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.MEDV, y=df1.MEDV)
plt.title('Scatter Plot of MEDV vs MEDV')
plt.show()

- MEDV is normally distributed
- It contains some extreme values which could be potential outliers, especially near 50

Hence we have to clean this data by removing outliers

In [None]:
# Keep every row whose MEDV is not exactly 50, then display the result.
medv_not_capped = df1['MEDV'] != 50
df2 = df1[medv_not_capped]
df2

In [None]:
# Per-column maxima of the filtered frame, followed by the frame dimensions
# before (df1) and after (df2) filtering.
print(
    f'The maximum values of the dataset are:\n{df2.max()}',
    f'Shape of dataset before removing Outliers: {df1.shape}',
    f'Shape of dataset after removing Outliers: {df2.shape}',
    sep='\n',
)

- Now the maximum value of MEDV column is 48.80
- Hence we have deleted 16 (506-490) rows from our dataset having MEDV value as 50

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for Independent variable RM
# Data vectors are passed as x=/y= keywords: seaborn >= 0.12 no longer accepts
# them positionally (scatterplot(a, b) raises TypeError there).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; switch to
# histplot(..., kde=True) once the environment's seaborn version is confirmed.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.RM)
plt.title('Box Plot of RM')

plt.subplot(1,3,2)
sns.distplot(a=df1.RM)
plt.title('Distribution Plot of RM')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.RM, y=df1.MEDV)
plt.title('Scatter Plot of RM vs MEDV')
plt.show()

Observations:
- Graph of RM is normally distributed
- There are some outliers present lower and higher end of RM values in the dataset
- Scatter plot of RM vs MEDV show good Positive Linear Relationship.

In [None]:
# Rows of the filtered frame with RM above 7.7, and how many there are.
# The mask comes from df2 (not df1) so it lines up with the frame being
# indexed and pandas does not have to reindex it with a UserWarning.
temp_df = df2[df2['RM']>7.7]
temp_df.shape

In [None]:
# Rows of the filtered frame with RM below 4.7, and how many there are.
# The mask comes from df2 (not df1) so it lines up with the frame being
# indexed and pandas does not have to reindex it with a UserWarning.
temp_df1 = df2[df2['RM']<4.7]
temp_df1.shape

In [None]:
# Drop rows with RM above 7.7. The mask is built from df2 itself so its index
# matches the frame being filtered (a df1-based mask forces pandas to reindex
# it and emits a UserWarning).
df2 = df2[~(df2['RM']>7.7)]

In [None]:
# Drop rows with RM below 4.7. The mask is built from df2 itself so its index
# matches the frame being filtered (a df1-based mask forces pandas to reindex
# it and emits a UserWarning).
df2 = df2[~(df2['RM']<4.7)]

In [None]:
# Per-column maxima of the filtered frame, followed by the frame dimensions
# before (df1) and after (df2) filtering.
print(
    f'The maximum values of the dataset are:\n{df2.max()}',
    f'Shape of dataset before removing Outliers: {df1.shape}',
    f'Shape of dataset after removing Outliers: {df2.shape}',
    sep='\n',
)

- Now the maximum value of RM column is 7.69
- Hence we have deleted 36 (506-470) rows from our dataset, which now excludes RM values > 7.7 and < 4.7

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for Independent variable LSTAT
# Data vectors are passed as x=/y= keywords: seaborn >= 0.12 no longer accepts
# them positionally (scatterplot(a, b) raises TypeError there).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; switch to
# histplot(..., kde=True) once the environment's seaborn version is confirmed.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.LSTAT)
plt.title('Box Plot of LSTAT')

plt.subplot(1,3,2)
sns.distplot(a=df1.LSTAT)
plt.title('Distribution Plot of LSTAT')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.LSTAT, y=df1.MEDV)
plt.title('Scatter Plot of LSTAT vs MEDV')
plt.show()

In [None]:
# Drop rows with LSTAT above 31.
# The mask is built from df2 itself: a mask taken from df1 still carries the
# rows already removed from df2, so pandas would have to reindex it (and emits
# a "Boolean Series key will be reindexed" UserWarning).
df2 = df2[~(df2['LSTAT']>31)]
df2

In [None]:
# Per-column maxima of the filtered frame, followed by the frame dimensions
# before (df1) and after (df2) filtering.
print(
    f'The maximum values of the dataset are:\n{df2.max()}',
    f'Shape of dataset before removing Outliers: {df1.shape}',
    f'Shape of dataset after removing Outliers: {df2.shape}',
    sep='\n',
)

- Now the maximum value of LSTAT column is 30.81
- Hence we have deleted 40 (506-466) rows from our dataset, which now excludes LSTAT values > 31

In [None]:
#Box Plot, Distribution Plot and Scatter Plot for PTRATIO
# Data vectors are passed as x=/y= keywords: seaborn >= 0.12 no longer accepts
# them positionally (scatterplot(a, b) raises TypeError there).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; switch to
# histplot(..., kde=True) once the environment's seaborn version is confirmed.
plt.figure(figsize=(20,3))
plt.subplot(1,3,1)
sns.boxplot(x=df1.PTRATIO)
plt.title('Box Plot of PTRATIO')

plt.subplot(1,3,2)
sns.distplot(a=df1.PTRATIO)
plt.title('Distribution Plot of PTRATIO')

plt.subplot(1,3,3)
sns.scatterplot(x=df1.PTRATIO, y=df1.MEDV)
plt.title('Scatter Plot of PTRATIO vs MEDV')
plt.show()

In [None]:
# Drop rows with PTRATIO below 13. The mask is built from df2 itself so its
# index matches the frame being filtered (a df1-based mask forces pandas to
# reindex it and emits a UserWarning).
df2 = df2[~(df2['PTRATIO']<13)]

In [None]:
# Per-column maxima of the filtered frame, followed by the frame dimensions
# before (df1) and after (df2) filtering.
print(
    f'The maximum values of the dataset are:\n{df2.max()}',
    f'Shape of dataset before removing Outliers: {df1.shape}',
    f'Shape of dataset after removing Outliers: {df2.shape}',
    sep='\n',
)

- Now the maximum value of PTRATIO column is 22
- Hence we have deleted 43 (506-463) rows from our dataset, which now excludes PTRATIO values < 13

In [None]:
# Separate the filtered frame into predictors and target:
# the first three columns become x, the last column (kept 2-D) becomes y.
feature_part = df2.iloc[:, :3]
target_part = df2.iloc[:, -1:]
x = feature_part.values
y = target_part.values

In [None]:
# Report the dimensions of the feature matrix and of the target vector.
for shape_label, shape_value in (
    ("Shape of Independent variable, x :", x.shape),
    ("Shape of Dependent variable, y :", y.shape),
):
    print(shape_label, shape_value)

In [None]:
from sklearn.preprocessing import StandardScaler
# fit() returns the scaler itself, so construction and fitting are chained;
# the bare name on the last line shows the same repr the fit() call did.
scaler = StandardScaler().fit(x)
scaler

In [None]:
# Show the feature matrix the scaler was just fitted on.
x

In [None]:
# Apply the fitted scaler to x and display the standardised result.
x_scaled = scaler.transform(x)
x_scaled

In [None]:
# Prepend a column of ones to the scaled features.
# NOTE(review): re-running this cell prepends another column each time, and
# LinearRegression is used with its defaults later on - confirm the extra
# constant column is actually wanted.
m, n = x_scaled.shape
bias_column = np.ones((m, 1))
x_scaled = np.append(bias_column, x_scaled, axis=1)
x_scaled

In [None]:
# From here on `x` refers to the scaled matrix with the leading ones column.
x= x_scaled

### 80-20 Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts
print(
    f"Shape of x_train = {x_train.shape}",
    f"Shape of x_test = {x_test.shape}",
    f"Shape of y_train = {y_train.shape}",
    f"Shape of y_test = {y_test.shape}",
    sep="\n",
)

In [None]:
from sklearn import datasets, linear_model #Import datasets and linear_model from Sklearn
from sklearn.metrics import mean_squared_error, r2_score #Import metrics to evaluate the model
# Create the LinearRegression estimator and train it in one step; fit()
# returns the estimator, which is then displayed as the cell output.
lin_reg_mod = linear_model.LinearRegression().fit(x_train, y_train)
lin_reg_mod

In [None]:
# Score the fitted model on the held-out test rows.
pred = lin_reg_mod.predict(x_test)
test_mse = mean_squared_error(y_test, pred)
test_set_rmse_61 = np.sqrt(test_mse)  # root of the mean squared error
test_set_r2_62 = r2_score(y_test, pred)
print("RMSE value:", test_set_rmse_61)
print("R^2 value: ", test_set_r2_62)

- This is a fairly good regression model as the R square score is near 1.0, and the RMSE error is comparatively not very large.

In [None]:
# Predicted vs. actual target values for the test set.
plt.scatter(y_test,pred)
# Reference diagonal y = x: a point on this line is predicted exactly.
plt.plot([0,55], [0,55], ls="-", c=".3")
# Label the figure so it can be read on its own, and end with show() so the
# Line2D repr is not echoed as cell output.
plt.xlabel('Actual MEDV')
plt.ylabel('Predicted MEDV')
plt.title('Predicted vs Actual MEDV (test set)')
plt.show()

- From the graph, we infer that there's positive linear relationship with the regression fit line
- Looking at how the regression line fits in with the scatter plot, some of the actual value points are above the line, and some are below
- But overall, this model fits the data well, as there is fairly small difference between majority of the datapoints and the best fit line

### 60-40 Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(x, y, test_size=0.4, random_state=42)
x_train, x_test, y_train, y_test = split_parts
print(
    f"Shape of x_train = {x_train.shape}",
    f"Shape of x_test = {x_test.shape}",
    f"Shape of y_train = {y_train.shape}",
    f"Shape of y_test = {y_test.shape}",
    sep="\n",
)

In [None]:
from sklearn import datasets, linear_model #Import datasets and linear_model from Sklearn
from sklearn.metrics import mean_squared_error, r2_score #Import metrics to evaluate the model
# Create the LinearRegression estimator and train it in one step; fit()
# returns the estimator, which is then displayed as the cell output.
lin_reg_mod = linear_model.LinearRegression().fit(x_train, y_train)
lin_reg_mod

In [None]:
# Score the fitted model on the held-out test rows.
pred = lin_reg_mod.predict(x_test)
test_mse = mean_squared_error(y_test, pred)
test_set_rmse_63 = np.sqrt(test_mse)  # root of the mean squared error
test_set_r2_64 = r2_score(y_test, pred)
print("RMSE value:", test_set_rmse_63)
print("R^2 value: ", test_set_r2_64)

- This is a fairly good regression model as the R square score is near 1.0, and the RMSE error is comparatively not very large.

In [None]:
# Predicted vs. actual target values for the test set.
plt.scatter(y_test,pred)
# Reference diagonal y = x: a point on this line is predicted exactly.
plt.plot([0,55], [0,55], ls="-", c=".3")
# Label the figure so it can be read on its own, and end with show() so the
# Line2D repr is not echoed as cell output.
plt.xlabel('Actual MEDV')
plt.ylabel('Predicted MEDV')
plt.title('Predicted vs Actual MEDV (test set)')
plt.show()

- We observe that there's positive linear relationship with the regression fit line
- Looking at how the regression line fits in with the scatter plot, some of the actual value points are above the line, and some are below
- But overall, this model fits the data fairly well, as there is small difference between majority of the datapoints and the best fit line

## CONCLUSION

### Training Set (80% of total data) & Test Set (20% of total data)

In [None]:
# Results table for the 80-20 split: one row per feature set, ordered from
# lowest to highest test RMSE.
feature_sets = ['RM + LSTAT', 'RM + TAX', 'LSTAT + TAX', 'PTRATIO + RM', 'TAX + LSTAT + PTRATIO', 'PTRATIO + RM + LSTAT']
rmse_scores = [test_set_rmse_11, test_set_rmse_21, test_set_rmse_31,
               test_set_rmse_41, test_set_rmse_51, test_set_rmse_61]
r2_scores = [test_set_r2_12, test_set_r2_22, test_set_r2_32,
             test_set_r2_42, test_set_r2_52, test_set_r2_62]
models = pd.DataFrame({
    'Model Features': feature_sets,
    'RMSE Score': rmse_scores,
    'R-squared Score': r2_scores,
})
models.sort_values(by='RMSE Score', ascending=True)

- We know that R-squared is a relative measure of fit, RMSE is an absolute measure of fit for a model.
- RMSE measures how accurately the model predicts the response, thus it is the most important criterion for fit if the main purpose of the model is prediction.

**Hence from the above results, we conclude that for 80-20 Split of the dataset:**
- The model set using the features *PTRATIO, RM and LSTAT* gives the greatest R-Square Score and the Least RMSE Error Score. Hence this is the **best model**, out of all the sets observed in this experiment.
- Following this, *RM and LSTAT* give the best results i.e. the second highest R-squared score and the second lowest RMSE score, out of all the sets observed in this experiment.
- *RM and TAX* is also a good set as it gives good RMSE and R-squared scores compared to the other sets
- *PTRATIO and RM* does not give good results, as it has a very high RMSE value and a very low R-square score. Hence it would not be a good model for prediction purposes

### Training Set (60% of total data) & Test Set (40% of total data)

In [None]:
# Results table for the 60-40 split: one row per feature set, ordered from
# lowest to highest test RMSE.
feature_sets = ['RM + LSTAT', 'RM + TAX', 'LSTAT + TAX', 'PTRATIO + RM', 'TAX + LSTAT + PTRATIO', 'PTRATIO + RM + LSTAT']
rmse_scores = [test_set_rmse_13, test_set_rmse_23, test_set_rmse_33,
               test_set_rmse_43, test_set_rmse_53, test_set_rmse_63]
r2_scores = [test_set_r2_14, test_set_r2_24, test_set_r2_34,
             test_set_r2_44, test_set_r2_54, test_set_r2_64]
models = pd.DataFrame({
    'Model Features': feature_sets,
    'RMSE Score': rmse_scores,
    'R-squared Score': r2_scores,
})
models.sort_values(by='RMSE Score', ascending=True)

**From the above results, we conclude that for 60-40 Split of the dataset:**
- The model set using the features *PTRATIO, RM and LSTAT* gives the greatest R-Square Score and the Least RMSE Error. Hence this is the **best model**, out of all the sets observed in this experiment.
- Following this, *RM and TAX* give good results as it has fairly high R-squared score and low RMSE score, out of all the sets observed in this experiment.
- *RM and LSTAT* is also a good set as it gives good RMSE and R-squared scores compared to the other sets observed.
- *PTRATIO and RM* does not give good results, hence it would not be a good model for prediction purposes. This is because it has a very high RMSE value and a very low R-square score.



---


---





 **CONCLUSION:**
- In this experiment we observe that when the dataset is split in 80:20 ratio, the models give better results than when it is split in 60:40 ratio to train the model on the Machine Learning Algorithm
- The set of PTRATIO, RM and LSTAT gives the best results for Linear Regression in both cases, followed by the sets RM+TAX and RM+LSTAT.
- And the set of PTRATIO and RM gives the worst results in both cases.