In [97]:
from google.colab import data_table
data_table.enable_dataframe_formatter()

In [98]:
cd '/content/drive/MyDrive/Data Analytics/Datasets'

/content/drive/MyDrive/Data Analytics/Datasets


In [99]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

boston = pd.read_csv("boston.csv", header = None, delimiter = r"\s+", names = column_names)

boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


## Splitting features from the target variable

In [100]:
x = boston.iloc[:, :13]
y = pd.DataFrame(boston.iloc[:, 13], columns = ['MEDV'])

In [101]:
x.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33


In [102]:
y.head()

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


## Splitting Training and Testing Data

In [103]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0)

## Initializing and fitting data to Linear Regression model

In [104]:
model = LinearRegression()
model.fit(x_train, y_train)

## Predicting for x_test

In [105]:
y_predict = model.predict(x_test)

## Analyzing model scores

In [106]:
test_score = model.score(x_test, y_test)
train_score = model.score(x_train, y_train)
print("Score on training data: ", test_score)
print("Score in testing data: ", train_score)

Score on training data:  0.6354638433202128
Score in testing data:  0.7697699488741149


## Adding new features

In [107]:
boston_engg = boston.iloc[:, :-1].copy()
boston_engg.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33


In [108]:
new_columns = column_names[:-1].copy()
print(len(column_names))
print(len(new_columns))

14
13


## Generating new columns and adding to new dataframe

In [109]:
#length of columns
n = len(new_columns)

#generating new column data from original df - boston, and adding to new dataframe - boston_engg
for i in range(n):
  for j in range(i, n):
    col = new_columns[i] + '_' + new_columns[j]
    boston_engg[col] = boston[new_columns[i]] * boston[new_columns[j]]

#Adding Traget Variable to engineered DataFrame
boston_engg['MEDV'] = boston['MEDV']

## Splitting new dataframe into feature dataset and target variable

In [110]:
x2 = boston_engg.iloc[:, :-1]
y2 = pd.DataFrame(boston_engg.iloc[:, -1])

In [111]:
x2.head()



Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,...,TAX_TAX,TAX_PTRATIO,TAX_B,TAX_LSTAT,PTRATIO_PTRATIO,PTRATIO_B,PTRATIO_LSTAT,B_B,B_LSTAT,LSTAT_LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,...,87616.0,4528.8,117482.4,1474.08,234.09,6072.57,76.194,157529.61,1976.562,24.8004
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,...,58564.0,4307.6,96049.8,2211.88,316.84,7064.82,162.692,157529.61,3627.666,83.5396
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,...,58564.0,4307.6,95064.86,975.26,316.84,6992.374,71.734,154315.4089,1583.1049,16.2409
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,...,49284.0,4151.4,87607.86,652.68,349.69,7379.581,54.978,155732.8369,1160.2122,8.6436
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,...,49284.0,4151.4,88111.8,1183.26,349.69,7422.03,99.671,157529.61,2115.477,28.4089


In [112]:
y2.head()

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


## Splitting testing and training data

In [113]:
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, random_state = 0)

## Initializing and fitting data to Linear Regression model

In [114]:
model = LinearRegression()
model.fit(x2_train, y2_train)

## Predicting for x_test

In [115]:
y2_predict = model.predict(x2_test)

## Analyzing model scores

In [116]:
test_score_2 = model.score(x2_test, y2_test)
train_score_2 = model.score(x2_train, y2_train)
print("Score on training data: ", test_score_2)
print("Score in testing data: ", train_score_2)

Score on training data:  0.6074721959846032
Score in testing data:  0.9520519609032729


# Comparing Test Score and Train Scores with original dataset and dataset with added features

## Original dataset

In [117]:
print("Score on training data: ", test_score)
print("Score in testing data: ", train_score)

Score on training data:  0.6354638433202128
Score in testing data:  0.7697699488741149


## Engineered Dataset

In [118]:
print("Score on training data: ", test_score_2)
print("Score in testing data: ", train_score_2)

Score on training data:  0.6074721959846032
Score in testing data:  0.9520519609032729
