In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Column labels supplied explicitly, so the file is read as headerless.
# MEDV (last column) is the regression target used in later cells.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]

# r"\s+" splits fields on any run of whitespace (spaces or tabs).
data = pd.read_csv("housing.csv", names=column_names, delimiter=r"\s+")

# Quick sanity check: first rows, (rows, columns) and per-column dtypes.
for summary in (data.head(), data.shape, data.dtypes):
    print(summary)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
# One box plot per column on a 2 x 7 grid to eyeball the spread of the data.
fig, axs = plt.subplots(nrows=2, ncols=7, figsize=(20, 10))
axs = axs.flatten()
for panel, column in enumerate(data.columns):
    sns.boxplot(y=column, data=data, ax=axs[panel])
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)


In [None]:
# Percentage of rows, per column, that lie outside the 1.5 * IQR fences
# (below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR).
n_rows = data.shape[0]
for k, v in data.items():
    q1 = v.quantile(0.25)
    q3 = v.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    # Strict comparisons: with an inclusive test a column whose IQR is 0
    # (e.g. a mostly-constant indicator) would have every value equal to the
    # quartiles flagged, reporting ~100% outliers.
    v_col = v[(v < lower_fence) | (v > upper_fence)]  # the outliers themselves
    perc = v_col.shape[0] * 100.0 / n_rows
    print("Column %s outliers = %.2f%%" % (k, perc))



In [None]:
# Distribution of every column: one histogram per panel on a 2 x 7 grid.
fig, axs = plt.subplots(nrows=2, ncols=7, figsize=(20, 10))
axs = axs.flatten()
for panel, (_, series) in enumerate(data.items()):
    sns.histplot(series, ax=axs[panel])
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)


In [None]:
# Treat rows whose target MEDV is at or above 50.0 as target outliers
# (unusually expensive houses) and drop them. The mask is written as a
# negation so rows with a missing MEDV, if any, are kept.
keep_rows = ~data['MEDV'].ge(50.0)
data = data[keep_rows]
print(data.shape)

In [None]:
# Pairwise correlations with the sign dropped: corr() ranges from -1 to 1,
# and .abs() keeps only the strength of each relationship for the heatmap.
corr_strength = data.corr().abs()
plt.figure(figsize=(20, 10))
sns.heatmap(corr_strength, annot=True)

In [None]:
from sklearn import preprocessing
# Compare each selected feature with the target, MEDV.
column_sels = ['LSTAT', 'INDUS', 'NOX', 'PTRATIO', 'RM', 'TAX', 'DIS', 'AGE']
# .loc[:, column_sels] keeps every row and only the selected columns.
# The explicit copy makes x independent of data, because a later cell
# assigns transformed columns back onto x.
x = data.loc[:, column_sels].copy()
y = data['MEDV']

# Min-max scaled version of the features (MinMaxScaler's default range is
# [0, 1]). The resulting frame gets a fresh 0..n-1 index, whereas x and y
# keep the index of data.
min_max_scaler = preprocessing.MinMaxScaler()
X = pd.DataFrame(data=min_max_scaler.fit_transform(x), columns=column_sels)

# Scatter plot plus fitted regression line of the target against each
# (unscaled) feature, one panel per feature on a 2 x 4 grid.
fig, axs = plt.subplots(ncols=4, nrows=2, figsize=(20, 10))
axs = axs.flatten()
for i, k in enumerate(column_sels):
    sns.regplot(y=y, x=x[k], ax=axs[i])
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)



In [None]:
# Log-transform of the target. log1p(v) = log(1 + v), so it stays defined
# at v == 0. NOTE(review): the model cells below are still cross-validated
# against y, not l_y -- confirm which target is intended.
l_y = np.log1p(y)

# Columns whose absolute skewness exceeds this value get log-transformed.
SKEW_THRESHOLD = 0.3

# Rebuild x from data first so re-running this cell never applies the log
# transform twice to the same column.
x = data.loc[:, column_sels].copy()
for col in x.columns:  # .columns is an attribute, not a method
    # abs() must wrap the skewness itself, not the boolean comparison,
    # otherwise negatively skewed columns are never considered.
    if np.abs(x[col].skew()) > SKEW_THRESHOLD:
        x[col] = np.log1p(x[col])
        




In [None]:
from sklearn import linear_model,datasets
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Baseline model: ordinary least squares, scored with 10-fold cross-validation.
l_regr = linear_model.LinearRegression()
Kf = KFold(n_splits=10)

# Rescale the features with a freshly fitted min-max scaler.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

# The scorer is the *negated* MSE, so the printed values are <= 0 and a
# value closer to 0 means a better fit.
scores = cross_val_score(
    l_regr,
    x_scaled,
    y,
    cv=Kf,
    scoring="neg_mean_squared_error",
)
fold_mean, fold_std = scores.mean(), scores.std()
print("MSE: %0.2f (+/- %0.2f)" % (fold_mean, fold_std))


In [None]:
# scores_map collects the per-fold scores of every model, keyed by model
# name, so they can be compared at the end of the notebook.
scores_map = {}
scores_map["LinearRegression"] = scores  # scores from the previous cell

# Ridge (L2-regularised linear regression) on the same folds.
l_ridge = linear_model.Ridge()
scores = cross_val_score(l_ridge, x_scaled, y, cv=Kf, scoring="neg_mean_squared_error")
# Store in the dict: `scores` is a NumPy array and cannot be indexed by a string.
scores_map["Ridge"] = scores
print("MSE: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))


In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
# Sweep polynomial degrees 2..5 for a Ridge model and print the
# cross-validated score of each, in increasing degree order.
for degree in range(2, 6):
    model = make_pipeline(PolynomialFeatures(degree=degree), linear_model.Ridge())
    scores = cross_val_score(model, x_scaled, y, cv=Kf, scoring='neg_mean_squared_error')
    print("MSE: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

# TODO: choose the degree from the sweep above; degree 3 is hard-coded for now.
# This fit runs once, after the sweep, rather than on every loop iteration.
model = make_pipeline(PolynomialFeatures(degree=3), linear_model.Ridge())
scores = cross_val_score(model, x_scaled, y, cv=Kf, scoring="neg_mean_squared_error")
scores_map["PolyRidge"] = scores  # recorded like every other model
print("MSE: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))


      
   

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# Support-vector regression with an RBF kernel, which lets the model pick up
# non-linear patterns in the features.
# TODO: C and gamma are fixed by hand; a GridSearchCV over
# C in [1e0, 1e1, 1e2, 1e3] and gamma in np.logspace(-2, 2, 5) (cv=Kf,
# scoring='neg_mean_squared_error') was sketched here but is not run.
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
scores = cross_val_score(
    svr_rbf,
    x_scaled,
    y,
    cv=Kf,
    scoring='neg_mean_squared_error',
)
scores_map['SVR'] = scores
fold_mean, fold_std = scores.mean(), scores.std()
print("MSE: %0.2f (+/- %0.2f)" % (fold_mean, fold_std))

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Depth-limited regression tree scored on the same folds as the other models.
desc_tr = DecisionTreeRegressor(max_depth=5)
scores = cross_val_score(desc_tr, x_scaled, y, cv=Kf, scoring="neg_mean_squared_error")
# Key by the model's name as a string, like every other scores_map entry
# (the class object itself would become an unreadable column label).
scores_map["DecisionTreeRegressor"] = scores
print("MSE : %0.2f(+/- %0.2f )" % (scores.mean(), scores.std()))


In [None]:
from sklearn.neighbors import KNeighborsRegressor
# k-nearest-neighbours regression (k = 7) scored on the same folds.
knn = KNeighborsRegressor(n_neighbors=7)
# The scorer name must be spelled exactly "neg_mean_squared_error";
# an unknown name makes cross_val_score raise a ValueError.
scores = cross_val_score(knn, x_scaled, y, cv=Kf, scoring="neg_mean_squared_error")
# scores_map holds each algorithm's fold scores for the final comparison.
scores_map["KNeighborsRegressor"] = scores
print("MSE : %0.2f (+/- %0.2f ) " % (scores.mean(), scores.std()))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient-boosted shallow trees; random_state is fixed so the fold scores
# are reproducible between runs.
grb = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=2,
    min_samples_split=2,
    min_samples_leaf=5,
    alpha=0.9,
    random_state=30,
)
scores = cross_val_score(
    grb,
    x_scaled,
    y,
    cv=Kf,
    scoring="neg_mean_squared_error",
)
scores_map["GradientBoostingRegressor"] = scores
fold_mean, fold_std = scores.mean(), scores.std()
print("MSE : %0.2f (+/- %0.2f)" % (fold_mean, fold_std))


In [None]:
# Final comparison: one box per model showing the spread of its per-fold
# negated-MSE scores (closer to 0 is better).
plt.figure(figsize=(20, 10))
# Build the frame from the whole scores_map (one column per model), not from
# `scores`, which only holds the last model's folds.
scores_map = pd.DataFrame(scores_map)
sns.boxplot(data=scores_map)
