In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the dataset from the relative path 'data.csv' and preview its first rows.
data = pd.read_csv('data.csv')
data.head()

In [None]:
#Data preparation
# Print the frame summary (columns, dtypes, non-null counts) to spot missing values.
data.info()

In [None]:
# Fill the gaps in every column that has missing entries with the mean of that
# column's observed (non-null) values.
# NOTE(review): the means are computed on the full frame, before the train/test
# split performed later in the notebook — confirm that is intended.
for col in data.columns:
    column = data[col]
    if column.isnull().sum() != 0:
        observed_mean = column.dropna().mean()
        data[col] = column.fillna(observed_mean)

In [None]:
# Print the frame summary again to check the non-null counts after the fill step.
data.info()

In [None]:
#Analyzing data

# One panel per feature: plotting both on the current axes would overlay them.
# x/y are passed by keyword because recent seaborn releases no longer accept
# them positionally.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.scatterplot(x=data['CRIM'], y=data['MEDV'], ax=axes[0])
sns.scatterplot(x=data['ZN'], y=data['MEDV'], ax=axes[1])
plt.show()

In [None]:
# One panel per feature: plotting both on the current axes would overlay them.
# x/y are passed by keyword because recent seaborn releases no longer accept
# them positionally.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.scatterplot(x=data['INDUS'], y=data['MEDV'], ax=axes[0])
sns.scatterplot(x=data['CHAS'], y=data['MEDV'], ax=axes[1])
plt.show()

In [None]:
# One panel per feature: plotting both on the current axes would overlay them.
# x/y are passed by keyword because recent seaborn releases no longer accept
# them positionally.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.scatterplot(x=data['NOX'], y=data['MEDV'], ax=axes[0])
sns.scatterplot(x=data['RM'], y=data['MEDV'], ax=axes[1])
plt.show()

In [None]:
# One panel per feature: plotting both on the current axes would overlay them.
# x/y are passed by keyword because recent seaborn releases no longer accept
# them positionally.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.scatterplot(x=data['AGE'], y=data['MEDV'], ax=axes[0])
sns.scatterplot(x=data['DIS'], y=data['MEDV'], ax=axes[1])
plt.show()

In [None]:
#Splitting the data to work with
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the target.
x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)


In [None]:
#Scaling data
from sklearn.preprocessing import StandardScaler

# Learn the mean and standard deviation from the training split only, then apply
# that same transformation to the test split. Re-fitting the scaler on the test
# split would put the two splits on different scales.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
#Fitting the model
from sklearn.linear_model import LinearRegression

# Linear-regression baseline trained on the training split.
regressor = LinearRegression()
regressor.fit(X=x_train, y=y_train)

In [None]:
from sklearn.metrics import r2_score

# Predict on the test split and print the r2_score result scaled to a percentage.
y_pred = regressor.predict(x_test)
accuracy = r2_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:",accuracy*100,"%")

In [None]:
# Show the first ten predictions next to the true targets, one pair per row.
print('y_pred','y_test','\n')
print(
    np.concatenate(
        (y_pred.reshape(-1, 1), y_test.reshape(-1, 1)),
        axis=1,
    )[:10]
)

In [None]:
#Trying another model
from sklearn.ensemble import RandomForestRegressor

# Seeded so the comparison against the linear model gives the same result on
# every run; an unseeded forest changes from one execution to the next.
regressor = RandomForestRegressor(random_state=0)
regressor.fit(x_train,y_train)

In [None]:
# Predict on the test split and print the r2_score result scaled to a percentage.
y_pred = regressor.predict(x_test)
accuracy = r2_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:",accuracy*100,"%")

# The Random Forest worked better
## Now let's check different parameter combinations


In [None]:
# Candidate random-forest configurations to compare; all share random_state=0.
# Nothing is trained in this cell: score_model() fits each candidate itself.
regressor_1 = RandomForestRegressor(n_estimators=50, random_state=0)
regressor_2 = RandomForestRegressor(n_estimators=100, random_state=0)
regressor_3 = RandomForestRegressor(n_estimators=100, criterion = 'absolute_error', random_state=0)
regressor_4 = RandomForestRegressor(n_estimators=200, min_samples_split =20, random_state=0)
regressor_5 = RandomForestRegressor(n_estimators=70, max_depth = 7, random_state=0)

models = [regressor_1, regressor_2, regressor_3, regressor_4, regressor_5]

In [None]:
def score_model(model, x_t = x_train, x_v = x_test, y_t = y_train, y_v = y_test):
    """Fit ``model`` and return its r2_score on the validation data.

    Args:
        model: estimator exposing ``fit(x, y)`` and ``predict(x)``.
        x_t, y_t: features and targets used for fitting.
        x_v, y_v: features and targets used for scoring.

    The defaults are the notebook's train/test arrays as they exist when this
    cell is executed.

    Returns:
        The value of ``r2_score(y_v, model.predict(x_v))``.
    """
    model.fit(x_t, y_t)
    return r2_score(y_v, model.predict(x_v))

# Fit and score each candidate in list order, printing one result per model.
for candidate in models:
    accuracy = score_model(candidate)
    print("Accuracy:",accuracy*100,"%")

#### Trying XGBoost before deciding on the final model

In [None]:
import xgboost as xgb

# Wrap the training split in an XGBoost DMatrix.
# NOTE(review): data_dmatrix is not referenced by the cells visible here — confirm it is still needed.
data_dmatrix = xgb.DMatrix(x_train, label=y_train)

In [None]:
#Creating the best parameter model
# Sweep max_delta_step from 1 up to (not including) 25 in steps of 0.1, printing
# every score and flagging each value that sets a new best.
# NOTE(review): candidates are scored on x_test/y_test, so the chosen value is
# tuned to the test split — consider a validation split or cross-validation.
best_score = 0.0  # not named `max`, which would shadow the builtin for later cells
for i in np.arange(1, 25, 0.1):
    xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 0.1, max_depth = 7, max_delta_step = i, random_state = 2)
    xg_reg.fit(x_train,y_train)
    preds = xg_reg.predict(x_test)
    score = r2_score(y_test, preds)*100  # computed once, reused for print and comparison
    print(score)
    if score > best_score:
        best_score = score
        print('here', i)


### Found the best XGBoost model at 14.3 max delta step 

In [None]:
# Sweep max_depth over 1..24 with max_delta_step fixed at 14.3, printing every
# score and flagging each value that sets a new best.
# NOTE(review): candidates are scored on x_test/y_test, so the chosen value is
# tuned to the test split — consider a validation split or cross-validation.
best_score = 0.0  # not named `max`, which would shadow the builtin for later cells
for i in range(1, 25):
    xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 0.1, max_depth = i, max_delta_step = 14.3, random_state = 2)
    xg_reg.fit(x_train,y_train)
    preds = xg_reg.predict(x_test)
    score = r2_score(y_test, preds)*100  # computed once, reused for print and comparison
    print(score)
    if score > best_score:
        best_score = score
        print('here', i)

### Found the best XGBoost model at 4 max depth

In [None]:
#Adding new parameter
# Sweep gamma from 0 up to (not including) 20 in steps of 0.1 with the other
# parameters fixed, printing every score and flagging each new best.
# NOTE(review): candidates are scored on x_test/y_test, so the chosen value is
# tuned to the test split — consider a validation split or cross-validation.
best_score = 0.0  # not named `max`, which would shadow the builtin for later cells
for i in np.arange(0, 20, 0.1):
    xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 0.1, max_depth = 4, max_delta_step = 14.3, random_state = 2, gamma = i)
    xg_reg.fit(x_train,y_train)
    preds = xg_reg.predict(x_test)
    score = r2_score(y_test, preds)*100  # computed once, reused for print and comparison
    print(score)
    if score > best_score:
        best_score = score
        print('here', i)

### Found the best XGBoost model at 2.1 gamma

In [None]:
# Sweep subsample from 0 up to (not including) 1 in steps of 0.0001 with the other
# parameters fixed, printing every score and flagging each new best.
# NOTE(review): this trains 10,000 models and prints a line for each one.
# NOTE(review): the grid starts at subsample = 0; XGBoost documents the range as
# (0, 1] — confirm whether 0 belongs in the grid.
# NOTE(review): candidates are scored on x_test/y_test, so the chosen value is
# tuned to the test split — consider a validation split or cross-validation.
best_score = 0.0  # not named `max`, which would shadow the builtin for later cells
for i in np.arange(0, 1, 0.0001):
    xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 0.1, max_depth = 4, max_delta_step = 14.3, random_state = 2, gamma = 2.1, subsample = i)
    xg_reg.fit(x_train,y_train)
    preds = xg_reg.predict(x_test)
    score = r2_score(y_test, preds)*100  # computed once, reused for print and comparison
    print(score)
    if score > best_score:
        best_score = score
        print('here', i)

### Found the best XGBoost model at 0.8503 subsample

In [None]:
# Sweep reg_alpha from 0 up to (not including) 20 in steps of 0.01 with the other
# parameters fixed, printing every score and flagging each new best.
# NOTE(review): candidates are scored on x_test/y_test, so the chosen value is
# tuned to the test split — consider a validation split or cross-validation.
best_score = 0.0  # not named `max`, which would shadow the builtin for later cells
for i in np.arange(0, 20, 0.01):
    xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 0.1, max_depth = 4, max_delta_step = 14.3, random_state = 2, gamma = 2.1, subsample = 0.8503, reg_alpha = i)
    xg_reg.fit(x_train,y_train)
    preds = xg_reg.predict(x_test)
    score = r2_score(y_test, preds)*100  # computed once, reused for print and comparison
    print(score)
    if score > best_score:
        best_score = score
        print('here', i)

### Found the best XGBoost model at 3.02 reg_alpha

In [None]:
# Prints the built-in help text for xgb.XGBRegressor.
# NOTE(review): looks like a development aid — consider removing it from the finished notebook.
help(xgb.XGBRegressor)

# Conclusion: The best model is the one below, with an accuracy of 91.11
#### These are the parameters: xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 0.1, max_depth = 4, max_delta_step = 14.3, random_state = 2, gamma = 2.1, subsample = 0.8503, reg_alpha = 3.02)

In [None]:
# Rebuild the selected XGBoost configuration, train it on the training split,
# and print its r2_score on the test split scaled to a percentage.
regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=4,
    max_delta_step=14.3,
    random_state=2,
    gamma=2.1,
    subsample=0.8503,
    reg_alpha=3.02,
)
regressor.fit(x_train, y_train)

preds = regressor.predict(x_test)
accuracy = r2_score(y_test, preds)

print("Accuracy:",accuracy*100,"%")