In [None]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_recall_fscore_support
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Load the Boston house-price data from the Kaggle input directory.
data = pd.read_csv(filepath_or_buffer='/kaggle/input/the-boston-houseprice-data/boston.csv')

In [None]:
# Display the freshly loaded DataFrame as the cell output.
data

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(10, 8))
heat_map_data = data.corr()
sns.heatmap(data=heat_map_data, annot=True)

> Here `RAD` and `TAX` have the highest correlation (0.91), so we can remove either one of them.

In [None]:
# Drop TAX and DIS from the working frame.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError for columns that are
# already gone.
# NOTE(review): the note above only motivates removing TAX (or RAD); the reason
# for also dropping DIS is not stated in the notebook - confirm it is intended.
data = data.drop(columns=['TAX', 'DIS'], errors='ignore')

In [None]:
# Display the DataFrame again to inspect it after the TAX/DIS columns were dropped.
data

In [None]:
# Number of columns still present in the frame.
data.shape[1]

In [None]:
# Correlation of every column with MEDV, placed on the diagonal of a square
# matrix (zeros elsewhere) so the heatmap annotates one cell per column.
plt.figure(figsize=(10, 8))
n_features = len(data.columns)
heat_map_data = np.identity(n_features) * data.corrwith(data['MEDV']).values
diagonal_frame = pd.DataFrame(
    heat_map_data,
    columns=data.columns,
    index=['MEDV'] * n_features,  # every row is labelled with the target name
)
g = sns.heatmap(diagonal_frame, annot=True, cmap='coolwarm')
g.set_yticklabels(g.get_yticklabels(), rotation=45)
plt.show()

> Here the target variable `MEDV` has the weakest correlation with `CHAS`, so `CHAS` can be removed.

In [None]:
# Drop CHAS from the working frame.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent, so re-running it does not raise KeyError once CHAS is gone.
data = data.drop(columns=['CHAS'], errors='ignore')

In [None]:
# Display the DataFrame to inspect it after CHAS was dropped.
data

> We will now plot the distribution of every column in the data.

In [None]:
import warnings

# Silence every warning from this point on (the filter is process-wide, so it
# also applies to all later cells).
warnings.filterwarnings('ignore')

# One distribution plot per column on a 5 x 3 subplot grid.
# NOTE(review): recent seaborn releases mark `distplot` as deprecated - confirm
# against the installed version before relying on it.
plt.figure(figsize=(20, 15))
for i, column in enumerate(data.columns):
    plt.subplot(5, 3, i + 1)
    sns.distplot(data[column])

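> There are a lot of outliers in the columns `CRIM`, `ZN`, `RM`, `DIS`, `B` and `LSTAT` (note that `DIS` was already dropped above, so it no longer appears in these plots).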

In [None]:
# Remove the most extreme outlier and replace the remaining outliers with the
# column mean. A value counts as an outlier when it lies on or outside the
# 1.5 * IQR fences computed from the column's quartiles.
for column in data.columns:
    q1 = data[column].quantile(.25)
    q3 = data[column].quantile(.75)
    iqr = q3 - q1
    lower_fence = q1 - (1.5 * iqr)
    upper_fence = q3 + (1.5 * iqr)

    is_outlier = (data[column] <= lower_fence) | (data[column] >= upper_fence)
    outliers = int(is_outlier.sum())
    print(f"{column} has {((outliers/data[column].count()) * 100):.02f} % of outliers")

    x = data.loc[is_outlier, column]
    if len(x) >= 1 and column != 'CHAS':
        # Outlier row labels ordered from the largest value to the smallest.
        ranked_labels = x.sort_values(ascending=False).index
        # The row holding the largest outlier is removed from the whole frame ...
        data.drop(ranked_labels[0], inplace=True)
        # ... and the other outliers are overwritten with the column mean (taken
        # after the drop). `.loc` writes into `data` itself; the previous chained
        # form `data[column][labels] = ...` could assign to a temporary copy and
        # is a silent no-op under pandas copy-on-write.
        data.loc[ranked_labels[1:], column] = data[column].mean()

    print("After updating/removing outliers : ")
    # The fences are intentionally NOT recomputed: the report uses the same
    # thresholds as before the update.
    is_outlier = (data[column] <= lower_fence) | (data[column] >= upper_fence)
    outliers = int(is_outlier.sum())
    print(f"{column} has {((outliers/data[column].count()) * 100):.02f} % of outliers")
    print("_"*50)

In [None]:
# Box plot of every column of `data`, one per axis on a 2 x 7 grid.
fig, ax = plt.subplots(nrows=2, ncols=7, figsize=(20, 15))
ax = ax.flatten()

for index, col in enumerate(data.columns):
    sns.boxplot(y=col, data=data, ax=ax[index])
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)

In [None]:
# Display the DataFrame to inspect it after the outlier handling above.
data

In [None]:
# sns.boxplot(x= 'CHAS', y='AGE', hue = 'CHAS', data=data, palette='Set3')
# sns.boxplot(x= 'RM', data=data)

In [None]:
# sns.displot(data, x='RM', y='AGE', hue='CHAS')

In [None]:
# Features are all columns but the last; the target is the last column.
data_X, data_y = data.iloc[:, :-1], data.iloc[:, -1]

In [None]:
# Box plot of every feature column of `data_X`, one per axis on a 2 x 7 grid.
fig, ax = plt.subplots(nrows=2, ncols=7, figsize=(20, 15))
ax = ax.flatten()

for index, col in enumerate(data_X.columns):
    sns.boxplot(y=col, data=data_X, ax=ax[index])
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)

In [None]:
# Ordinary least squares, fitted on the complete feature/target data.
model_ols = LinearRegression().fit(X=data_X, y=data_y)

In [None]:
# R^2 of the OLS model on the same data it was fitted on.
r2_score(data_y, model_ols.predict(data_X))

In [None]:
# Cross-validated learning curve for the OLS model (default sizes and folds).
size, train_scores, valid_scores = learning_curve(estimator=model_ols, X=data_X, y=data_y)

In [None]:
# Training scores against training-set size (one line per column of train_scores).
plt.gca().plot(size, train_scores)
plt.gca().set_xlabel('Training data size')
plt.gca().set_ylabel('Score')

In [None]:
# Validation scores against training-set size (one line per column of valid_scores).
# `size` holds the training-set sizes returned by learning_curve, so the x axis
# is labelled as the training size rather than a validation size.
plt.plot(size, valid_scores)
plt.xlabel('Training data size')
plt.ylabel('Score')

In [None]:
# learning_curve returns score arrays of shape (n_train_sizes, n_cv_folds).
# Indexing row 0 would plot the per-fold scores of the FIRST training size
# against all sizes; averaging over the fold axis gives one point per size.
plt.plot(size, train_scores.mean(axis=1), label='Training curve', color='red')
plt.plot(size, valid_scores.mean(axis=1), label='Cross validation score', color='green')
plt.legend()
plt.xlabel('Dataset size')
plt.ylabel('Score')

In [None]:
# Training and validation scores drawn together on the current axes.
plt.gca().plot(size, train_scores, label='Training curve')
plt.gca().plot(size, valid_scores, label='Cross validation score')
plt.gca().legend()

In [None]:
# For every feature: actual prices, model predictions and a straight-line fit
# of price against that single feature, each on its own subplot.
plt.figure(figsize=(20, 30))

for i, column in enumerate(data_X.columns):
    plt.subplot(5, 3, i + 1)

    # Rows with distinct values of this feature; computed once and reused for
    # the predicted and actual scatter series below.
    unique_rows = data_X.drop_duplicates(subset=[column])
    feature_values = np.unique(data_X[column])

    plt.plot(data_X[column], data_y, 'o', label=f'{column} actual data')
    # scatter plot for predicted data
    plt.plot(unique_rows[column], model_ols.predict(unique_rows), 'o', label='predicted price')
    # scatter plot for the given data (same rows as the predictions)
    plt.plot(unique_rows[column], data.loc[unique_rows.index, 'MEDV'], 'o', label='actual price')
    # degree-1 polynomial fitted to the given data; labelled distinctly so the
    # legend no longer shows two different series as 'actual price'
    linear_fit = np.poly1d(np.polyfit(data_X[column], data_y, 1))
    plt.plot(feature_values, linear_fit(feature_values), label='linear fit of actual price')
    plt.xlabel(column)
    plt.ylabel('Prices')
    plt.legend()

In [None]:
# Shuffle, split 80/20 and refit the OLS model on the training part only.
linearly_varying_data = data_X
# Seed the shuffle as well: with an unseeded shuffle the seeded split below
# still produced different train/test rows on every run.
shuffled_X, shuffled_y = shuffle(linearly_varying_data, data_y, random_state=10)
X_train, X_test, y_train, y_test = train_test_split(shuffled_X, shuffled_y, test_size=0.2, random_state=10)
model_ols = LinearRegression().fit(X_train, y_train)
print(f"Training r2_score {model_ols.score(X_train, y_train)}")

In [None]:
# Evaluate the OLS model on the held-out test split.
print("Testing data results : ")
y_pred = model_ols.predict(X_test)  # predict once, reuse for every metric
# scikit-learn metrics take (y_true, y_pred); R^2 is not symmetric, so the
# previous swapped order reported a wrong score.
rsq_score = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# Square root taken directly: the `squared=False` flag is deprecated in newer
# scikit-learn releases.
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print(f"R2_Score : {rsq_score} \nMean Squared Error : {mse} \nRoot Mean Squared Error : {rmse}\nMean Absolute Error : {mae}")

In [None]:
# Bar chart of actual vs. OLS-predicted prices for the first ten training rows.
x_compare = pd.DataFrame({
    'Actual': y_train.iloc[:10],
    'Predicted': model_ols.predict(X_train.iloc[:10]),
})
x_compare.plot.bar(title='Ordinary Least Square')

In [None]:
# Support vector regressor with default settings, fitted on the training split.
model_SVR = SVR()
model_SVR.fit(X=X_train, y=y_train)

In [None]:
# Training R^2 of the SVR model. Scored on the training split (like the other
# models in this notebook) instead of on the full data, which mixed training
# and test rows.
model_SVR.score(X_train, y_train)

In [None]:
# Evaluate the SVR model on the held-out test split.
print("Testing data results : ")
y_pred = model_SVR.predict(X_test)  # predict once, reuse for every metric
# scikit-learn metrics take (y_true, y_pred); R^2 is not symmetric, so the
# previous swapped order reported a wrong score.
rsq_score = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# Square root taken directly: the `squared=False` flag is deprecated in newer
# scikit-learn releases.
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print(f"R2_Score : {rsq_score} \nMean Squared Error : {mse} \nRoot Mean Squared Error : {rmse}\nMean Absolute Error : {mae}")

In [None]:
# Bar chart of actual vs. SVR-predicted prices for the first ten training rows.
x_compare = pd.DataFrame({
    'Actual': y_train.iloc[:10],
    'Predicted': model_SVR.predict(X_train.iloc[:10]),
})
x_compare.plot.bar(title='Support Vector Regressor')

In [None]:
# Random forest with 100 trees. The seed makes the fitted forest - and the
# predictions written out at the end of the notebook - reproducible between runs.
model_rfr = RandomForestRegressor(n_estimators=100, random_state=10)

In [None]:
# Fit the random forest on the training split.
model_rfr.fit(X=X_train, y=y_train)

In [None]:
# Training R^2 of the random forest.
r2_score(y_train, model_rfr.predict(X_train))

In [None]:
# Evaluate the random forest on the held-out test split.
print("Testing data results : ")
y_pred = model_rfr.predict(X_test)  # predict once, reuse for every metric
# scikit-learn metrics take (y_true, y_pred); R^2 is not symmetric, so the
# previous swapped order reported a wrong score.
rsq_score = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# Square root taken directly: the `squared=False` flag is deprecated in newer
# scikit-learn releases.
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print(f"R2_Score : {rsq_score} \nMean Squared Error : {mse} \nRoot Mean Squared Error : {rmse}\nMean Absolute Error : {mae}")

In [None]:
# Test-split R^2 of the random forest.
r2_score(y_test, model_rfr.predict(X_test))

In [None]:
# Cross-validated learning curve for the random forest (default sizes and folds).
size, train_scores, valid_scores = learning_curve(estimator=model_rfr, X=data_X, y=data_y)

In [None]:
# Training scores against training-set size (one line per column of train_scores).
plt.gca().plot(size, train_scores)
plt.gca().set_xlabel('Training data size')
plt.gca().set_ylabel('Score')

In [None]:
# Validation scores against training-set size (one line per column of valid_scores).
# `size` holds the training-set sizes returned by learning_curve, so the x axis
# is labelled as the training size rather than a validation size.
plt.plot(size, valid_scores)
plt.xlabel('Training data size')
plt.ylabel('Score')

In [None]:
# learning_curve returns score arrays of shape (n_train_sizes, n_cv_folds).
# Indexing row 0 would plot the per-fold scores of the FIRST training size
# against all sizes; averaging over the fold axis gives one point per size.
plt.plot(size, train_scores.mean(axis=1), label='Training curve', color='red')
plt.plot(size, valid_scores.mean(axis=1), label='Cross validation score', color='green')
plt.legend()
plt.xlabel('Dataset size')
plt.ylabel('Score')

In [None]:
# Training and validation scores drawn together on the current axes
# (no legend is drawn for this figure).
plt.gca().plot(size, train_scores, label='Training curve')
plt.gca().plot(size, valid_scores, label='Cross validation score')

In [None]:
# Bar chart of actual vs. forest-predicted prices for the first ten training rows.
x_compare = pd.DataFrame({
    'Actual': y_train.iloc[:10],
    'Predicted': model_rfr.predict(X_train.iloc[:10]),
})
x_compare.plot.bar(title='Random Forest Regressor')

In [None]:
# k-nearest-neighbours regressor (k = 10), fitted on the training split.
model_knn = KNeighborsRegressor(n_neighbors=10)
model_knn.fit(X=X_train, y=y_train)

In [None]:
# Training R^2 of the k-nearest-neighbours model.
r2_score(y_train, model_knn.predict(X_train))

In [None]:
# Evaluate the k-nearest-neighbours model on the held-out test split.
print("Testing data results : ")
y_pred = model_knn.predict(X_test)  # predict once, reuse for every metric
# scikit-learn metrics take (y_true, y_pred); R^2 is not symmetric, so the
# previous swapped order reported a wrong score.
rsq_score = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# Square root taken directly: the `squared=False` flag is deprecated in newer
# scikit-learn releases.
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print(f"R2_Score : {rsq_score} \nMean Squared Error : {mse} \nRoot Mean Squared Error : {rmse}\nMean Absolute Error : {mae}")

In [None]:
# Bar chart of actual vs. KNN-predicted prices for the first ten training rows.
x_compare = pd.DataFrame({
    'Actual': y_train.iloc[:10],
    'Predicted': model_knn.predict(X_train.iloc[:10]),
})
x_compare.plot.bar(title='K-Nearest-Neighbour')

In [None]:
# Load the unlabeled test set from the Kaggle input directory.
test_data = pd.read_csv(filepath_or_buffer='/kaggle/input/test-data/test.csv')

In [None]:
# Display the loaded test DataFrame as the cell output.
test_data

In [None]:
# Build the feature frame for the test set: remove the identifier and the
# columns that were also removed from the training data.
test_data_modified = test_data.drop(columns=['ID', 'tax', 'dis', 'chas'])

# The test file uses lower-case column names, while the models were fitted on
# the upper-case training columns. scikit-learn checks feature names at predict
# time, so the names are aligned to the training features here.
# NOTE(review): this assumes the remaining test columns are in the same order
# as the training features - confirm against test.csv.
assert test_data_modified.shape[1] == X_train.shape[1], (
    "test features do not match the number of training features"
)
test_data_modified.columns = X_train.columns

In [None]:
# Predict with the random forest and write the id/medv pairs to the output CSV
# without the DataFrame index.
submission = pd.DataFrame({
    'id': test_data['ID'],
    'medv': model_rfr.predict(test_data_modified),
})
submission.to_csv('/kaggle/working/output.csv', index=None)