In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt  # To visualize
from scipy.optimize import curve_fit # Logarithmic regression

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input
# directory, so the dataset location used later can be checked by eye.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
def split_data(data: pd.DataFrame, train_frac: float = .6, validate_frac: float = .2,
               random_state=42):
    """Shuffle ``data`` and split it into training, validation, and test sets.

    Based on https://stackoverflow.com/a/38251213, but the shuffled frame is
    cut with positional ``iloc`` slices instead of ``np.split``: calling
    ``np.split`` on a DataFrame goes through ``DataFrame.swapaxes``, which
    recent pandas versions deprecate (FutureWarning).

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose rows are to be distributed over the three sets.
    train_frac : float, default 0.6
        Fraction of rows that go to the training set.
    validate_frac : float, default 0.2
        Fraction of rows that go to the validation set. Whatever is left
        after training and validation becomes the test set.
    random_state : default 42
        Seed forwarded to ``DataFrame.sample`` so the shuffle is repeatable.

    Returns
    -------
    list of pd.DataFrame
        ``[train, validate, test]``; can be unpacked directly:
        ``train, validate, test = split_data(data)``.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions add up to more than 1.
    """
    # Small tolerance so that sums such as 0.7 + 0.3 are not rejected because
    # of floating-point rounding.
    if train_frac < 0 or validate_frac < 0 or train_frac + validate_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and validate_frac must be non-negative and sum to at most 1"
        )

    # frac=1 returns every row exactly once, in random order.
    shuffled = data.sample(frac=1, random_state=random_state)

    n_rows = len(data)
    train_end = int(train_frac * n_rows)
    validate_end = int((train_frac + validate_frac) * n_rows)

    return [
        shuffled.iloc[:train_end],
        shuffled.iloc[train_end:validate_end],
        shuffled.iloc[validate_end:],
    ]

In [None]:
def draw_chart(X, Y, X_pred, Y_pred, title="", x_label="", y_label=""):
    """Plot observations as a scatter with the model curve on top, then show it.

    X, Y            -- observed points, drawn as a scatter plot.
    X_pred, Y_pred  -- points of the fitted curve, joined by a red line in
                       the order they are given.
    title, x_label, y_label -- chart title and axis captions (blank by default).
    """
    # Data layers first ...
    plt.scatter(X, Y)
    plt.plot(X_pred, Y_pred, color='red')

    # ... then the text decorations.
    decorations = (
        (plt.title, title),
        (plt.xlabel, x_label),
        (plt.ylabel, y_label),
    )
    for apply_text, text in decorations:
        apply_text(text)

    plt.show()

In [None]:
# -----------------
# LINEAR REGRESSION
# -----------------
# Fit a straight line on the training split only, then draw that same fitted
# line over (1) the training data, (2) training + validation data and
# (3) the test data.
data = pd.read_csv('/kaggle/input/housingdata/housingdata.csv')  # load data set
train, validate, test = split_data(data)

# Axis labels, looked up by column position (hard-coded, not read from the CSV).
# NOTE(review): the first entry used to read "CROM"; "CRIM" is presumably what
# was meant -- confirm against the CSV header.
COLUMN_NAMES = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV"]
X_COL = 2  # positional index of the explanatory column
Y_COL = 4  # positional index of the column to predict

# Source: https://towardsdatascience.com/linear-regression-in-6-lines-of-python-5e1d0cd05b8d
# .values converts the column into a numpy array; reshape(-1, 1) turns it into
# a single column with as many rows as needed.
X = train.iloc[:, X_COL].values.reshape(-1, 1)
Y = train.iloc[:, Y_COL].values.reshape(-1, 1)
linear_regressor = LinearRegression()  # create object for the class
linear_regressor.fit(X, Y)  # perform linear regression (training rows only)

# Rows of train followed by rows of validate, in that order.
train_and_validate = pd.concat([train, validate])

# One chart per data subset; the model is never re-fitted here.
for subset, title in (
    (train, "Linear regression on train"),
    (train_and_validate, "Linear regression vs. train + validate"),
    (test, "Linear regression vs. test"),
):
    X = subset.iloc[:, X_COL].values.reshape(-1, 1)
    Y = subset.iloc[:, Y_COL].values.reshape(-1, 1)
    X_pred = X
    Y_pred = linear_regressor.predict(X)  # make predictions
    draw_chart(X, Y, X_pred, Y_pred, title=title, x_label=COLUMN_NAMES[X_COL], y_label=COLUMN_NAMES[Y_COL])

In [None]:
# ----------------------
# LOGARITHMIC REGRESSION
# ----------------------
# Model: y = a * ln(x) + b.
# The coefficients are fitted ONCE, on the training split, and that same curve
# is then drawn over (1) train, (2) train + validate and (3) test. Previously
# the curve was re-fitted on each subset, so the "vs. test" chart showed a
# model trained on the test data itself.

data = pd.read_csv('/kaggle/input/housingdata/housingdata.csv')  # load data set
train, validate, test = split_data(data)

# Axis labels, looked up by column position (hard-coded, not read from the CSV).
# NOTE(review): the first entry used to read "CROM"; "CRIM" is presumably what
# was meant -- confirm against the CSV header.
COLUMN_NAMES = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV"]
X_COL = 2  # positional index of the explanatory column
Y_COL = 7  # positional index of the column to predict

# Source: https://towardsdatascience.com/linear-regression-in-6-lines-of-python-5e1d0cd05b8d
X = train.iloc[:, X_COL].values.reshape(-1, 1)
Y = train.iloc[:, Y_COL].values.reshape(-1, 1)
# A degree-1 polyfit of y against ln(x) yields [a, b] (slope first, intercept
# second). ln(x) is only finite for x > 0, so the chosen X column must be
# strictly positive.
logarithmic_regressor = np.polyfit(np.log(X.ravel()), Y.ravel(), 1)

# Rows of train followed by rows of validate, in that order.
train_and_validate = pd.concat([train, validate])

# One chart per data subset; the coefficients fitted above are reused as-is.
for subset, title in (
    (train, "Logarithmic regression on train"),
    (train_and_validate, "Logarithmic regression vs. train + validate"),
    (test, "Logarithmic regression vs. test"),
):
    X = subset.iloc[:, X_COL].values.reshape(-1, 1)
    Y = subset.iloc[:, Y_COL].values.reshape(-1, 1)
    # Evenly spaced x values across the subset's observed range, so the curve
    # is drawn smoothly from left to right.
    X_pred = np.linspace(np.amin(X), np.amax(X), len(X))
    Y_pred = logarithmic_regressor[0] * np.log(X_pred) + logarithmic_regressor[1]
    draw_chart(X, Y, X_pred, Y_pred, title=title, x_label=COLUMN_NAMES[X_COL], y_label=COLUMN_NAMES[Y_COL])

In [None]:
# Display the training split: the first of the three frames returned by split_data.
train

In [None]:
# Display the validation split: the second of the three frames returned by split_data.
validate

In [None]:
# Display the test split: the third of the three frames returned by split_data.
test