## handmade-ml: Welcome notebook

### Models
* Linear Regression
* Decision Tree
* Random Forest (Classifier)
* GBT (Gradient Boosted Trees)

You can read more about these models inside their folders.

In [1]:
# plots, simple datasets
import seaborn as sns
# plots
import matplotlib as plt
# work with databases
import pandas as pd
# math, vectorized operations
import numpy as np
from metrics import mse, error

from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

%load_ext autoreload
%autoreload 2

### Linear regression

In [9]:
# Load the red-wine quality table; `quality` is the regression target and
# every other column is used as a feature.
wine_quality_dataset = pd.read_csv('datasets/winequality-red.csv')

pred_column = 'quality'
feature_columns = [col for col in wine_quality_dataset.columns if col != pred_column]

X = wine_quality_dataset[feature_columns].astype(float)
y = wine_quality_dataset[pred_column].astype(float)

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fit the scaler on the training split only and reuse it for the test split.
# fit_transform() already fits, so a separate fit() call beforehand is redundant.
# Note: with sklearn's default output setting both results are NumPy arrays,
# so X_train / X_test lose their column names from here on.
prep = MinMaxScaler()
X_train = prep.fit_transform(X_train)
X_test = prep.transform(X_test)


In [10]:
# feature_columns = ['fixed acidity']

In [11]:
from LinearModels.LinearReg import LinearRegression

# Hand-written LinearRegression from this repository.
# NOTE(review): reg_lmb looks like the regularization strength; at 0.0 the
# 'lasso' penalty presumably has no effect — confirm against LinearReg.
model = LinearRegression(
    num_iters=1000,
    regularization='lasso',
    learning_rate=1.0,
    reg_lmb=0.0
)
# The model is fed plain NumPy arrays (y_train is a pandas Series at this point).
model.fit(np.array(X_train), np.array(y_train))
y_pred = model.predict(np.array(X_test))
# sklearn metrics take (y_true, y_pred); MAE is symmetric, so the score is
# unchanged, but the order now matches the convention and the GBT cell.
mean_absolute_error(y_test, y_pred)

0.6518541092800736

### Decision tree

In [8]:
from Trees.Tree import TreeRegressor

# Hand-written regression tree from this repository, limited to depth 5.
# NOTE(review): minimize=True presumably tells the tree that a lower `metric`
# value is better — confirm against Trees.Tree.
model = TreeRegressor(metric=mean_absolute_error, criterion='entropy', max_depth=5, minimize=True, debug=False)
model.fit(X_train, y_train)
# predict() returns an object indexable by the name given in predict_col.
pred = model.predict(X_test, predict_col='target')
# sklearn metrics take (y_true, y_pred); MAE is symmetric, so the score is
# unchanged, but the order now matches the convention and the GBT cell.
mean_absolute_error(y_test, pred['target'])

0.5021684698676662

### GBT (Gradient Boosted Trees)

In [6]:
from Boosting.TreeBoost import SimpleTreeBoostRegressor

# Hyper-parameters for the boosted ensemble, gathered in one place.
# `mse` and `error` are the project-local helpers imported from `metrics`.
gbt_params = dict(
    n_estimators=10,
    lr=0.1,
    metric=mse,
    derivative=error,
    max_depth=3,
    colsample_bytree=0.8,
    criterion='entropy',
    subsample=0.8,
    minimize=True,
    debug=False,
)
model = SimpleTreeBoostRegressor(**gbt_params)

# fit() receives a single frame plus the name of the label column,
# so the training labels are attached as a `target` column.
df = pd.DataFrame(X_train).assign(target=list(y_train))

model.fit(df, target='target')

Iteration: 3, Loss: 0.5548046304394244

Iteration: 4, Loss: 0.517176783565545

Iteration: 5, Loss: 0.4930375806110396

Iteration: 6, Loss: 0.48312167424749897

Iteration: 7, Loss: 0.4598877124507926

Iteration: 8, Loss: 0.4586910957712947

Iteration: 9, Loss: 0.4335972170195566



In [7]:
# Build the evaluation frame: held-out features plus the true labels as `target`.
df = pd.DataFrame(X_test).assign(target=list(y_test))
# The return value is ignored: the metric below reads a `pred` column from `df`,
# so predict() is relied upon to write its predictions into the frame in place.
model.predict(df, predict_col='pred')
mean_absolute_error(df['target'], df['pred'])

0.5697732394729527