In [None]:
!pip install catboost

In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_wine
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from scipy.stats import randint

In [2]:
data = load_wine().data
df = pd.DataFrame(data, columns=load_wine().feature_names)

In [3]:
X = df.drop('alcohol', axis=1)
y = df['alcohol']

In [4]:
# check duplicates
X.duplicated().sum()



0

In [5]:
# check null values
X.isna().sum().sum()


0

In [10]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    shuffle=True,
                                                    test_size=0.2,
                                                    random_state=2024)


In [11]:
# Scale features
mms = MinMaxScaler()
mms.fit(X_train)
X_train = mms.transform(X_train)
# X_train = mms.fit_transform(X_train)
X_test = mms.transform(X_test)

In [8]:
# polynomial features
degrees = [1, 2]
for deg in degrees:
  poly = PolynomialFeatures(degree=deg)
  X_train = poly.fit_transform(X_train)
  X_test = poly.transform(X_test)
  # train  the model
  lr = LinearRegression()
  lr.fit(X_train, y_train)
  # predict the unseen data
  y_pred = lr.predict(X_test)
  # evaluate the model
  print(f'degree {deg}: ', f'r2 = {r2_score(y_test, y_pred)}')

degree 1:  r2 = 0.5561529982521582
degree 2:  r2 = -4.032679489848889


In [10]:
# train  the model
lr = LinearRegression()
lr.fit(X_train, y_train)


In [11]:
# predict the unseen data
y_pred = lr.predict(X_test)


In [12]:
# evaluate the model
r2_score(y_test, y_pred)

-0.4508976450310733

In [13]:
# root mean square error
np.sqrt(mean_squared_error(y_test, y_pred))

0.8950649332034979

In [29]:
# MSE
mean_squared_error(y_test, y_pred)

0.18571608706852405

In [21]:
## KNN regression
n_neighbors = np.arange(1,10)
for n in n_neighbors:
  knnr = KNeighborsRegressor(n_neighbors=n)
  knnr.fit(X_train, y_train)
  y_pred = knnr.predict(X_test)
  r2_score(y_test, y_pred)
  print(f'neighbors {n}: ', f'r2 = {r2_score(y_test, y_pred)}')

neighbors 1:  r2 = 0.5425517896232803
neighbors 2:  r2 = 0.6434065131709035
neighbors 3:  r2 = 0.6270272347495816
neighbors 4:  r2 = 0.6722796866735656
neighbors 5:  r2 = 0.6582367052513836
neighbors 6:  r2 = 0.673223529806327
neighbors 7:  r2 = 0.6906706148432185
neighbors 8:  r2 = 0.6759758874053867
neighbors 9:  r2 = 0.6535494468716617


In [37]:
dist = dict(metric=['euclidean', 'manhattan'],
            weights=['uniform', 'distance'],
            n_neighbors=randint(1,20))
random_search = RandomizedSearchCV(
    knnr,
    param_distributions=dist,
    n_iter=50,
    random_state=42)

In [38]:
random_search.fit(X_train, y_train)
random_search.best_params_

  _data = np.array(data, dtype=dtype, copy=copy,


{'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'distance'}

In [39]:
y_pred = random_search.best_estimator_.predict(X_test)
r2_score(y_test, y_pred)

0.6989294551513863

In [41]:
# Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

In [43]:
y_pred = dt.predict(X_test)

array([13.32, 13.94, 13.58, 14.23, 13.9 , 13.86, 12.86, 11.82, 13.17,
       12.  , 12.64, 12.6 , 12.08, 13.75, 12.67, 12.07, 14.06, 13.58,
       13.24, 12.77, 12.64, 13.88, 12.07, 12.47, 12.6 , 13.4 , 12.2 ,
       13.83, 12.36, 12.72, 12.33, 13.05, 13.75, 12.87, 13.68, 13.49])

In [45]:
r2_score(y_test, y_pred)

0.10991983778907977

In [64]:
# random forest
rfr = RandomForestRegressor(max_depth=20, n_estimators=500, random_state=42)
rfr.fit(X_train, y_train)

In [65]:
y_pred_train = rfr.predict(X_train)
r2_score(y_train, y_pred_train)

0.9367533529260512

In [62]:
y_pred = rfr.predict(X_test)

In [66]:
r2_score(y_test, y_pred)

0.6232498548730825

In [83]:
# xgboost
xgbr = XGBRegressor(n_estimator=1000, learning_rate=0.5, random_state=42)
xgbr.fit(X_train, y_train)

Parameters: { "n_estimator" } are not used.



In [84]:
y_pred = xgbr.predict(X_test)
r2_score(y_test, y_pred)

0.6259591422825508

In [90]:
# catboost
cbr = CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=10, verbose=0)
cbr.fit(X_train, y_train)


<catboost.core.CatBoostRegressor at 0x7cf1d9985000>

In [91]:
y_pred = cbr.predict(X_test)
r2_score(y_test, y_pred)

0.6667898695626036