In [53]:
!pip install catboost



In [54]:
#!pip install -U scikit-learn


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import mutual_info_regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from scipy.stats import randint




In [77]:
# Load the wine dataset a single time and reuse the returned bunch for both the
# feature matrix and the column names (previously the loader was called twice).
wine = load_wine()
data = wine.data
df = pd.DataFrame(data, columns=wine.feature_names)

In [78]:
# Regression target is the alcohol column; every remaining column is a predictor.
y = df['alcohol']
X = df.drop(columns=['alcohol'])

In [5]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

(178, 12)

In [6]:
# check duplicates: number of feature rows that exactly repeat an earlier row
X.duplicated().sum()

0

In [70]:
# check null values: total count of missing entries across all feature columns
X.isna().sum().sum()

0

In [None]:
# EDA: one 20-bin histogram per feature on a 10x10 inch figure.
# The trailing semicolon keeps the array-of-Axes repr out of the cell output,
# so only the rendered figure is shown.
X.hist(bins=20, figsize=(10, 10));

In [79]:
# Hold out 20% of the rows for evaluation. Rows are shuffled before splitting and
# the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=2024
)


In [85]:
# Min-max scale the features. The scaler is fitted on the training rows only and
# the same fitted transform is then applied to the test rows.
# NOTE(review): X_train / X_test are overwritten, so re-running this cell rescales
# data that has already been scaled — confirm that is acceptable.
mms = MinMaxScaler()
X_train = mms.fit_transform(X_train)
X_test = mms.transform(X_test)

In [144]:
# StandardScaler: fitted on the training rows, then applied to both splits.
# NOTE(review): in file order this runs on X_train / X_test as left by the
# MinMaxScaler cell, so the two scalers are stacked — confirm only one is intended.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [89]:
## KNN regression
# Sweep k = 1..9, fit a KNN regressor for each k and report its R^2 on the
# held-out split. After the loop `knnr` and `y_pred` hold the k = 9 model/predictions.
# NOTE(review): k is compared on the test split here, so the "best" k is
# optimistically biased; prefer the cross-validated search for model selection.
n_neighbors = np.arange(1, 10)
for n in n_neighbors:
    knnr = KNeighborsRegressor(n_neighbors=n)
    knnr.fit(X_train, y_train)
    y_pred = knnr.predict(X_test)
    # Score once and reuse it (the original computed it twice and discarded the first result).
    score = r2_score(y_test, y_pred)
    print(f'neighbors {n}: ', f'r2 = {score}')  # The best result: neighbors 7:  r2 = 0.6906706148432185

neighbors 1:  r2 = 0.5425517896232803
neighbors 2:  r2 = 0.6434065131709035
neighbors 3:  r2 = 0.6270272347495816
neighbors 4:  r2 = 0.6722796866735656
neighbors 5:  r2 = 0.6582367052513836
neighbors 6:  r2 = 0.673223529806327
neighbors 7:  r2 = 0.6906706148432185
neighbors 8:  r2 = 0.6759758874053867
neighbors 9:  r2 = 0.6535494468716617


In [120]:
# Hyperparameter space for KNN: distance metric, neighbour weighting, and k drawn
# uniformly from 1..19 (randint's upper bound is exclusive).
dist = dict(
    metric=['euclidean', 'manhattan'],
    weights=['uniform', 'distance'],
    n_neighbors=randint(1, 20),
)
# Search from a fresh estimator rather than the `knnr` left over from the k-sweep
# loop, so this cell does not depend on that loop having been run. The only setting
# the leftover instance carried (n_neighbors) is overridden by the search anyway.
random_search = RandomizedSearchCV(
    KNeighborsRegressor(),
    param_distributions=dist,
    n_iter=50,
    cv=5,
    random_state=32,
)

In [121]:
# Run the KNN search on the training split and display the winning hyperparameters
# (fit returns the search object itself, so the lookup can be chained).
random_search.fit(X_train, y_train).best_params_

{'metric': 'manhattan', 'n_neighbors': 17, 'weights': 'distance'}

In [122]:
# Predict with the refit best KNN configuration (the search object forwards
# predict to its best estimator) and report R^2 on the held-out split.
y_pred = random_search.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.7078957364874698

In [123]:
# Decision Tree: base regressor handed to the randomized search below; the seed
# fixes the tree's internal randomness.
dt = DecisionTreeRegressor(random_state=42)

In [124]:
# Sampling distributions for the decision-tree search.
# randint(low, high) draws integers from low up to high - 1.
dt_params = dict(
    max_depth=randint(1, 20),
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 10),
)

In [131]:
# Randomized search for the decision tree: 50 sampled configurations, each scored
# with 10-fold cross-validation on the training split. The cell displays the best one.
dt_random_search = RandomizedSearchCV(
    estimator=dt,
    param_distributions=dt_params,
    n_iter=50,
    cv=10,
    random_state=42,
)
dt_random_search.fit(X_train, y_train).best_params_

{'max_depth': 2, 'min_samples_leaf': 9, 'min_samples_split': 6}

In [132]:
# Held-out predictions from the best decision tree (the search object forwards
# predict to its refit best estimator).
dt_y_pred = dt_random_search.predict(X_test)

In [133]:
# R^2 of the tuned decision tree on the held-out split.
r2_score(y_true=y_test, y_pred=dt_y_pred)

0.4750595011023867

In [134]:
# Random forest: base regressor for the randomized search; seeded for repeatability.
rfr = RandomForestRegressor(random_state=42)

In [29]:
# Search space for the random forest: fixed candidate lists for forest size and
# depth (None = unlimited depth), integer distributions for the split/leaf sizes
# (randint's upper bound is exclusive).
rf_params = dict(
    n_estimators=[200, 300, 800],
    max_depth=[10, 20, None],
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 10),
)

In [135]:
# Randomized search for the random forest: 20 sampled configurations, each scored
# with 10-fold cross-validation on the training split. The cell displays the best one.
rf_random_search = RandomizedSearchCV(
    estimator=rfr,
    param_distributions=rf_params,
    n_iter=20,
    cv=10,
    verbose=0,
    random_state=42,
)
rf_random_search.fit(X_train, y_train).best_params_

{'max_depth': None,
 'min_samples_leaf': 8,
 'min_samples_split': 6,
 'n_estimators': 800}

In [136]:
# Held-out predictions from the best random forest (the search object forwards
# predict to its refit best estimator).
rf_y_pred = rf_random_search.predict(X_test)

In [137]:
# R^2 of the tuned random forest on the held-out split.
r2_score(y_true=y_test, y_pred=rf_y_pred)

0.6195585779588892

In [145]:
# xgboost: base regressor for the randomized search.
# The keyword is `n_estimators`; the previous spelling `n_estimator` is not an
# XGBRegressor parameter, so the intended 500 boosting rounds were never set on
# this base estimator.
xgbr = XGBRegressor(n_estimators=500, learning_rate=0.1, random_state=42)


In [146]:
# Candidate values for the XGBoost search; every entry is a finite list, so the
# space is a plain grid of combinations.
xgb_params = dict(
    n_estimators=[300, 500],
    learning_rate=[0.1, 0.3],
    max_depth=[3, 5, 10],
    subsample=[0.7, 0.8, 1.0],
)

In [None]:
# Randomized search over the XGBoost candidates with 5-fold cross-validation.
# Every entry of xgb_params is a finite list, so the number of distinct candidates
# is the product of the list lengths. Capping n_iter at that size avoids
# scikit-learn's "total space of parameters ... is smaller than n_iter" warning
# while still evaluating every combination, which is what the uncapped call
# ended up doing anyway.
xgb_grid_size = int(np.prod([len(values) for values in xgb_params.values()]))
xgb_random_search = RandomizedSearchCV(
    xgbr,
    param_distributions=xgb_params,
    n_iter=min(50, xgb_grid_size),
    cv=5,
    random_state=42,
)
xgb_random_search.fit(X_train, y_train)
xgb_random_search.best_params_

In [148]:
# Held-out predictions from the best XGBoost model (the search object forwards
# predict to its refit best estimator).
xgb_y_pred = xgb_random_search.predict(X_test)

In [149]:

# R^2 of the tuned XGBoost model on the held-out split.
r2_score(y_true=y_test, y_pred=xgb_y_pred)

0.6207184586757

In [150]:
# catboost: base regressor for the randomized search.
# Logging is silenced (verbose=0): with per-iteration logging enabled, every
# cross-validation fit of the search prints its own training log and buries
# the notebook output.
cbr = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=10, verbose=0)

In [151]:
# Candidate values for the CatBoost search; every entry is a finite list
# (the learning rate is held fixed at a single value).
cat_params = dict(
    iterations=[100, 300, 500],
    learning_rate=[0.1],
    depth=[6, 10],
)


In [None]:
# Randomized search over the CatBoost candidates with 3-fold cross-validation.
# Every entry of cat_params is a finite list, so the number of distinct candidates
# is the product of the list lengths. Capping n_iter at that size avoids
# scikit-learn's "total space of parameters ... is smaller than n_iter" warning
# while still evaluating every combination, which is what the uncapped call
# ended up doing anyway.
cat_grid_size = int(np.prod([len(values) for values in cat_params.values()]))
cat_random_search = RandomizedSearchCV(
    cbr,
    param_distributions=cat_params,
    n_iter=min(50, cat_grid_size),
    cv=3,
    random_state=42,
)
cat_random_search.fit(X_train, y_train)
cat_random_search.best_params_

In [153]:
# Held-out predictions from the best CatBoost model (the search object forwards
# predict to its refit best estimator).
cat_y_pred = cat_random_search.predict(X_test)

In [154]:
# R^2 of the tuned CatBoost model on the held-out split.
r2_score(y_true=y_test, y_pred=cat_y_pred)

0.6812146353689861