In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.5


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import mutual_info_regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from scipy.stats import randint


In [4]:
# Load dataset
data = load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['alcohol'] = data.target  # Assuming 'alcohol' is the target column

In [5]:
# Features and target
X = df.drop('alcohol', axis=1)
y = df['alcohol']

In [6]:
# Check duplicates
print("Duplicates in data: ", X.duplicated().sum())

Duplicates in data:  0


In [7]:
# Check null values
print("Null values in data: ", X.isna().sum().sum())

Null values in data:  0


In [8]:
# Scale features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
# Compute the mutual information between each feature and the target
mi = mutual_info_regression(X_scaled, y)

In [10]:
# Create a DataFrame to display the mutual information scores
mi_df = pd.DataFrame({
    'Feature': df.columns[:-1],
    'Mutual Information': mi
})


In [11]:
# Sort features by mutual information score
mi_df = mi_df.sort_values(by='Mutual Information', ascending=False)

print("Mutual Information Scores:")
print(mi_df)

Mutual Information Scores:
                         Feature  Mutual Information
5                  total_phenols            0.673105
11  od280/od315_of_diluted_wines            0.564657
8                proanthocyanins            0.551197
10                           hue            0.509905
9                color_intensity            0.448444
4                      magnesium            0.396905
7           nonflavanoid_phenols            0.314026
0                        alcohol            0.285978
2                            ash            0.259153
6                     flavanoids            0.140946
3              alcalinity_of_ash            0.136312
1                     malic_acid            0.067721


In [12]:
# Selecting the top N features based on mutual information
N = 5  # Number of top features to select
top_features = mi_df['Feature'].head(N).values

print(f"Top {N} features selected based on Mutual Information:")
print(top_features)

Top 5 features selected based on Mutual Information:
['total_phenols' 'od280/od315_of_diluted_wines' 'proanthocyanins' 'hue'
 'color_intensity']


In [13]:
# Filter the dataset to keep only the top N features
X_selected = df[top_features]


In [14]:
# Train test split with selected features
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, shuffle=True, test_size=0.2, random_state=2024)

In [15]:
from sklearn.model_selection import cross_val_score, KFold

# Function to evaluate the model and check for overfitting
def evaluate_model(model, X_train, y_train, X_test, y_test, cross_val=True):
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

    print(f"Train R2: {train_r2}, Test R2: {test_r2}")
    print(f"Train RMSE: {train_rmse}, Test RMSE: {test_rmse}")

    if cross_val:
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        cv_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='r2')
        print(f"Cross-validation R2 scores: {cv_scores}")
        print(f"Mean CV R2: {cv_scores.mean()}")

    # Check for overfitting
    if (train_r2 - test_r2 > 0.15) and (train_r2 > 0.85):
        print("Warning: Potential overfitting detected!")
    else:
        print("No significant overfitting detected.")

In [16]:
# Polynomial features with higher degrees
degrees = [1, 2, 3]
for deg in degrees:
    poly = PolynomialFeatures(degree=deg)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)


In [18]:


    # Try different linear models: LinearRegression, Ridge, Lasso
    models = {
        "LinearRegression": LinearRegression(),
        "Ridge": Ridge(alpha=1.0),
        "Lasso": Lasso(alpha=0.1)
    }

    for name, model in models.items():
        model.fit(X_train_poly, y_train)
        y_pred = model.predict(X_test_poly)
        print(f'{name} with degree {deg}: R2 = {r2_score(y_test, y_pred)}, RMSE = {np.sqrt(mean_squared_error(y_test, y_pred))}')
        evaluate_model(model, X_train_poly, y_train, X_test_poly, y_test)


LinearRegression with degree 3: R2 = 0.6177879836153521, RMSE = 0.49385837422145656
Train R2: 0.7489386784404441, Test R2: 0.6177879836153521
Train RMSE: 0.38304335074865575, Test RMSE: 0.49385837422145656
Cross-validation R2 scores: [ 0.1650577  -0.97539701 -4.4004149  -0.36367273  0.5411469 ]
Mean CV R2: -1.0066560078236406
No significant overfitting detected.
Ridge with degree 3: R2 = 0.813674389635876, RMSE = 0.34481530188783055
Train R2: 0.8842781502633907, Test R2: 0.813674389635876
Train RMSE: 0.26005529805948957, Test RMSE: 0.34481530188783055
Cross-validation R2 scores: [0.89097017 0.78884685 0.79307042 0.8713419  0.73535526]
Mean CV R2: 0.8159169185148629
No significant overfitting detected.
Lasso with degree 3: R2 = 0.7899633565850086, RMSE = 0.36609836441327975
Train R2: 0.7970259541124989, Test R2: 0.7899633565850086
Train RMSE: 0.3444120016729461, Test RMSE: 0.36609836441327975
Cross-validation R2 scores: [0.83833992 0.69526431 0.74360094 0.79311391 0.5465378 ]
Mean CV R2

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [19]:
# KNN regression with optimized hyperparameters using RandomizedSearchCV
knn_params = {
    'n_neighbors': randint(1, 30),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}
knn = KNeighborsRegressor()
random_search_knn = RandomizedSearchCV(knn, param_distributions=knn_params, n_iter=100, cv=5, random_state=42)
random_search_knn.fit(X_train, y_train)

y_pred_knn = random_search_knn.best_estimator_.predict(X_test)
print(f'Optimized KNN: R2 = {r2_score(y_test, y_pred_knn)}, RMSE = {np.sqrt(mean_squared_error(y_test, y_pred_knn))}')
evaluate_model(random_search_knn.best_estimator_, X_train, y_train, X_test, y_test)

Optimized KNN: R2 = 0.9564049470113218, RMSE = 0.16678955844662027
Train R2: 1.0, Test R2: 0.9564049470113218
Train RMSE: 0.0, Test RMSE: 0.16678955844662027
Cross-validation R2 scores: [0.89398749 0.87125119 0.81996278 0.97988192 0.70796646]
Mean CV R2: 0.854609970014119
No significant overfitting detected.


In [20]:
# Decision Tree with hyperparameter tuning
dt_params = {
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10)
}
dt = DecisionTreeRegressor(random_state=42)
random_search_dt = RandomizedSearchCV(dt, param_distributions=dt_params, n_iter=100, cv=5, random_state=42)
random_search_dt.fit(X_train, y_train)

y_pred_dt = random_search_dt.best_estimator_.predict(X_test)
print(f'Optimized Decision Tree: R2 = {r2_score(y_test, y_pred_dt)}, RMSE = {np.sqrt(mean_squared_error(y_test, y_pred_dt))}')
evaluate_model(random_search_dt.best_estimator_, X_train, y_train, X_test, y_test)

Optimized Decision Tree: R2 = 0.7610014106497383, RMSE = 0.39052417428914427
Train R2: 0.8946551554821163, Test R2: 0.7610014106497383
Train RMSE: 0.2481216529881026, Test RMSE: 0.39052417428914427
Cross-validation R2 scores: [0.80883139 0.58989229 0.84313778 0.94677573 0.6304127 ]
Mean CV R2: 0.7638099772680881
No significant overfitting detected.


In [21]:
# Random Forest with hyperparameter tuning
rf_params = {
    'n_estimators': [200, 500, 1000],
    'max_depth': [10, 20, None],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10)
}
rfr = RandomForestRegressor(random_state=42)
random_search_rf = RandomizedSearchCV(rfr, param_distributions=rf_params, n_iter=100, cv=5, random_state=42)
random_search_rf.fit(X_train, y_train)

y_pred_rf = random_search_rf.best_estimator_.predict(X_test)
print(f'Optimized Random Forest: R2 = {r2_score(y_test, y_pred_rf)}, RMSE = {np.sqrt(mean_squared_error(y_test, y_pred_rf))}')
evaluate_model(random_search_rf.best_estimator_, X_train, y_train, X_test, y_test)


Optimized Random Forest: R2 = 0.8757485030229746, RMSE = 0.28157952300864814
Train R2: 0.9789257491513917, Test R2: 0.8757485030229746
Train RMSE: 0.11097728482207013, Test RMSE: 0.28157952300864814
Cross-validation R2 scores: [0.82563337 0.75019957 0.84698426 0.95164602 0.71600354]
Mean CV R2: 0.818093351815906
No significant overfitting detected.


In [22]:
# XGBoost with tuned hyperparameters
xgb_params = {
    'n_estimators': [500, 1000],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 10],
    'subsample': [0.7, 0.8, 1.0]
}
xgbr = XGBRegressor(random_state=42)
random_search_xgb = RandomizedSearchCV(xgbr, param_distributions=xgb_params, n_iter=100, cv=5, random_state=42)
random_search_xgb.fit(X_train, y_train)

y_pred_xgb = random_search_xgb.best_estimator_.predict(X_test)
print(f'Optimized XGBoost: R2 = {r2_score(y_test, y_pred_xgb)}, RMSE = {np.sqrt(mean_squared_error(y_test, y_pred_xgb))}')
evaluate_model(random_search_xgb.best_estimator_, X_train, y_train, X_test, y_test)




Optimized XGBoost: R2 = 0.9206273120996276, RMSE = 0.22505351368695775
Train R2: 0.9748198333224636, Test R2: 0.9206273120996276
Train RMSE: 0.12130740789940261, Test RMSE: 0.22505351368695775
Cross-validation R2 scores: [0.73934265 0.78638502 0.8193478  0.97825013 0.71533489]
Mean CV R2: 0.8077320997960182
No significant overfitting detected.


In [23]:
cat_params = {
    'iterations': [100, 500],
    'learning_rate': [0.1],
    'depth': [6, 10]
}

cbr = CatBoostRegressor(random_state=42, verbose=0, early_stopping_rounds=10)  # اضافه کردن early stopping

random_search_cat = RandomizedSearchCV(cbr, param_distributions=cat_params, n_iter=10, cv=3, random_state=42)
random_search_cat.fit(X_train, y_train)

y_pred_cat = random_search_cat.best_estimator_.predict(X_test)
print(f'Optimized CatBoost: R2 = {r2_score(y_test, y_pred_cat)}, RMSE = {np.sqrt(mean_squared_error(y_test, y_pred_cat))}')
evaluate_model(random_search_cat.best_estimator_, X_train, y_train, X_test, y_test)




Optimized CatBoost: R2 = 0.9352633120745363, RMSE = 0.20324763091105302
Train R2: 0.9848784811716058, Test R2: 0.9352633120745363
Train RMSE: 0.09400603380463451, Test RMSE: 0.20324763091105302
Cross-validation R2 scores: [0.80282203 0.84742448 0.82268006 0.94847594 0.79604415]
Mean CV R2: 0.843489334152518
No significant overfitting detected.


In [24]:
# Additional ensemble method: Gradient Boosting Regressor
gbr = GradientBoostingRegressor(n_estimators=500, learning_rate=0.1, max_depth=3, random_state=42)
gbr.fit(X_train, y_train)

y_pred_gbr = gbr.predict(X_test)
print(f'Gradient Boosting: R2 = {r2_score(y_test, y_pred_gbr)}, RMSE = {np.sqrt(mean_squared_error(y_test, y_pred_gbr))}')
evaluate_model(gbr, X_train, y_train, X_test, y_test)

Gradient Boosting: R2 = 0.9366818040738116, RMSE = 0.20100854511456062
Train R2: 0.9999933813484752, Test R2: 0.9366818040738116
Train RMSE: 0.001966721115070507, Test RMSE: 0.20100854511456062
Cross-validation R2 scores: [0.77270596 0.76467832 0.80841835 0.96319398 0.69646104]
Mean CV R2: 0.801091528383369
No significant overfitting detected.


In [25]:
# List to store R2 scores
r2_scores = {}

# Existing models evaluations
for name, model in models.items():
    y_pred = model.predict(X_test_poly)
    r2 = r2_score(y_test, y_pred)
    r2_scores[f'{name} (degree {deg})'] = r2

r2_scores['Optimized KNN'] = r2_score(y_test, y_pred_knn)
r2_scores['Optimized Decision Tree'] = r2_score(y_test, y_pred_dt)
r2_scores['Optimized Random Forest'] = r2_score(y_test, y_pred_rf)
r2_scores['Optimized XGBoost'] = r2_score(y_test, y_pred_xgb)
r2_scores['Optimized CatBoost'] = r2_score(y_test, y_pred_cat)
r2_scores['Gradient Boosting'] = r2_score(y_test, y_pred_gbr)

# Print R2 scores for all models
print("\nR2 Scores for all models:")
for model_name, r2 in r2_scores.items():
    print(f'{model_name}: R2 = {r2}')

# Identify the model with the best R2 score
best_model = max(r2_scores, key=r2_scores.get)
best_r2 = r2_scores[best_model]

print(f'\nThe model with the best R2 score is {best_model} with R2 = {best_r2}')



R2 Scores for all models:
LinearRegression (degree 3): R2 = 0.6177879836153521
Ridge (degree 3): R2 = 0.813674389635876
Lasso (degree 3): R2 = 0.7899633565850086
Optimized KNN: R2 = 0.9564049470113218
Optimized Decision Tree: R2 = 0.7610014106497383
Optimized Random Forest: R2 = 0.8757485030229746
Optimized XGBoost: R2 = 0.9206273120996276
Optimized CatBoost: R2 = 0.9352633120745363
Gradient Boosting: R2 = 0.9366818040738116

The model with the best R2 score is Optimized KNN with R2 = 0.9564049470113218
