In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/aniruddhachoudhury/Red-Wine-Quality/master/winequality-red.csv")

In [3]:
df = df.drop_duplicates()

In [4]:
X = df.drop("quality",axis = 1)
y = df.quality

In [5]:
#split the dataset
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33, random_state=10)

In [6]:
#AdaBoost
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier()

In [7]:
model.fit(X_train,y_train)

In [8]:
y_pred  = model.predict(X_test)

In [9]:
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test,y_pred)
score

0.5211581291759465

### Score is 52.11% 

In [10]:
#Hyperparameter Tuning
grid_param = {
    "n_estimators":[90,100,130,150],
    "learning_rate":[0.001,0.01,0.1,0.5]
}

In [11]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator = model,param_grid=grid_param,cv = 3,verbose = 2,n_jobs= -1)

In [12]:
grid_search.fit(X_train,y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [13]:
grid_search.best_params_

{'learning_rate': 0.001, 'n_estimators': 90}

In [14]:
grid_search.best_score_

0.5703455213942447

In [15]:
# creating new model
model_2 = AdaBoostClassifier(learning_rate= 0.001, n_estimators= 90)

In [16]:
model_2.fit(X_train,y_train)

In [17]:
y_pred_2 = model_2.predict(X_test)

In [18]:
accuracy_score(y_test,y_pred_2)

0.5278396436525612

### After hyperparamter tuning the accuracy has increased 52.7%

# GradientBoost and Xgboost 

In [19]:
import xgboost

In [20]:
from sklearn.ensemble import GradientBoostingClassifier
model_gbm = GradientBoostingClassifier()

In [21]:
model_gbm.fit(X_train,y_train)

In [22]:
y_pred_gbm = model_gbm.predict(X_test)

In [23]:
accuracy_score(y_test,y_pred_gbm)

0.576837416481069

### We are getting improved accuracy of 57.68%

In [34]:
#Hyperparameter tuning with respect to GradientBoost
from sklearn.model_selection import GridSearchCV
grid_param = {
    "learning_rate":[0.01,0.001,0.1,0.0001]
    
}
grid_search = GridSearchCV(estimator = model,param_grid= grid_param,cv= 3,verbose =3)

In [35]:
grid_search.fit(X_train,y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END ................learning_rate=0.01;, score=0.576 total time=   0.2s
[CV 2/3] END ................learning_rate=0.01;, score=0.558 total time=   0.2s
[CV 3/3] END ................learning_rate=0.01;, score=0.578 total time=   0.2s
[CV 1/3] END ...............learning_rate=0.001;, score=0.556 total time=   0.2s
[CV 2/3] END ...............learning_rate=0.001;, score=0.558 total time=   0.2s
[CV 3/3] END ...............learning_rate=0.001;, score=0.597 total time=   0.2s
[CV 1/3] END .................learning_rate=0.1;, score=0.447 total time=   0.1s
[CV 2/3] END .................learning_rate=0.1;, score=0.515 total time=   0.1s
[CV 3/3] END .................learning_rate=0.1;, score=0.545 total time=   0.2s
[CV 1/3] END ..............learning_rate=0.0001;, score=0.556 total time=   0.4s
[CV 2/3] END ..............learning_rate=0.0001;, score=0.558 total time=   0.2s
[CV 3/3] END ..............learning_rate=0.0001;,

In [36]:
grid_search.best_params_

{'learning_rate': 0.001}

In [37]:
grid_search.best_score_

0.5703455213942447

In [38]:
model_gbm2 = GradientBoostingClassifier(learning_rate = 0.001)
model_gbm2.fit(X_train,y_train)

In [39]:
y_pred_model_gbm2 = model_gbm2.predict(X_test)
accuracy_score(y_test,y_pred_model_gbm2)

0.49888641425389757

In [25]:
#Xgboost
model_xgb = xgboost.XGBClassifier()

In [27]:
#using labelencoder in y_train column
from sklearn.preprocessing import LabelEncoder
labelenc = LabelEncoder()
y_train_encoded = labelenc.fit_transform(y_train)

In [28]:
model_xgb.fit(X_train,y_train_encoded)

In [29]:
y_pred_xgb = model_xgb.predict(X_test)

In [31]:
y_pred_xgb = labelenc.inverse_transform(y_pred_xgb)

In [32]:
accuracy_score(y_test,y_pred_xgb)

0.5634743875278396

### Accuarcy has decreased further more as per the review

In [55]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.3.0-py3-none-any.whl (404 kB)
     -------------------------------------- 404.2/404.2 kB 2.5 MB/s eta 0:00:00
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.10.0
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.12.0-py3-none-any.whl (226 kB)
     -------------------------------------- 226.0/226.0 kB 2.0 MB/s eta 0:00:00
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
     ---------------------------------------- 78.7/78.7 kB 4.3 MB/s eta 0:00:00
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.12.0 cmaes-0.10.0 colorlog-6.7.0 optuna-3.3.0


In [62]:
import optuna
def objective_classification(trial,data=X,target=y) :
    label = LabelEncoder()
    train_x , test_x , train_y , test_y= train_test_split(data , target , test_size = .20 ,random_state=10)
    
    param  = {
        'tree_method' :'gpu_hist',
        'verbosity' : 3,
        'objective' : "binary:logistics" , 
        'booster' : trial.suggest_categorical('booster' , ['dart' , 'gbtree','gblinear']),
        'lambda' : trial.suggest_float('lambda' , 1e-4 , 1),
        'alpha' :trial.suggest_float('alpha' , 1e-4 , 1),
        'subsample' : trial.suggest_float('subsample' , .1,.5),
        'colsample_bytree' : trial.suggest_float('colsample_bytree' , .1 ,.5)
        
    }
   
    if param['booster'] in ['gbtree' , 'dart']:
        param['gamma'] :trial.suggest_float('gamma' , 1e-3 , 4 )
        param['eta'] : trial.suggest_float('eta' , .001 ,5 )
    
    train_y = label.fit_transform(train_y)
    xgb_classification = xgboost.XGBClassifier(**param)
    xgb_classification.fit(train_x , train_y,eval_set  = [(test_x, test_y)])
    pred = xgb_classification.predict(test_x)
    pred = label.inverse_transform(pred)
    accuracy  = xgb_classification.score(test_x , test_y)

    
    
    return accuracy

In [63]:
xgb_classification_optuna = optuna.create_study()

[I 2023-10-13 01:47:28,379] A new study created in memory with name: no-name-c9e94e44-d3aa-420e-be01-2235f17ff1f0


In [64]:
xgb_classification_optuna.optimize(objective_classification , n_trials = 100 )


    E.g. tree_method = "hist", device = "cuda"

[W 2023-10-13 01:47:28,619] Trial 0 failed with parameters: {'booster': 'dart', 'lambda': 0.02315454042199464, 'alpha': 0.4059870799010132, 'subsample': 0.1216437069455302, 'colsample_bytree': 0.10951480332597519} because of the following error: XGBoostError('[01:47:28] C:\\buildkite-agent\\builds\\buildkite-windows-cpu-autoscaling-group-i-0cec3277c4d9d0165-1\\xgboost\\xgboost-ci-windows\\src\\tree\\updater_gpu_hist.cu:781: Exception in gpu_hist: [01:47:28] C:\\buildkite-agent\\builds\\buildkite-windows-cpu-autoscaling-group-i-0cec3277c4d9d0165-1\\xgboost\\xgboost-ci-windows\\src\\tree\\updater_gpu_hist.cu:787: Check failed: ctx_->gpu_id >= 0 (-1 vs. 0) : Must have at least one device\n').
Traceback (most recent call last):
  File "C:\Users\Prince\anaconda3\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Prince\AppData\Local\Temp\ipykernel_6496\469131784.py", line 2

[01:47:28] AllReduce: 0.001716s, 1 calls @ 1716us

[01:47:28] MakeCuts: 0.001773s, 1 calls @ 1773us

[01:47:28] DEBUG: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0cec3277c4d9d0165-1\xgboost\xgboost-ci-windows\src\gbm\gbtree.cc:130: Using tree method: 5
[01:47:28] DEBUG: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0cec3277c4d9d0165-1\xgboost\xgboost-ci-windows\src\tree\updater_gpu_hist.cu:744: [GPU Hist]: Configure


XGBoostError: [01:47:28] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0cec3277c4d9d0165-1\xgboost\xgboost-ci-windows\src\tree\updater_gpu_hist.cu:781: Exception in gpu_hist: [01:47:28] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0cec3277c4d9d0165-1\xgboost\xgboost-ci-windows\src\tree\updater_gpu_hist.cu:787: Check failed: ctx_->gpu_id >= 0 (-1 vs. 0) : Must have at least one device
