In [1]:
from numpy import array
from sklearn.model_selection import KFold

In [2]:
# data sample
data = array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
data

array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])

In [4]:
# prepare cross validation
kfold = KFold(n_splits = 3, shuffle = True, random_state = 1)

In [5]:
# enumerate splits
for train, test in kfold.split(data):
    print('train: %s, test: %s' % (data[train], data[test]))

train: [0.1 0.4 0.5 0.6], test: [0.2 0.3]
train: [0.2 0.3 0.4 0.6], test: [0.1 0.5]
train: [0.1 0.2 0.3 0.5], test: [0.4 0.6]


In [6]:
# prepare cross validation
kfold = KFold(n_splits = 3, shuffle = False, random_state = None)

In [7]:
# enumerate splits
for train, test in kfold.split(data):
    print('train: %s, test: %s' % (data[train], data[test]))

train: [0.3 0.4 0.5 0.6], test: [0.1 0.2]
train: [0.1 0.2 0.5 0.6], test: [0.3 0.4]
train: [0.1 0.2 0.3 0.4], test: [0.5 0.6]


In [19]:
from sklearn.linear_model import LinearRegression
LinearRegression()

LinearRegression()

In [21]:
from sklearn.ensemble import RandomForestClassifier
RandomForestClassifier()

RandomForestClassifier()

In [None]:
n_estimators = 100,
*,
criterion='gini',
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features='auto',
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
bootstrap=True,
oob_score=False,
n_jobs=None,
random_state=None,
verbose=0,
warm_start=False,
class_weight=None,
ccp_alpha=0.0,
max_samples=None,

In [8]:
import numpy as np
import pandas as pd

In [9]:
df = pd.read_csv('wineQualityReds.csv')

In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
0,1,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,2,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,3,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,4,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,5,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [11]:
X = df.iloc[:, 1:11].astype(int)
y = df.iloc[:, 11].astype(int)

In [12]:
df['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [15]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100)

In [16]:
from sklearn.model_selection import cross_val_score
all_accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=5)

In [17]:
print(all_accuracies)

[0.52734375 0.52734375 0.58203125 0.53515625 0.54117647]


In [18]:
print(all_accuracies.mean())

0.542610294117647


### Grid Search CV

In [22]:
import itertools

p1 = ["A", "B", "C", "D"]
p2 = [True, False]

c = itertools.product(p1, p2)
for i in c:
    print(i)

('A', True)
('A', False)
('B', True)
('B', False)
('C', True)
('C', False)
('D', True)
('D', False)


In [None]:
RandomForestClassifier(
    n_estimators=100,
     criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=None,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,

In [23]:
grid_param = {
    'n_estimators': [100, 300, 500, 800, 1000],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

In [24]:
from sklearn.model_selection import GridSearchCV
import time

gd_sr = GridSearchCV(estimator=classifier,
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)

In [25]:
start = time.time()
gd_sr.fit(X_train, y_train)
print('It Got Executed in', time.time() - start, 'Seconds')

It Got Executed in 125.22409915924072 Seconds


In [26]:
best_parameters = gd_sr.best_params_
print(best_parameters)

{'bootstrap': True, 'criterion': 'gini', 'n_estimators': 300}


In [27]:
best_result = gd_sr.best_score_
print(best_result)

0.547297794117647


In [28]:
clf = RandomForestClassifier(bootstrap = True, criterion = 'gini', n_estimators = 300)
clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=300)

In [29]:
ypred = clf.predict(X_test)

### RandomSearchCV

In [30]:
rf_params={'max_depth':[3,5,10],
              'max_features':(1,2,3,4,5,6),
               'criterion':['gini','entropy'],
               'bootstrap':[True,False],
               'min_samples_leaf':(2,3,5,7,8,9,10)
              }

In [31]:
from sklearn.model_selection import RandomizedSearchCV
gd_sr = RandomizedSearchCV(classifier, rf_params, cv = 5, scoring = "accuracy", random_state = 0, refit = False)

In [32]:
start = time.time()
search = gd_sr.fit(X_train, y_train)
print('It Got Executed in', time.time() - start, 'Seconds')

It Got Executed in 16.42681384086609 Seconds


In [33]:
search.best_params_

{'min_samples_leaf': 2,
 'max_features': 3,
 'max_depth': 10,
 'criterion': 'entropy',
 'bootstrap': False}

In [34]:
best_result = search.best_score_
print(best_result)

0.5207107843137255
