In [1]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

# IRIS Flower Dataset

In [2]:
iris = datasets.load_iris()
X_train, y_train = iris.data[:, 1:3], iris.target

In [88]:
X_train.shape

(150, 2)

# TRAIN MANY CLASSIFIER

In [89]:
logClf = LogisticRegression(solver='lbfgs', multi_class='multinomial',random_state=1)
rfClf = RandomForestClassifier(n_estimators=50, random_state=1)
gaussClf = GaussianNB()


# VOTING

In [63]:
eclf = VotingClassifier(estimators=[('lr', logClf), ('rf', rfClf), ('gnb', gaussClf)], voting='hard')

In [64]:
eclf.fit(X_train,y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=1, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)), ('rf', Ran...andom_state=1, verbose=0, warm_start=False)), ('gnb', GaussianNB(priors=None, var_smoothing=1e-09))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

# Accuracy 

In [65]:
for clf, label in zip([logClf, rfClf, gaussClf, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
...     scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
...     print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))



Accuracy: 0.95 (+/- 0.04) [Logistic Regression]
Accuracy: 0.94 (+/- 0.04) [Random Forest]
Accuracy: 0.91 (+/- 0.04) [naive Bayes]
Accuracy: 0.95 (+/- 0.04) [Ensemble]


# Soft Voting

In [67]:
softeclf = VotingClassifier(estimators=[('lr', logClf), ('rf', rfClf), ('gnb', gaussClf)], voting='soft',weights=[1, 3, 3])

In [68]:
for pclf, label in zip([logClf, rfClf, gaussClf, softeclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
...     scores = cross_val_score(pclf, X_train, y_train, cv=5, scoring='accuracy')
...     print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
...     print(scores)

Accuracy: 0.95 (+/- 0.04) [Logistic Regression]
[0.93333333 1.         0.9        0.93333333 1.        ]
Accuracy: 0.94 (+/- 0.04) [Random Forest]
[0.93333333 1.         0.9        0.9        0.96666667]
Accuracy: 0.91 (+/- 0.04) [naive Bayes]
[0.86666667 0.93333333 0.86666667 0.93333333 0.96666667]
Accuracy: 0.95 (+/- 0.03) [Ensemble]
[0.93333333 0.96666667 0.9        0.93333333 1.        ]


In [69]:
softeclf.fit(X_train,y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=1, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)), ('rf', Ran...andom_state=1, verbose=0, warm_start=False)), ('gnb', GaussianNB(priors=None, var_smoothing=1e-09))],
         flatten_transform=None, n_jobs=None, voting='soft',
         weights=[1, 3, 3])

# Bagging

In [70]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier



In [71]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    max_samples=50, bootstrap=True, n_jobs=-1, random_state=42)
bag_clf.fit(X_train, y_train)



BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=50, n_estimators=500, n_jobs=-1, oob_score=False,
         random_state=42, verbose=0, warm_start=False)

In [72]:
scores = cross_val_score(bag_clf, X_train, y_train, 
                         cv=5, scoring='accuracy')
print(scores.mean())

0.9533333333333334


# PASTING

In [73]:
past_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    max_samples=50, bootstrap=False, n_jobs=-1, random_state=42)
past_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
         bootstrap=False, bootstrap_features=False, max_features=1.0,
         max_samples=50, n_estimators=500, n_jobs=-1, oob_score=False,
         random_state=42, verbose=0, warm_start=False)

In [74]:
scores = cross_val_score(past_clf, X_train, y_train, cv=5, scoring='accuracy')
print(scores.mean())

0.9533333333333334


# Out of bag

In [75]:
oobag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    max_samples=50, bootstrap=True, n_jobs=-1, random_state=42,oob_score=True)
oobag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=50, n_estimators=500, n_jobs=-1, oob_score=True,
         random_state=42, verbose=0, warm_start=False)

In [76]:
oobag_clf.oob_score_

0.94

In [81]:

random_forest = RandomForestClassifier(n_estimators=100,oob_score=True)
random_forest.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [82]:
random_forest.score(X_train, y_train)

0.9866666666666667

In [83]:
random_forest.oob_score_

0.9333333333333333

In [85]:
random_forest.feature_importances_

array([0.1734387, 0.8265613])

In [5]:
import pandas as pd
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# load dataset into Pandas DataFrame
df = pd.read_csv(url, names=['sepal length','sepal width','petal length','petal width','target'])

In [7]:
df.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
from sklearn.preprocessing import StandardScaler
features = ['sepal length', 'sepal width', 'petal length', 'petal width']
# Separating out the features
x = df.loc[:, features].values
# Separating out the target
y = df.loc[:,['target']].values
# Standardizing the features
x = StandardScaler().fit_transform(x)

In [8]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)

principalComponents = pca.fit_transform(x)

principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])

In [9]:
finalDf = pd.concat([principalDf, df[['target']]], axis = 1)

In [10]:
finalDf.head()

Unnamed: 0,principal component 1,principal component 2,target
0,-2.264542,0.505704,Iris-setosa
1,-2.086426,-0.655405,Iris-setosa
2,-2.36795,-0.318477,Iris-setosa
3,-2.304197,-0.575368,Iris-setosa
4,-2.388777,0.674767,Iris-setosa


In [16]:
pca.explained_variance_ratio_

array([0.72770452, 0.23030523])

In [20]:
#import seaborn as sns