# Tarea - Parte 1

Utiliza la base de datos de IRIS para generar un voting classifier hard y uno soft. ¿Cuál es el mejor en esta base de datos?

In [1]:
import sklearn
import numpy as np

# Seed NumPy's global random number generator so outputs are stable across
# runs. The split and the estimators below also pass random_state=42 explicitly.
np.random.seed(42)

In [2]:
from sklearn.datasets import load_iris

# Load the Iris dataset bundled with scikit-learn. The returned object is
# indexed in later cells by 'feature_names', 'data' and 'target'.
iris = load_iris()

In [3]:
# Show the column names so the feature selection in the next cell is readable.
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [4]:
# Keep only columns 1 and 2 of the feature matrix: sepal width and petal
# length, per the feature_names listing. y holds the class labels.
X = iris.data[:, 1:3]
y = iris.target

In [5]:
X.shape  # (n_samples, n_features) after keeping two of the four columns

(150, 2)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for evaluation; the fixed random_state keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

In [7]:
X_train.shape  # sanity check: training portion of the 70/30 split

(105, 2)

In [8]:
X_test.shape  # sanity check: held-out portion of the 70/30 split

(45, 2)

In [9]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Three base learners shared by the hard and soft voting ensembles below.
# All of them are seeded so the reported accuracies are reproducible.
log_clf = LogisticRegression(
    solver='liblinear',
    multi_class='ovr',
    random_state=42,
)
rnd_clf = RandomForestClassifier(n_estimators=5, random_state=42)
# probability=True makes SVC expose predict_proba, which soft voting needs.
svm_clf = SVC(gamma='auto', probability=True, random_state=42)

### <font color=#50AAF5>Hard voting classifier</font>

In [10]:
# Hard voting: the ensemble predicts the class chosen by the majority of
# the three base estimators.
voting_clf_hard = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard',
)

In [11]:
# Train the hard-voting ensemble; the trailing ';' hides the estimator repr.
# NOTE(review): the evaluation loop in the next cell fits this ensemble
# again, so this call looks redundant.
voting_clf_hard.fit(X_train, y_train);

In [12]:
from sklearn.metrics import accuracy_score

# Fit every base learner and the hard-voting ensemble on the training split,
# then print each estimator's class name next to its test-set accuracy.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf_hard):
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.9111111111111111
RandomForestClassifier 0.9111111111111111
SVC 0.9777777777777777
VotingClassifier 0.9555555555555556


### <font color=#50AAF5>Soft voting classifier</font>

In [13]:
# Soft voting: the ensemble averages the class probabilities predicted by
# the three base estimators and picks the most probable class.
voting_clf_soft = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft',
)

In [14]:
# Train the soft-voting ensemble; the trailing ';' hides the estimator repr.
# NOTE(review): the evaluation loop in the next cell fits this ensemble
# again, so this call looks redundant.
voting_clf_soft.fit(X_train, y_train);

In [15]:
# Same comparison as for hard voting, now with the soft-voting ensemble:
# refit each estimator and print its class name and test-set accuracy.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf_soft):
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.9111111111111111
RandomForestClassifier 0.9111111111111111
SVC 0.9777777777777777
VotingClassifier 0.9777777777777777


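**Soft voting obtuvo un mejor desempeño: 0.978 de accuracy frente a 0.956 del hard voting, igualando al mejor clasificador individual (SVC). Con 45 muestras de prueba, esa diferencia equivale a una sola muestra.**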

# Tarea - Parte 2

Realiza un clasificador para IRIS utilizando `BaggingClassifier(base_estimator=TUCLASIFICADOR)` que tenga `n_estimators=100`; puedes elegir cualquier clasificador de tu preferencia.

In [16]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier  # not used in this cell

# Bagging ensemble of 100 random forests, each trained on a bootstrap sample
# of 80 training rows.
#
# The base estimator is passed positionally on purpose: scikit-learn 1.2
# renamed the `base_estimator` keyword to `estimator` and 1.4 removed the old
# name, so either keyword fails on some versions. The first positional
# parameter is the base estimator in both the old and the new signature.
bag_clf = BaggingClassifier(
    RandomForestClassifier(random_state=42),
    n_estimators=100,
    max_samples=80,   # an int means an absolute number of rows per estimator
    bootstrap=True,   # sample rows with replacement
    n_jobs=-1,        # use all available cores
    random_state=42,
)

In [17]:
# Train the bagging ensemble on the Iris training split; the trailing ';'
# hides the estimator repr.
bag_clf.fit(X_train, y_train);

In [18]:
# Predict labels for the held-out Iris samples with the bagging ensemble.
y_pred = bag_clf.predict(X_test)

In [19]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of the bagging ensemble.
print(accuracy_score(y_test, y_pred))

0.9555555555555556


# Tarea - Parte 3

Utiliza la base de datos del vino para generar un clasificador de random forest y evalúa las importancias de sus variables.

In [20]:
from sklearn.datasets import load_wine

# Load the wine dataset bundled with scikit-learn; its 'data', 'target' and
# 'feature_names' fields are used in the cells below.
data_wine = load_wine()

In [21]:
dir(data_wine)  # list the fields available on the loaded dataset object

['DESCR', 'data', 'feature_names', 'target', 'target_names']

In [22]:
# Feature matrix of the wine dataset.
# NOTE: this rebinds X, which held the Iris features in Part 1.
X = data_wine.data
X.shape

(178, 13)

In [23]:
# Class labels of the wine dataset.
# NOTE: this rebinds y, which held the Iris labels in Part 1.
y = data_wine.target
y.shape

(178,)

In [24]:
from sklearn.model_selection import train_test_split

# 70/30 split of the wine data with a fixed seed. This replaces the Iris
# train/test arrays from the earlier parts of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 small trees (at most 5 leaves each), trained in
# parallel on the wine training split; fit() returns the fitted estimator.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_leaf_nodes=5,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Predictions on the held-out wine samples.
y_pred_rf = rf_clf.predict(X_test)

In [26]:
from sklearn import metrics

# Recompute the test-set predictions and print per-class precision, recall
# and F1 together with the overall accuracy.
y_pred = rf_clf.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       1.00      0.95      0.98        21
           2       1.00      1.00      1.00        14

    accuracy                           0.98        54
   macro avg       0.98      0.98      0.98        54
weighted avg       0.98      0.98      0.98        54



In [27]:
# Fraction of test samples classified correctly (1.0 would be 100%).
score = accuracy_score(y_test, y_pred)
score

0.9814814814814815

In [28]:
import pandas as pd

# Pair each wine feature name with the importance the fitted random forest
# assigned to it.
df = pd.DataFrame(
    {
        "Feature_names": data_wine.feature_names,
        "Importances": rf_clf.feature_importances_,
    }
)

In [29]:
# Rank the features from most to least important.
df.sort_values("Importances", ascending=False)

Unnamed: 0,Feature_names,Importances
9,color_intensity,0.183262
6,flavanoids,0.176255
0,alcohol,0.143968
12,proline,0.126772
10,hue,0.091406
11,od280/od315_of_diluted_wines,0.088347
5,total_phenols,0.065115
4,magnesium,0.033257
1,malic_acid,0.029399
8,proanthocyanins,0.027257
