In [166]:
import pandas as pd
import numpy as np

# Загрузка и подготовка данных

In [167]:
#импортируем подготовленный датасет
pokemon_df = pd.read_csv('../data/pokemon_preprocessed.csv')

In [168]:
#убеждаемся, что загрузка прошла успешно
pokemon_df.head()

Unnamed: 0,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Bug,...,Ghost,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water
0,318,45,49,49,65,65,45,1,0,0,...,0,1,0,0,0,1,0,0,0,0
1,405,60,62,63,80,80,60,1,0,0,...,0,1,0,0,0,1,0,0,0,0
2,525,80,82,83,100,100,80,1,0,0,...,0,1,0,0,0,1,0,0,0,0
3,625,80,100,123,122,120,80,1,0,0,...,0,1,0,0,0,1,0,0,0,0
4,309,39,52,43,60,50,65,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [169]:
#нормируем вещественные признаки
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(pokemon_df.iloc[:,0:7])
scaled_df = pd.DataFrame(np.array(scaler.transform(pokemon_df.iloc[:,0:7])), columns = ["Total","HP","Attack","Defense","Sp_Atk","Sp_Def","Speed"], index = pokemon_df.index)

In [170]:
pokemon_df = pd.concat([scaled_df, pokemon_df.iloc[:,7:]], axis = "columns")
pokemon_df

Unnamed: 0,Total,HP,Attack,Defense,Sp_Atk,Sp_Def,Speed,Generation,Legendary,Bug,...,Ghost,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water
0,-0.976765,-0.950626,-0.924906,-0.797154,-0.239130,-0.248189,-0.801503,1,0,0,...,0,1,0,0,0,1,0,0,0,0
1,-0.251088,-0.362822,-0.524130,-0.347917,0.219560,0.291156,-0.285015,1,0,0,...,0,1,0,0,0,1,0,0,0,0
2,0.749845,0.420917,0.092448,0.293849,0.831146,1.010283,0.403635,1,0,0,...,0,1,0,0,0,1,0,0,0,0
3,1.583957,0.420917,0.647369,1.577381,1.503891,1.729409,0.403635,1,0,0,...,0,1,0,0,0,1,0,0,0,0
4,-1.051836,-1.185748,-0.832419,-0.989683,-0.392027,-0.787533,-0.112853,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,1.375429,-0.754692,0.647369,2.443765,0.831146,2.808099,-0.629341,6,1,0,...,0,0,0,0,0,0,0,1,0,0
796,2.209541,-0.754692,2.497104,1.160233,2.665905,1.369846,1.436611,6,1,0,...,0,0,0,0,0,0,0,1,0,0
797,1.375429,0.420917,0.955658,-0.444182,2.360112,2.088973,0.059310,6,1,0,...,1,0,0,0,0,0,1,0,0,0
798,2.042718,0.420917,2.497104,-0.444182,2.971699,2.088973,0.403635,6,1,0,...,0,0,0,0,0,0,1,0,0,0


In [171]:
#создадим датафреймы X и y
y = pokemon_df.Legendary
pokemon_df = pokemon_df.drop(['Legendary'], axis=1)
X = pokemon_df

In [172]:
from sklearn.model_selection import train_test_split
#разбиваем всю выборку на обучающую и тестовую
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 2)

In [173]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy


# Basic Random Forest Classifier

In [174]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import regression_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier

In [175]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier()

In [176]:
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[146   3]
 [  0  11]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       149
           1       0.79      1.00      0.88        11

    accuracy                           0.98       160
   macro avg       0.89      0.99      0.93       160
weighted avg       0.99      0.98      0.98       160



# Basic Random Forest Classifier Hyperparameter Tuning

In [177]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

In [178]:
base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

Model Performance
Average Error: 0.0375 degrees.
Accuracy = -inf%.


In [179]:
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [180]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

Fitting 2 folds for each of 288 candidates, totalling 576 fits


KeyboardInterrupt: 

# Gradient Boosting Classifier

In [None]:
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Gradient Boosting Classifier Hyperparameter Tuning

In [None]:
parameters = {
    "n_estimators":[5,50,250,500],
    "max_depth":[1,3,5,7,9],
    "learning_rate":[0.01,0.1,1,10,100]
}

In [None]:
grid_search = GridSearchCV(estimator = clf, param_grid = parameters, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

# Random Forest Stacking Classifier 

In [None]:
estimators = [
     ('rf', RandomForestClassifier(n_estimators=10, random_state=42))]
clf = StackingClassifier(
     estimators=estimators, final_estimator=LogisticRegression())

In [None]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Random Forest Stacking Classifier Hyperparameter Tuning

In [None]:
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

# Random Forest Bagging Classifier 

In [None]:
clf = BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=10, random_state=42), n_estimators=10, random_state=0)
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Random Forest Bagging Classifier Hyperparameter Tuning

In [None]:
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))