In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.datasets import load_iris, load_diabetes, load_wine
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import BaseDecisionTree, plot_tree
import matplotlib.pyplot as plt
import json
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from TreeExports.iris.tree_trainer import export_names, export_forest_json, export_dataset
from sklearn.utils import shuffle


In [3]:
# Read the wine-quality table from a CSV file located in the working directory.
WINE_CSV = "winequalityN.csv"
df = pd.read_csv(WINE_CSV)

In [4]:
# Discard incomplete rows FIRST. Comparisons against NaN are always False, so
# encoding before the dropna would turn a missing "type" or "quality" into
# class 1, and that row would then survive dropna with a fabricated value.
df = df.dropna()

# Binary encodings:
#   type    -> 0 for "white", 1 for anything else
#   quality -> 0 for scores <= 5, 1 for scores above 5
# assign() builds a new frame instead of writing into the dropna() result.
# NOTE: this cell is not idempotent. Re-running it on the already-encoded frame
# maps every "type" to 1 and every "quality" to 0; reload df before re-running.
df = df.assign(
    type=np.where(df["type"] == "white", 0, 1),
    quality=np.where(df["quality"] <= 5, 0, 1),
)

In [5]:
# Feature matrix: every column of df except the "quality" target.
X = df.drop(columns="quality")

In [6]:
# Target vector: the binarised "quality" column.
y = df["quality"]

In [7]:
# Feature names as a plain Python list, captured before X is shuffled and split.
X_columns = list(X.columns)

In [8]:
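# Alternative input (disabled): scikit-learn's bundled iris dataset.
# Uncomment to replace the wine-quality X, y and X_columns defined above.
# dataset = load_iris()
#
# X = dataset.data
# y = dataset.target
# X_columns = dataset["feature_names"]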

In [9]:
# Permute the rows of X and y consistently with a fixed seed.
# X and y are rebound here, so re-running this cell on its own permutes the
# already-permuted data a second time.
shuffled_X, shuffled_y = shuffle(X, y, random_state=14)
X, y = shuffled_X, shuffled_y

In [10]:
# Hold-out split with a fixed seed; the split proportions are left at the
# library default because no test_size/train_size is passed.
split_parts = train_test_split(X, y, random_state=34)
X_train, X_test, y_train, y_test = split_parts

In [11]:
# A small forest (5 trees) with a fixed seed; verbose=1 produces the
# [Parallel(...)] progress lines shown under this cell.
forest_params = {"n_estimators": 5, "verbose": 1, "random_state": 14}
rf = RandomForestClassifier(**forest_params)

# Keep the return value of fit() under the name `result`, as before.
result = rf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


In [12]:
# Score the fitted forest on the held-out split; the value is the cell output.
rf.score(X_test, y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


0.7858910891089109

In [13]:
# Hand the fitted forest, the feature/class names and both data splits to the
# project's TreeExports helpers (presumably these write files — confirm in
# TreeExports.iris.tree_trainer).
# Stale note kept from an earlier experiment:
#   last_estimator = result['estimator'][-1]
export_forest_json(rf)

# Class labels are passed in the order ["Bad", "Good"]; presumably index 0
# corresponds to quality == 0 and index 1 to quality == 1 — verify in the helper.
export_names(X_columns, ["Bad", "Good"])

# Same export call for the training split, then the test split.
for features, labels, filename in (
    (X_train, y_train, "trainDataset.csv"),
    (X_test, y_test, "testDataset.csv"),
):
    export_dataset(features, labels, X_columns, filename)

In [14]:
# The individual fitted trees of the forest, as stored on `rf`.
trees = rf.estimators_

In [16]:
# Per-row predictions of the first two trees on the training split.
first_tree, second_tree = trees[0], trees[1]
tree1_predictions = first_tree.predict(X_train)
tree2_predictions = second_tree.predict(X_train)

# Number of training rows on which the two trees disagree.
np.count_nonzero(tree1_predictions != tree2_predictions)



838

In [17]:
# Element-wise agreement mask between the two trees' training predictions.
tree1_predictions == tree2_predictions

array([ True,  True,  True, ..., False,  True,  True])

In [18]:
# Positions (in X_train row order) at which the two trees disagree.
np.where(tree1_predictions != tree2_predictions)

(array([  18,   29,   42,   47,   48,   52,   57,   65,   70,   72,   77,
          78,   81,   83,   87,   92,   93,   95,   99,  100,  103,  104,
         107,  110,  111,  115,  119,  123,  133,  134,  135,  154,  158,
         160,  165,  167,  168,  174,  177,  182,  186,  189,  191,  201,
         205,  208,  216,  222,  225,  232,  234,  236,  250,  252,  257,
         258,  264,  268,  274,  278,  280,  282,  285,  289,  300,  308,
         314,  316,  319,  339,  342,  347,  356,  368,  377,  400,  402,
         407,  410,  417,  418,  421,  425,  433,  436,  444,  445,  461,
         462,  465,  466,  471,  475,  483,  487,  489,  496,  501,  507,
         508,  518,  519,  525,  528,  530,  534,  539,  553,  555,  558,
         581,  582,  584,  590,  592,  597,  622,  627,  631,  632,  641,
         643,  648,  649,  658,  660,  664,  676,  681,  684,  685,  699,
         704,  710,  721,  731,  747,  751,  754,  758,  760,  762,  778,
         782,  787,  791,  792,  794, 

In [None]:
# NOTE(review): unexecuted scratch cell of bare integers. 320 is used as a row
# position into X_train in a later cell, so these are presumably row positions
# of interest — confirm, then store them in a named list or delete the cell.
320
544
882
985
1021
1550
2492
2546
2699
3046
3703
3894
4180
4594
4668

In [35]:
# Feature values of the training row at position 320. Positional indexing on
# the converted array ignores the DataFrame index, so no reset_index is needed.
np.array(X_train)[320]

array([0.000e+00, 6.400e+00, 2.600e-01, 2.200e-01, 5.100e+00, 3.700e-02,
       2.300e+01, 1.310e+02, 9.944e-01, 3.290e+00, 3.200e-01, 1.010e+01])

In [47]:
# (rows, columns) of the held-out feature matrix.
X_test.shape

(1616, 12)

In [122]:
# Display the second tree's predictions on the training split.
tree2_predictions

array([1., 1., 1., ..., 1., 1., 1.])

In [22]:
# Depth of the first tree in the forest.
trees[0].get_depth()

24

In [130]:
# Training row at position 30 as a single-sample 2-D array.
# reshape(1, -1) infers the feature count instead of hard-coding 12, so a
# change in the number of columns cannot silently split the row into several
# pseudo-samples. Positional indexing on the converted array ignores the
# DataFrame index, so the reset_index call was redundant.
np.array(X_train)[30].reshape(1, -1)

array([[0.000e+00, 6.800e+00, 2.900e-01, 5.600e-01, 1.190e+01, 4.300e-02,
        6.600e+01, 2.300e+02, 9.972e-01, 3.020e+00, 6.300e-01, 9.300e+00]])

In [133]:
# Split threshold stored at node 1125 of the first tree.
# NOTE(review): 1125 is a hard-coded node id — presumably a node on the path
# of the row inspected in the neighbouring cells; confirm.
trees[0].tree_.threshold[1125]

6.75

In [132]:
# Run the training row at position 30 through the first tree's apply().
# The row is passed as a single-sample 2-D array; reshape(1, -1) infers the
# feature count instead of hard-coding 12, and the redundant reset_index is
# dropped (positional indexing on the array ignores the DataFrame index).
# NOTE(review): the recorded output under this cell is a float array, which
# looks like a class prediction rather than a node index, and the execution
# counts here are out of order — re-run on a fresh kernel to confirm.
trees[0].apply(np.array(X_train)[30].reshape(1, -1))

array([0.])