In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
import numpy as np
import pandas as pd
from sklearn import metrics

In [19]:
help(RandomForestClassifier)

Help on class RandomForestClassifier in module sklearn.ensemble.forest:

class RandomForestClassifier(ForestClassifier)
 |  A random forest classifier.
 |  
 |  A random forest is a meta estimator that fits a number of decision tree
 |  classifiers on various sub-samples of the dataset and use averaging to
 |  improve the predictive accuracy and control over-fitting.
 |  The sub-sample size is always the same as the original
 |  input sample size but the samples are drawn with replacement if
 |  `bootstrap=True` (default).
 |  
 |  Read more in the :ref:`User Guide <forest>`.
 |  
 |  Parameters
 |  ----------
 |  n_estimators : integer, optional (default=10)
 |      The number of trees in the forest.
 |  
 |  criterion : string, optional (default="gini")
 |      The function to measure the quality of a split. Supported criteria are
 |      "gini" for the Gini impurity and "entropy" for the information gain.
 |      Note: this parameter is tree-specific.
 |  
 |  max_features : int, fl

In [20]:
from sklearn import datasets
iris = datasets.load_iris()

data = pd.DataFrame(iris.data, columns=iris.feature_names)
data['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [21]:
data['train'] = np.random.uniform(0, 1, len(data))
err = 0
for i in range(10):
    data_test = data[(data['train'] >= i*0.1) & (data['train'] < (i+1)*0.1)]
    data_train = data[(data['train'] < i*0.1) | (data['train'] >= (i+1)*0.1)]

    y = data_train['species']
    X = data_train.drop('species', axis = 1).drop('train',axis=1)

    t = RandomForestClassifier(n_estimators=30)
    t = t.fit(X, y)
#     print('depth', [i.tree_.max_depth for i in t.estimators_])
    y = data_test['species']
    X = data_test.drop('species', axis = 1).drop('train',axis=1)
    print(t.score(X,y))
    err += t.score(X,y) / 10

print('Average error: ',err)

1.0
0.888888888889
0.875
1.0
0.947368421053
0.909090909091
1.0
1.0
1.0
0.944444444444
Average error:  0.956479266348


In [22]:
data['train'] = np.random.uniform(0, 1, len(data))
err = 0
for i in range(10):
    data_test = data[(data['train'] >= i*0.1) & (data['train'] < (i+1)*0.1)]
    data_train = data[(data['train'] < i*0.1) | (data['train'] >= (i+1)*0.1)]

    y = data_train['species']
    X = data_train.drop('species', axis = 1).drop('train',axis=1)

    t = ExtraTreesClassifier(n_estimators=30)
    t = t.fit(X, y)
#     print('depth', [i.tree_.max_depth for i in t.estimators_])
    y = data_test['species']
    X = data_test.drop('species', axis = 1).drop('train',axis=1)
    print(t.score(X,y))
    err += t.score(X,y) / 10

print('Average error: ',err)

1.0
1.0
0.818181818182
1.0
1.0
0.888888888889
1.0
0.88
1.0
0.928571428571
Average error:  0.951564213564


In [23]:
data = pd.read_csv("/home/dominik/Dokumenty/Studia/Data-mining/Lista5-trees/train.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [24]:
data = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis = 1)
data = data.dropna()
data['Sex'] = pd.Categorical(data['Sex']).codes
data['Embarked'] = pd.Categorical(data['Embarked']).codes
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [25]:
data['train'] = np.random.uniform(0, 1, len(data))
err = 0
for i in range(10):
    data_test = data[(data['train'] >= i*0.1) & (data['train'] < (i+1)*0.1)]
    data_train = data[(data['train'] < i*0.1) | (data['train'] >= (i+1)*0.1)]

    y = data_train['Survived']
    X = data_train.drop('Survived', axis = 1).drop('train',axis=1)

    t = RandomForestClassifier(n_estimators=20, max_depth=10, min_samples_split=10)
    t = t.fit(X, y)
#     print('depth', [i.tree_.max_depth for i in t.estimators_])
    y = data_test['Survived']
    X = data_test.drop('Survived', axis = 1).drop('train',axis=1)
    print(t.score(X,y))
    
    confusion_matrix = metrics.confusion_matrix(y, t.predict(X))
    print ("Confusion Matrix\n", confusion_matrix)
    err += t.score(X,y) / 10

print('Average score: ',err)

0.768115942029
Confusion Matrix
 [[32  7]
 [ 9 21]]
0.842105263158
Confusion Matrix
 [[40  4]
 [ 8 24]]
0.852941176471
Confusion Matrix
 [[38  6]
 [ 4 20]]
0.837837837838
Confusion Matrix
 [[37  7]
 [ 5 25]]
0.828947368421
Confusion Matrix
 [[43  6]
 [ 7 20]]
0.75
Confusion Matrix
 [[42  4]
 [13  9]]
0.861538461538
Confusion Matrix
 [[33  4]
 [ 5 23]]
0.823529411765
Confusion Matrix
 [[37  6]
 [ 6 19]]
0.728395061728
Confusion Matrix
 [[37  6]
 [16 22]]
0.776119402985
Confusion Matrix
 [[32  3]
 [12 20]]
Average score:  0.806952992593


In [26]:
data['train'] = np.random.uniform(0, 1, len(data))
err = 0
for i in range(10):
    data_test = data[(data['train'] >= i*0.1) & (data['train'] < (i+1)*0.1)]
    data_train = data[(data['train'] < i*0.1) | (data['train'] >= (i+1)*0.1)]

    y = data_train['Survived']
    X = data_train.drop('Survived', axis = 1).drop('train',axis=1)

    t = ExtraTreesClassifier(n_estimators=30, min_samples_split=20, max_leaf_nodes=15)
    t = t.fit(X, y)
#     print('depth', [i.tree_.max_depth for i in t.estimators_])
    y = data_test['Survived']
    X = data_test.drop('Survived', axis = 1).drop('train',axis=1)
    print(t.score(X,y))
    confusion_matrix = metrics.confusion_matrix(y, t.predict(X))
    print ("Confusion Matrix\n", confusion_matrix)
    err += t.score(X,y) / 10

print('Average score: ',err)

0.761194029851
Confusion Matrix
 [[35  5]
 [11 16]]
0.891891891892
Confusion Matrix
 [[48  2]
 [ 6 18]]
0.855072463768
Confusion Matrix
 [[45  4]
 [ 6 14]]
0.787878787879
Confusion Matrix
 [[33  4]
 [10 19]]
0.780821917808
Confusion Matrix
 [[39  3]
 [13 18]]
0.75641025641
Confusion Matrix
 [[44  5]
 [14 15]]
0.793650793651
Confusion Matrix
 [[36  3]
 [10 14]]
0.810810810811
Confusion Matrix
 [[41  3]
 [11 19]]
0.805555555556
Confusion Matrix
 [[39  0]
 [14 19]]
0.763157894737
Confusion Matrix
 [[34  1]
 [17 24]]
Average score:  0.800644440236
