In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import manifold
from sklearn import tree
from sklearn.metrics import accuracy_score as acc_rate
from sklearn.model_selection import train_test_split
import graphviz

In [6]:
# Load p2p.csv; the first column of the file becomes the DataFrame index.
data = pd.read_csv(
    filepath_or_buffer='p2p.csv',
    index_col=0,
)

In [7]:
# Keep only rows without missing values, then separate the target from the predictors.
# NOTE(review): this cell rebinds `data`, so running it a second time without
# reloading the CSV raises KeyError ('status' has already been dropped).
data = data.dropna()
y = data['status']
# 'nace' is dropped together with the target, so neither column is used as a feature.
data = data.drop(columns=['status', 'nace'])

In [8]:
# Reduce the feature matrix to 2D with locally linear embedding.
# `data` has already had its incomplete rows removed, so dropna() is a no-op here;
# it must stay a no-op, otherwise the rows of X_r would no longer line up with `y`.
X_r, err = manifold.locally_linear_embedding(
    data.dropna(),
    n_neighbors=15,
    n_components=2,
    random_state=0,
)

# Seed NumPy's global RNG and draw one third as many row indices as there are samples.
# NOTE(review): randint samples WITH replacement, so test_idx can contain duplicates;
# no active cell visible here uses it (only the commented-out manual split does).
np.random.seed(123)
n_samples = len(X_r)
test_idx = np.random.randint(0, n_samples, n_samples // 3)

In [9]:
# Hold out 30% of the embedded samples for testing; the fixed random_state
# makes the partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_r,
    y.values,
    test_size=0.3,
    random_state=123,
)

In [10]:
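# Superseded by the train_test_split call in the cell above; this manual
# index-based split is kept commented out for reference only.
# training data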
#train_data = np.delete(X_r, test_idx, axis=0)
#train_target = np.delete(y.values, test_idx)

# testing data
#test_data = X_r[test_idx]
#test_target = y.values[test_idx]

In [11]:
# Decision Tree model
# Entropy-criterion tree fitted on the 2D embedding. random_state is pinned so
# the fit does not depend on the state of NumPy's global RNG when this cell runs
# (e.g. when it is re-executed on its own).
clf = tree.DecisionTreeClassifier(criterion='entropy', splitter='best', random_state=123)
clf.fit(X=X_train, y=y_train)

# Predict once and reuse the result for both the printout and the score.
y_pred = clf.predict(X_test)
print('\nThe target test data set is:\n', y_test)
print('\nThe predicted result is:\n', y_pred)
print('\nAccuracy rate is:\n', acc_rate(y_test, y_pred))


The target test data set is:
 [0. 0. 1. ... 0. 0. 0.]

The predicted result is:
 [0. 0. 0. ... 0. 0. 0.]

Accuracy rate is:
 0.823916898910565


In [13]:
# Export the fitted tree to Graphviz DOT and render it to a PDF.
# class_names must be a sequence with one label per class, in the order of
# clf.classes_. A bare string such as "status" is indexed character by character
# (the classes would be labelled 's' and 't'), and newer scikit-learn releases
# may reject it outright, so the labels are derived from the fitted classifier.
class_labels = [str(label) for label in clf.classes_]
dot_data = tree.export_graphviz(clf,
                                out_file=None,
                                class_names=class_labels,
                                filled=True,
                                rounded=True,
                                impurity=False,
                                special_characters=True)
graph = graphviz.Source(dot_data)
# Writes the DOT source to "p2p" and the rendering to "p2p.pdf";
# view=True also opens the result in the system's default viewer.
graph.render("p2p", view = True)

'p2p.pdf'