In [1]:
import dgl
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import json
from py2neo import Graph,Node,NodeMatcher
import pandas as pd
import dgl.function as fn
import numpy as np
from __future__ import division, print_function
import progressbar
import matplotlib.pyplot as plt
import shelve
from base.GCN.GraphSAGE import GCN,Classifier,Model
from base.GCN.XGboost import XGBoost
graph = Graph("http://47.95.159.86/:7474",auth=("neo4j","06240118"))

Using backend: pytorch


In [2]:
# 获取国家体育馆4跳之内的路名节点
query = "match (p:gym {name:'国家体育馆'})-[edge*1..4]->(q:road) return id(q) as qid,q.name as name;"
res = graph.run(query).data()
nodesid = set()
name2id = {}
id2name = {}
for row in res:
    nodesid.add(row['qid'])
    name2id[row['name']] = row['qid']
    id2name[row['qid']] = row['name']

In [3]:
# 找到各个路段
query = "match (p:road)-->(q:road) where id(q) in " + str(list(nodesid)) + " and id(p) in " + str(list(nodesid)) + " return id(p) as pid,id(q) as qid;"
res = graph.run(query).data()
road_starts = []
road_ends = []
for row in res:
    road_starts.append(row['pid'])
    road_ends.append(row['qid'])


In [4]:
# 找到与gym直接相连的路段
query = "match (p:gym)-->(q:road) where id(p)=0 return id(p) as pid,id(q) as qid;"
res = graph.run(query).data()
gym_starts = []
gym_ends = []
for row in res:
    gym_starts.append(row['pid'])
    gym_ends.append(row['qid'])


In [5]:
# 找到路段默认等级、限速
level2speed = {"1":100,"2":100,"3":80,"4":80,"5":60,"6":60}
query = "match (p:road) where id(p) in " + str(list(nodesid)) + " return id(p) as pid,p.road_level as level;"
res = graph.run(query).data()
id2level = {}
for row in res:
    id2level[row['pid']] = row['level']


In [6]:
# 构建gym和road两类节点的异质图
graph_data = {
   ('road', 'direct', 'gym'): (torch.tensor(gym_ends),torch.tensor(gym_starts)),
   ('road', 'link', 'road'): (torch.tensor(road_starts), torch.tensor(road_ends))
}
g = dgl.heterograph(graph_data)


In [7]:
# 数据组成
data_file = "./data.json"
data = []
label = []
with open(data_file,'r') as fd:
        content = json.load(fd)
        for item in content:
            dic = {}
            for key in item.keys():
                if key in name2id:
                    dic[name2id[key]] = item[key]
                if key == 'label':
                    label.append(item[key])
            data.append(dic)

In [8]:
# 按id顺序组装数据
x_data = []  # data_num * node_num
NODE_NUM = len(g.nodes('road'))
ITEM_LEN = 6
for item in data:
    data_item = []
    for i in range(NODE_NUM):
        if i in item:
            if i in id2level:
                new_item = [x if x != -1 else level2speed[id2level[i]] for x in item[i]]
            else:
                new_item = [60 for x in item[i]]
            data_item.append(new_item)
        else:
            if i in id2level:
                default_item = [level2speed[id2level[i]] for _ in range(ITEM_LEN)]
            else:
                default_item = [60 for _ in range(ITEM_LEN)]
            data_item.append(default_item)
    x_data.append(data_item)

In [9]:
input_data = torch.FloatTensor(x_data)
_train_labels = [1,2,3]
onehot_encoded = list()
for value in label:
    letter = [0 for _ in range(len(_train_labels))]
    letter[value-1] = 1
    onehot_encoded.append(letter)

In [10]:
for i in range(input_data.shape[0]):
    g.nodes['road'].data['speed'] = input_data[i]
    funcs = {}
    funcs['link'] = (fn.copy_u('speed', 'm'), fn.mean('m', 'h_1'))
    g.multi_update_all(funcs, 'sum')
    funcs = {}
    funcs['link'] = (fn.copy_u('h_1', 'm'), fn.mean('m', 'h_2'))
    g.multi_update_all(funcs, 'sum')
    funcs = {}
    funcs['link'] = (fn.copy_u('h_2', 'm'), fn.mean('m', 'h_3'))
    g.multi_update_all(funcs, 'sum')
    funcs = {}
    funcs['direct'] = (fn.copy_u('h_3', 'f'), fn.mean('f', 'level'))
    g.multi_update_all(funcs, 'sum')
    level = torch.sum(g.nodes['gym'].data['level'] - 70)
    traffic_level = 1
    if float(level) < 13:
        traffic_level = 2
    if float(level) < 12:
        traffic_level = 3
    label[i] = traffic_level
print(label.count(1),label.count(2),label.count(3))

462 220 7


In [11]:
res = []
for v in label:
    item = []
    item.append(v-1)
    res.append(item)
train_label = torch.LongTensor(res)

In [12]:
# class GCN(nn.Module):
#     def __init__(self):
#         super(GCN, self).__init__()
#         self.fc1 = nn.Linear(12, 12)
#         self.fc2 = nn.Linear(12, 6)
#         self.sc1 = nn.Linear(12, 6)
#         self.bn1 = nn.BatchNorm1d(12)
#         self.ac1 = nn.LeakyReLU()

#         self.fc3 = nn.Linear(12, 12)
#         self.fc4 = nn.Linear(12, 6)
#         self.sc2 = nn.Linear(12, 6)
#         self.bn2 = nn.BatchNorm1d(12)
#         self.ac2 = nn.LeakyReLU()
        
#         self.fc5 = nn.Linear(12, 12)
#         self.fc6 = nn.Linear(12, 6)
#         self.sc3 = nn.Linear(12, 6)
#         self.bn3 = nn.BatchNorm1d(12)
#         self.ac3 = nn.LeakyReLU()
#     def forward(self, g, h):
#         g.nodes['road'].data['speed'] = h
        
#         funcs = {}
#         funcs['link'] = (fn.copy_u('speed', 'm'), fn.mean('m', 'h_1'))
#         g.multi_update_all(funcs, 'mean')
        
#         h1_ = g.ndata['h_1']['road']
        
#         h1 = self.fc1(torch.cat([h, h1_], dim=1))
#         x = h1
#         h1 = self.bn1(h1)
#         h1 = self.ac1(h1)
#         h1 = self.fc2(h1) + self.sc1(x)
        
        
#         g.nodes['road'].data['h_1'] = h1
#         funcs = {}
#         funcs['link'] = (fn.copy_u('h_1', 'm'), fn.mean('m', 'h_2'))
#         g.multi_update_all(funcs, 'mean')
        
#         h2_ = g.ndata['h_2']['road']
        
#         h2 = self.fc3(torch.cat([h1, h2_], dim=1))
#         x = h2
#         h2 = self.bn2(h2)
#         h2 = self.ac2(h2)
#         h2 = self.fc4(h2) + self.sc2(x)
        
#         g.nodes['road'].data['h_2'] = h2
#         funcs = {}
#         funcs['link'] = (fn.copy_u('h_2', 'm'), fn.mean('m', 'h_3'))
#         g.multi_update_all(funcs, 'mean')
        
#         h3_ = g.ndata['h_3']['road']
        
#         h3 = self.fc5(torch.cat([h2, h3_], dim=1))
#         x = h3
#         h3 = self.bn3(h3)
#         h3 = self.ac3(h3)
#         h3 = self.fc6(h3) + self.sc3(x)
        
#         g.nodes['road'].data['h_3'] = h3
#         funcs = {}
#         funcs['direct'] = (fn.copy_u('h_3', 'f'), fn.mean('f', 'level'))
#         g.multi_update_all(funcs, 'sum')
        
#         return g.nodes['gym'].data['level']
    
# class Classifier(nn.Module):
#     def __init__(self):
#         super(Classifier, self).__init__()
#         self.classifier = nn.Linear(6, 3)
#     def forward(self,h):
#         res = self.classifier(h)
#         return res
    
# class Model(nn.Module):
#     def __init__(self,gcn,classifier):
#         super(Model, self).__init__()
#         self.gcn = gcn
#         self.classifier = classifier
#     def forward(self,g,h):
#         res = self.classifier(self.gcn(g,h))
#         return res

In [13]:
def train(epoch,g, model,input_data,input_label):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=40, gamma=0.1)
    p_num = 0
    t_num = 0
    pre_acc = 0
    for ep in range(epoch):
        for e in range(input_data.shape[0]):
            # Forward
            logits = model(g, input_data[e])

            # Compute prediction
            pred = logits.argmax(1)

            # Compute loss
            # Note that we should only compute the losses of the nodes in the training set,
            # i.e. with train_mask 1.
            loss = F.cross_entropy(logits, input_label[e])

            # Compute accuracy on training/validation/test
            train_acc = (pred == train_label[e]).float().mean()
            p_num += (pred == train_label[e]).float()
            t_num += 1


            # Backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        acc = float(p_num / t_num * 100)
        print('acc: {:.3f}%'.format(acc))
    torch.save(model.state_dict(), './model/GraphSAGE_pram.pkl')
    print("better model is saved")

In [14]:
gcn = GCN()
classifier = Classifier()
model = Model(gcn,classifier)

In [15]:
train(100,g,model,input_data,train_label)

acc: 67.344%
acc: 72.424%
acc: 69.473%
acc: 67.671%
acc: 66.589%
acc: 66.304%
acc: 66.038%
acc: 65.729%
acc: 66.280%
acc: 66.183%
acc: 66.117%
acc: 65.989%
acc: 66.071%
acc: 65.986%
acc: 65.893%
acc: 65.620%
acc: 65.440%
acc: 65.602%
acc: 65.396%
acc: 65.174%
acc: 65.747%
acc: 65.662%
acc: 65.615%
acc: 65.469%
acc: 65.393%
acc: 65.239%
acc: 65.237%
acc: 65.131%
acc: 65.052%
acc: 64.998%
acc: 65.190%
acc: 65.208%
acc: 65.171%
acc: 65.500%
acc: 65.354%
acc: 65.393%
acc: 65.469%
acc: 65.465%
acc: 65.364%
acc: 65.370%
acc: 65.372%
acc: 65.305%
acc: 65.356%
acc: 65.540%
acc: 65.506%
acc: 65.498%
acc: 65.436%
acc: 65.418%
acc: 65.386%
acc: 65.405%
acc: 65.528%
acc: 65.600%
acc: 65.619%
acc: 65.629%
acc: 65.645%
acc: 65.584%
acc: 65.549%
acc: 65.505%
acc: 65.499%
acc: 65.416%
acc: 65.388%
acc: 65.385%
acc: 65.379%
acc: 65.507%
acc: 65.538%
acc: 65.651%
acc: 65.776%
acc: 65.829%
acc: 65.853%
acc: 65.884%
acc: 65.839%
acc: 65.800%
acc: 65.759%
acc: 65.832%
acc: 65.811%
acc: 65.770%
acc: 65.861%

In [16]:
mid_data = []
for x in range(input_data.shape[0]):
    logits = model(g, input_data[x])
    pred = logits.argmax(1)
#     print(pred,label[x])
    level = gcn(g, input_data[x])
    mid_data.append(level[0].tolist())
for lis in mid_data:
    print(sum(lis))

-7.769532233476639
-7.8545286655426025
-7.832947313785553
-7.800883859395981
-7.790611028671265
-7.793244004249573
-7.818482607603073
-7.838344186544418
-7.875929981470108
-7.868527680635452
-7.862983614206314
-7.866086393594742
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.849625051021576
-7.829213082790375
-7.889557600021362
-7.838497459888458
-7.879251480102539
-7.882717281579971
-7.850777834653854
-7.912915229797363
-7.865494638681412
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.785492449998856
-7.789125204086304
-7.792875200510025
-7.785986661911011
-7.842447102069855
-7.879643529653549
-7.871657699346542
-7.808419167995453
-7.79306373000145
-7.8910729587078094
-7.835995674133301
-7.797374576330185
-7.80087479

In [17]:
# load_model = torch.load("./model/GraphSAGE.pkl")
gcn = GCN()
classifier = Classifier()
load_model = Model(gcn,classifier)
load_model.load_state_dict(torch.load("./model/GraphSAGE_pram.pkl"))
# load_model.eval()
for x in range(input_data.shape[0]):
    level = load_model.gcn(g, input_data[x])
    print(sum(level[0].tolist()))

-7.769532233476639
-7.8545286655426025
-7.832947313785553
-7.800883859395981
-7.790611028671265
-7.793244004249573
-7.818482607603073
-7.838344186544418
-7.875929981470108
-7.868527680635452
-7.862983614206314
-7.866086393594742
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.849625051021576
-7.829213082790375
-7.889557600021362
-7.838497459888458
-7.879251480102539
-7.882717281579971
-7.850777834653854
-7.912915229797363
-7.865494638681412
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.902949333190918
-7.785492449998856
-7.789125204086304
-7.792875200510025
-7.785986661911011
-7.842447102069855
-7.879643529653549
-7.871657699346542
-7.808419167995453
-7.79306373000145
-7.8910729587078094
-7.835995674133301
-7.797374576330185
-7.80087479

In [18]:
# bar_widgets = [
#     'Training: ', progressbar.Percentage(), ' ', progressbar.Bar(marker="-", left="[", right="]"),
#     ' ', progressbar.ETA()
# ]

def mean_squared_error(y_true, y_pred):
    """ Returns the mean squared error between y_true and y_pred """
    mse = np.mean(np.power(y_true - y_pred, 2))
    return mse

def accuracy_score(y_true, y_pred):
    """ Compare y_true to y_pred and return the accuracy """
    
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

def train_test_split(X, y, test_size=0.5, shuffle=True, seed=None):
    """ Split the data into train and test sets """
    if shuffle:
        X, y = shuffle_data(X, y, seed)
    # Split the training data from test data in the ratio specified in
    # test_size
    split_i = len(y) - int(len(y) // (1 / test_size))
    X_train, X_test = X[:split_i], X[split_i:]
    y_train, y_test = y[:split_i], y[split_i:]

    return X_train, X_test, y_train, y_test

# def divide_on_feature(X, feature_i, threshold):
#     """ Divide dataset based on if sample value on feature index is larger than
#         the given threshold """
#     split_func = None
#     if isinstance(threshold, int) or isinstance(threshold, float):
#         split_func = lambda sample: sample[feature_i] >= threshold
#     else:
#         split_func = lambda sample: sample[feature_i] == threshold

#     X_1 = np.array([sample for sample in X if split_func(sample)])
#     X_2 = np.array([sample for sample in X if not split_func(sample)])

#     return np.array([X_1, X_2])

def shuffle_data(X, y, seed=None):
    """ Random shuffle of the samples in X and y """
    if seed:
        np.random.seed(seed)
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    return X[idx], y[idx]
def pred2label(old):
    new = []
    for v in old:
        new.append(round(v[0]))
    return np.array(new)

In [19]:
# class DecisionNode():
#     """Class that represents a decision node or leaf in the decision tree

#     Parameters:
#     -----------
#     feature_i: int
#         Feature index which we want to use as the threshold measure.
#     threshold: float
#         The value that we will compare feature values at feature_i against to
#         determine the prediction.
#     value: float
#         The class prediction if classification tree, or float value if regression tree.
#     true_branch: DecisionNode
#         Next decision node for samples where features value met the threshold.
#     false_branch: DecisionNode
#         Next decision node for samples where features value did not meet the threshold.
#     """

#     def __init__(self, feature_i=None, threshold=None,
#                  value=None, true_branch=None, false_branch=None):
#         self.feature_i = feature_i  # Index for the feature that is tested
#         self.threshold = threshold  # Threshold value for feature
#         self.value = value  # Value if the node is a leaf in the tree
#         self.true_branch = true_branch  # 'Left' subtree
#         self.false_branch = false_branch  # 'Right' subtree
        
# class DecisionTree(object):
#     """Super class of RegressionTree and ClassificationTree.

#     Parameters:
#     -----------
#     min_samples_split: int
#         The minimum number of samples needed to make a split when building a tree.
#     min_impurity: float
#         The minimum impurity required to split the tree further.
#     max_depth: int
#         The maximum depth of a tree.
#     loss: function
#         Loss function that is used for Gradient Boosting models to calculate impurity.
#     """

#     def __init__(self, min_samples_split=2, min_impurity=1e-7,
#                  max_depth=float("inf"), loss=None):
#         self.root = None  # Root node in dec. tree
#         # Minimum n of samples to justify split
#         self.min_samples_split = min_samples_split
#         # The minimum impurity to justify split
#         self.min_impurity = min_impurity
#         # The maximum depth to grow the tree to
#         self.max_depth = max_depth
#         # Function to calculate impurity (classif.=>info gain, regr=>variance reduct.)
#         # 切割树的方法，gini，方差等
#         self._impurity_calculation = None
#         # Function to determine prediction of y at leaf
#         # 树节点取值的方法，分类树：选取出现最多次数的值，回归树：取所有值的平均值
#         self._leaf_value_calculation = None
#         # If y is one-hot encoded (multi-dim) or not (one-dim)
#         self.one_dim = None
#         # If Gradient Boost
#         self.loss = loss

#     def fit(self, X, y, loss=None):
#         """ Build decision tree """
#         self.one_dim = len(np.shape(y)) == 1
#         self.root = self._build_tree(X, y)
#         self.loss = None

#     def _build_tree(self, X, y, current_depth=0):
#         """ Recursive method which builds out the decision tree and splits X and respective y
#         on the feature of X which (based on impurity) best separates the data"""
#         largest_impurity = 0
#         best_criteria = None  # Feature index and threshold
#         best_sets = None  # Subsets of the data

#         # Check if expansion of y is needed  [1,2,3,4,5] -> [[1],[2],[3],[4],[5]]
#         if len(np.shape(y)) == 1:
#             y = np.expand_dims(y, axis=1)

#         # Add y as last column of X
#         Xy = np.concatenate((X, y), axis=1)

#         n_samples, n_features = np.shape(X)
#         # 节点样本数大于节点阈值并且没有达到最大树深
#         if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
#             # Calculate the impurity for each feature
#             for feature_i in range(n_features):
#                 # All values of feature_i
#                 feature_values = np.expand_dims(X[:, feature_i], axis=1)
#                 unique_values = np.unique(feature_values)

#                 # Iterate through all unique values of feature column i and
#                 # calculate the impurity
#                 for threshold in unique_values:
#                     # Divide X and y depending on if the feature value of X at index feature_i
#                     # meets the threshold
#                     # 根据分割值分割样本
#                     Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold)

#                     if len(Xy1) > 0 and len(Xy2) > 0:
#                         # Select the y-values of the two sets
#                         # 拿到两部分的label
#                         y1 = Xy1[:, n_features:]
#                         y2 = Xy2[:, n_features:]

#                         # Calculate impurity
#                         impurity = self._impurity_calculation(y, y1, y2)

#                         # If this threshold resulted in a higher information gain than previously
#                         # recorded save the threshold value and the feature
#                         # index
#                         if impurity > largest_impurity:
#                             largest_impurity = impurity
#                             best_criteria = {"feature_i": feature_i, "threshold": threshold}
#                             best_sets = {
#                                 "leftX": Xy1[:, :n_features],  # X of left subtree
#                                 "lefty": Xy1[:, n_features:],  # y of left subtree
#                                 "rightX": Xy2[:, :n_features],  # X of right subtree
#                                 "righty": Xy2[:, n_features:]  # y of right subtree
#                             }

#         if largest_impurity > self.min_impurity:
#             # Build subtrees for the right and left branches
#             true_branch = self._build_tree(best_sets["leftX"], best_sets["lefty"], current_depth + 1)
#             false_branch = self._build_tree(best_sets["rightX"], best_sets["righty"], current_depth + 1)
#             return DecisionNode(feature_i=best_criteria["feature_i"], threshold=best_criteria[
#                 "threshold"], true_branch=true_branch, false_branch=false_branch)

#         # We're at leaf => determine value
#         leaf_value = self._leaf_value_calculation(y)
#         return DecisionNode(value=leaf_value)

#     def predict_value(self, x, tree=None):
#         """ Do a recursive search down the tree and make a prediction of the data sample by the
#             value of the leaf that we end up at """

#         if tree is None:
#             tree = self.root

#         # If we have a value (i.e we're at a leaf) => return value as the prediction
#         if tree.value is not None:
#             return tree.value

#         # Choose the feature that we will test
#         feature_value = x[tree.feature_i]

#         # Determine if we will follow left or right branch
#         branch = tree.false_branch
#         if isinstance(feature_value, int) or isinstance(feature_value, float):
#             if feature_value >= tree.threshold:
#                 branch = tree.true_branch
#         elif feature_value == tree.threshold:
#             branch = tree.true_branch

#         # Test subtree
#         return self.predict_value(x, branch)

#     def predict(self, X):
#         """ Classify samples one by one and return the set of labels """
#         y_pred = []
#         for x in X:
#             y_pred.append(self.predict_value(x))
#         return y_pred

#     def print_tree(self, tree=None, indent=" "):
#         """ Recursively print the decision tree """
#         if not tree:
#             tree = self.root

#         # If we're at leaf => print the label
#         if tree.value is not None:
#             print(tree.value)
#         # Go deeper down the tree
#         else:
#             # Print test
#             print("%s:%s? " % (tree.feature_i, tree.threshold))
#             # Print the true scenario
#             print("%sT->" % (indent), end="")
#             self.print_tree(tree.true_branch, indent + indent)
#             # Print the false scenario
#             print("%sF->" % (indent), end="")
#             self.print_tree(tree.false_branch, indent + indent)

# class LeastSquaresLoss():
#     """Least squares loss"""

#     # g是loss一阶导
#     def gradient(self, actual, predicted):
#         return actual - predicted

#     # h是loss二阶导
#     def hess(self, actual, predicted):
#         return np.ones_like(actual)

# class XGBoostRegressionTree(DecisionTree):
#     """
#     Regression tree for XGBoost
#     - Reference -
#     http://xgboost.readthedocs.io/en/latest/model.html
#     """

#     def _split(self, y):
#         """ y contains y_true in left half of the middle column and
#         y_pred in the right half. Split and return the two matrices """
#         col = int(np.shape(y)[1]/2)
#         y, y_pred = y[:, :col], y[:, col:]
#         return y, y_pred

#     def _gain(self, y, y_pred):
#         nominator = np.power((self.loss.gradient(y, y_pred)).sum(), 2)
#         denominator = self.loss.hess(y, y_pred).sum()
#         return 0.5 * (nominator / denominator)

#     def _gain_by_taylor(self, y, y1, y2):
#         # Split
#         y, y_pred = self._split(y)
#         y1, y1_pred = self._split(y1)
#         y2, y2_pred = self._split(y2)

#         true_gain = self._gain(y1, y1_pred)
#         false_gain = self._gain(y2, y2_pred)
#         gain = self._gain(y, y_pred)
#         return true_gain + false_gain - gain

#     def _approximate_update(self, y):
#         # y split into y, y_pred
#         y, y_pred = self._split(y)
#         gradient = np.sum(self.loss.gradient(y, y_pred),axis=0)
#         hessian = np.sum(self.loss.hess(y, y_pred), axis=0)
#         update_approximation =  gradient / hessian
#         return update_approximation


#     def fit(self, X, y):
#         self._impurity_calculation = self._gain_by_taylor
#         self._leaf_value_calculation = self._approximate_update
#         super(XGBoostRegressionTree, self).fit(X, y)




# class XGBoost(object):
#     """The XGBoost classifier.

#     Reference: http://xgboost.readthedocs.io/en/latest/model.html

#     Parameters:
#     -----------
#     n_estimators: int
#         The number of classification trees that are used.
#     learning_rate: float
#         The step length that will be taken when following the negative gradient during
#         training.
#     min_samples_split: int
#         The minimum number of samples needed to make a split when building a tree.
#     min_impurity: float
#         The minimum impurity required to split the tree further.
#     max_depth: int
#         The maximum depth of a tree.
#     """

#     def __init__(self, n_estimators=200, learning_rate=0.01, min_samples_split=2,
#                  min_impurity=1e-7, max_depth=2):
#         self.n_estimators = n_estimators  # Number of trees
#         self.learning_rate = learning_rate  # Step size for weight update
#         self.min_samples_split = min_samples_split  # The minimum n of sampels to justify split
#         self.min_impurity = min_impurity  # Minimum variance reduction to continue
#         self.max_depth = max_depth  # Maximum depth for tree

#         self.bar = progressbar.ProgressBar(widgets=bar_widgets)

#         # Log loss for classification
#         self.loss = LeastSquaresLoss()

#         # Initialize regression trees
#         self.trees = []
#         for _ in range(n_estimators):
#             tree = XGBoostRegressionTree(
#                 min_samples_split=self.min_samples_split,
#                 min_impurity=min_impurity,
#                 max_depth=self.max_depth,
#                 loss=self.loss)

#             self.trees.append(tree)

#     def fit(self, X, y):
#         # y = to_categorical(y)
#         m = X.shape[0]
#         y = np.reshape(y, (m, -1))
#         y_pred = np.zeros(np.shape(y))
#         for i in self.bar(range(self.n_estimators)):
#             tree = self.trees[i]
#             y_and_pred = np.concatenate((y, y_pred), axis=1)
#             tree.fit(X, y_and_pred)
#             update_pred = tree.predict(X)
#             update_pred = np.reshape(update_pred, (m, -1))
#             y_pred += update_pred

#     def predict(self, X):
#         y_pred = None
#         m = X.shape[0]
#         # Make predictions
#         for tree in self.trees:
#             # Estimate gradient and update prediction
#             update_pred = tree.predict(X)
#             update_pred = np.reshape(update_pred, (m, -1))
#             if y_pred is None:
#                 y_pred = np.zeros_like(update_pred)
#             y_pred += update_pred

#         return y_pred


In [20]:
def main():
    print ("-- XGBoost --")
    mid_data = np.array(mid_data)
    X_train, X_test, y_train, y_test = train_test_split(mid_data, label, test_size=0.5)
    print(y_train)
#     model = XGBoost()
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)

#     y_pred_line = model.predict(X)
#     print(y_test[0:5])
#     # Color map
#     cmap = plt.get_cmap('viridis')

#     mse = mean_squared_error(y_test, y_pred)

#     print ("Mean Squared Error:", mse)


In [21]:
print ("-- XGBoost --")
xg_data = np.array(mid_data)
xg_label = np.array(label)
X_train, X_test, y_train, y_test = train_test_split(xg_data, xg_label, test_size=0.1)
xg_model = XGBoost()
xg_model.fit(X_train, y_train)




Training:   0% [                                               ] ETA:  --:--:--

-- XGBoost --


  return np.array([X_1, X_2])
Training: 100% [------------------------------------------------] Time: 0:10:55


In [22]:
y_tra = xg_model.predict(X_train)

y_pred = xg_model.predict(X_test)

mse_train = mean_squared_error(y_train, y_tra)
mse_test = mean_squared_error(y_test, y_pred)

acc_train = accuracy_score(y_train, pred2label(y_tra))
acc_test = accuracy_score(y_test, pred2label(y_pred))


print ("Train Mean Squared Error:{:.3f},acc:{:.3f}".format(float(mse_train),float(acc_train)))
print ("Test Mean Squared Error:{:.3f},acc:{:.3f}".format(float(mse_test),float(acc_test)))

Train Mean Squared Error:0.481,acc:1.000
Test Mean Squared Error:0.581,acc:0.971


In [23]:
y_pred = xg_model.predict(X_test)
print(y_test)
y_pd = []
for v in y_pred:
    y_pd.append(round(v[0]))
print(y_pd)

[1 1 1 1 2 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 2 2 3 1 1 1 1 2 1 1 2 2 2 1
 1 1 2 1 1 1 1 1 1 2 2 1 1 1 1 1 2 1 2 1 2 1 1 1 1 2 1 2 2 1 3]
[1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 3]


In [24]:
# 存入
def save(model):                      
    db = shelve.open('./model/xg_boost')       # 创建二进制文件  赋值句柄给db
    db['model'] = model         # 把实例化后的对象s赋值给db key为s
    db.close()          # 保险起见  关闭一下文件 shelve自带的方法  .close()


# 读取
def read_shelve():
    db = shelve.open('./model/xg_boost')           # 打开文件  赋值句柄给db
    model = db['model']        # 把db['s']的值取出来给st
    return model

In [25]:
save(xg_model.trees)

In [None]:
from base.GCN.LevelAnalysisModel import ATAnalysisModel
from base.GCN.XGBoostTree import XGBoostRegressionTree
from base.GCN.DecisionTree import DecisionTree
from base.GCN.DecisionNode import DecisionNode

In [None]:
predict_model = ATAnalysisModel()
predict_model.buildGraph(graph)
ans = []
for x in range(input_data.shape[0]):
    res = predict_model.predict(input_data[x])
    ans.append(res)
xg_label = np.array(label)
print(accuracy_score(np.array(label),np.array(ans)))
print(ans)

In [None]:
predict_model.build_data()
for item in predict_model.input_data[0]:
    print(item)

In [None]:
data_dic = {'京藏高速':22.1}
predict_model.update_data(data_dic)
for item in predict_model.input_data[0]:
    print(item)

In [None]:
# 数据组成
data_file = "./at_default_data.json"
data = []
with open(data_file,'r') as fd:
        content = json.load(fd)
        dic = {}
        for item in content:
            
            if item in name2id:
                dic[name2id[item]] = content[item]
        data.append(dic)

In [None]:
# 按id顺序组装数据
x_data = []  # data_num * node_num
NODE_NUM = len(g.nodes('road'))
ITEM_LEN = 6
for item in data:
    print(item)
    data_item = []
    for i in range(NODE_NUM):
        if i in item:
            if i in id2level:
                new_item = [x if x != -1 else level2speed[id2level[i]] for x in item[i]]
            else:
                new_item = [60 for x in item[i]]
            data_item.append(new_item)
        else:
            if i in id2level:
                default_item = [level2speed[id2level[i]] for _ in range(ITEM_LEN)]
            else:
                default_item = [60 for _ in range(ITEM_LEN)]
            data_item.append(default_item)
    x_data.append(data_item)

In [None]:
input_data = torch.FloatTensor(x_data)
data_np = input_data.numpy()

In [None]:
print(data_np)

In [None]:
array = np.array([1,2,3,4,5])
num = -1
fill_value = 0

def shift(arr, num, fill_value):
    result = np.empty_like(arr)
    if num > 0:
        result[:][:num] = fill_value
        result[:][num:] = arr[:][:-num]
    elif num < 0:
        result[:][num:] = fill_value
        result[:][:num] = arr[:][-num:]
    else:
        result = arr
    print(result)
shift(array, num, fill_value)

In [None]:

print(data_np[:,:,1:data_np.shape[2]].shape)
print(name2id)

In [None]:
data_dic = {'京藏高速':22.1}
np_array = input_data.numpy()
old = np_array[:,:,:np_array.shape[2] - 1]
data = []
NODE_NUM = len(g.nodes('road'))
for i in range(NODE_NUM):
    if i in id2level:
        new_item = level2speed[id2level[i]]
    else:
        new_item = 60
    data.append(new_item)
for key in data_dic:
    id = name2id[key]
    data[id] = data_dic[key]
new = np.expand_dims(np.expand_dims(np.array(data),axis=0),axis=-1)
res = np.concatenate([old, new], axis=2)
for i in res[0]:
    print(i)