# Amazon Copurchased

Inspired on https://towardsdatascience.com/random-forest-in-python-24d0893d51c0

## Load the libraries

In [1]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.tree import export_graphviz
import pydot

## Read graph

In [2]:
with open('data/links', 'rb') as inf:
    next(inf, '')   # skip a line
    G = nx.read_edgelist(inf, delimiter=',', nodetype=int, encoding="utf-8")

### Calculate nodes centrality measures

#### Degree

In [14]:
degrees = nx.degree(G)

#### Eigenvector centrality

In [None]:
ec = nx.eigenvector_centrality(G)

#### Closeness centrality

In [37]:
# Too slow!
#cc = nx.closeness_centrality(G)

#### Betweenness centrality

In [39]:
# Too slow!
#bc = nx.betweenness_centrality(G)

### Load node properties

In [17]:
df = pd.read_csv('data/nodes')
df['degree'] = None
df['eigenvector_centrality'] = None
for index, row in df.iterrows():
    df.loc[index, 'degree'] = degrees[row['id']]
    df.loc[index, 'eigenvector_centrality'] = ec[row['id']]

### Features summary

In [19]:
df.describe()

Unnamed: 0,id,price,degree
count,6501.0,6501.0,6501.0
mean,3474.740809,44.781375,26.844178
std,2027.154302,31.638002,31.876119
min,1.0,3.9,1.0
25%,1707.0,25.94,6.0
50%,3453.0,36.96,15.0
75%,5232.0,53.24,35.0
max,7040.0,439.9,361.0


## Random forest using degree as feature, price as target

### Preparing data

In [20]:
feature_list = list(df[['degree', 'eigenvector_centrality']].columns)
features = np.array(df[['degree', 'eigenvector_centrality']])
target = np.array(df['price'])

### Average price as baseline

In [21]:
average_target = np.average(target)
print average_target

44.7813751731


### Training data split

In [22]:
# Split the data into training and testing sets
train_features, test_features, train_target, test_target = train_test_split(features, target, test_size = 0.25, random_state = 42)

# Summary
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_target.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_target.shape)

('Training Features Shape:', (4875, 2))
('Training Labels Shape:', (4875,))
('Testing Features Shape:', (1626, 2))
('Testing Labels Shape:', (1626,))


### Train data

In [23]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_target);

### Prediction

In [28]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

('Mean absolute prediction error: ', 22.63, 'R$.')
('Mean absolute error using average: ', 20.37)


### Mean absolute error

In [31]:
# Calculate the absolute errors
errors = abs(predictions - test_target)
errors_baseline = abs(average_target - test_target)
# Print out the mean absolute error (mae)
print('Mean absolute prediction error: R$', round(np.mean(errors), 2))
print('Mean absolute error using average: R$', round(np.mean(errors_baseline), 2))

('Mean absolute prediction error: R$', 22.63)
('Mean absolute error using average: R$', 20.37)


### List a few target vs. predicted

In [34]:
data = {
    "degree": test_features.tolist(),
    "target": test_target,
    "prediction": predictions
}
predicted_df = pd.DataFrame(data = data)
predicted_df['error'] = errors
predicted_df['error_baseline'] = errors_baseline
predicted_df.sort_values('error', ascending = False)
predicted_df.head()

Unnamed: 0,degree,prediction,target,error,error_baseline
0,"[8, 7.08427944431e-05]",33.57577,18.99,14.58577,25.791375
1,"[1, 4.79558626756e-08]",99.394726,136.98,37.585274,92.198625
2,"[25, 0.00485999286852]",34.48372,37.6,3.11628,7.181375
3,"[14, 0.000357045303124]",35.91276,68.46,32.54724,23.678625
4,"[25, 0.00160978649602]",70.29684,17.07,53.22684,27.711375


### Visualize decision tree

In [26]:
# Pull out one tree from the forest
tree = rf.estimators_[0]
# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True)
# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')
# Write graph to a png file
graph.write_png('tree.png')

<img src="files/image.png">