## Machine Learning Approach

In [163]:
import random
import numpy as np
#import igraph
import csv
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
import networkx as nx
from sklearn.tree import DecisionTreeRegressor
import math

nltk.download('punkt') # for tokenization
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

[nltk_data] Downloading package punkt to /Users/collinli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/collinli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [164]:
#calcualte inner product
def inner(X,Y):
    s = 0
    for i in range(0,len(X)):
        s = s + X[i]*Y[i]
    return s

In [165]:
# read in training set and node information
with open("amazon-meta_item_item_0.txt", "r") as f:
    reader = csv.reader(f)
    training_set  = list(reader)

training_set = [element[0].split(" ") for element in training_set]

with open("amazon-meta_item_info.csv", "r") as f:
    reader = csv.reader(f,delimiter='^')
    node_info  = list(reader)

node_info = node_info[1:]
IDs = [element[0] for element in node_info]

# compute TFIDF vector for titles of products
title = [element[1] for element in node_info]
vectorizer = TfidfVectorizer(stop_words="english")
# each row is a node in the order of node_info
features_TFIDF = vectorizer.fit_transform(title)

In [166]:
# read in testing set
with open("amazon-meta_filtered_ground_truth.txt", "r") as f:
    reader = csv.reader(f)
    testing_list  = list(reader)
    
#testing_list = [element[0].split(" ")[0] for element in testing_list]

In [168]:
# randomize training set and split in half 
# half for training and half for testing
to_keep = random.sample(range(len(training_set)), k=int(round(len(training_set))))
training_set_reduced_total = [training_set[i] for i in to_keep]
#create a local test set
training_set_reduced = training_set_reduced_total[:len(training_set_reduced_total)/2]
testing_set = training_set_reduced_total[len(training_set_reduced_total)/2:]
print(len(training_set_reduced))
print(len(testing_set))

262450
262450


In [169]:
#read in graph, each node represents an item
training_set_reduced_total = [training_set[i] for i in to_keep]
G = nx.Graph()
for t in training_set_reduced_total:
    G.add_node(t[0])
    G.add_node(t[1])
    if t[2] == '1':
        G.add_edge(t[0],t[1])

### Training Set

In [170]:
# title TFIDF cos similarity
title_similarity = []

# group is the same or not
group_similarity = []

# number of reviews
reviews_num_similarity = []

# same number of words in detailed category
category_similarity = []

# rating 
rating_similarity = []

counter = 0
for i in xrange(len(training_set_reduced)):
#for i in range(1):
    source = training_set_reduced[i][0]
    target = training_set_reduced[i][1]
    
    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    source_info = [element for element in node_info if element[0]==source][0]
    target_info = [element for element in node_info if element[0]==target][0]
    
	# calculate title TFIDF cos similarity
    source_title_tfidf = features_TFIDF[index_source].toarray()[0]
    target_title_tfidf = features_TFIDF[index_target].toarray()[0]
    
    cos = np.dot(source_title_tfidf,target_title_tfidf) / (np.linalg.norm(np.array(source_title_tfidf))\
       *np.linalg.norm(np.array(target_title_tfidf)))
    title_similarity.append(cos)
    
    #group similarity true or false
    group_source = source_info[2]
    group_target = target_info[2]
    
    if group_source == group_target:
        group_similarity.append(1)
    else:
        group_similarity.append(0)
        
    # number of reviews similarity
    review_source = source_info[4]
    review_target = target_info[4]
    
    reviews_num_similarity.append(np.absolute(int(review_source)-int(review_target)))
    
    # same number of words in detailed category
    category_source = source_info[6+int(review_source)*4:]
    category_target = target_info[6+int(review_target)*4:]
    
    category_similarity.append(len(set(category_source).intersection(set(category_target))))
    
    # rating similarity
    rating_source = source_info[5]
    rating_target = target_info[5]
    
    rating_similarity.append(np.absolute(float(rating_source)-float(rating_target)))
    
    counter += 1
    if counter % 10000 == True:
        print counter, "training examples processsed"

1 training examples processsed
10001 training examples processsed
20001 training examples processsed
30001 training examples processsed
40001 training examples processsed
50001 training examples processsed
60001 training examples processsed
70001 training examples processsed
80001 training examples processsed
90001 training examples processsed
100001 training examples processsed
110001 training examples processsed
120001 training examples processsed
130001 training examples processsed
140001 training examples processsed
150001 training examples processsed
160001 training examples processsed
170001 training examples processsed
180001 training examples processsed
190001 training examples processsed
200001 training examples processsed
210001 training examples processsed
220001 training examples processsed
230001 training examples processsed
240001 training examples processsed
250001 training examples processsed
260001 training examples processsed


In [171]:
# eliminate NaN because of bad tokenizations in titles
# some products have trivial title after tokenization
for i in range(len(title_similarity)):
    if math.isnan(title_similarity[i]):
        title_similarity[i] = 0.0

In [174]:
# sum of nodes' degrees 
node_degree = []

counter = 0
for i in xrange(len(training_set_reduced)):
    source = training_set_reduced[i][0]
    target = training_set_reduced[i][1]
    
    #add node degree
    source_degree = G.degree(source)
    target_degree = G.degree(target)
    if isinstance(source_degree,dict):
        source_degree = 0
    if isinstance(target_degree,dict):
        target_degree = 0
    node_degree.append(source_degree+target_degree)

In [175]:
# distance between two nodes in graph
dist_node = []

counter = 0
node_set = G.nodes()
for i in xrange(len(training_set_reduced)):
    source = training_set_reduced[i][0]
    target = training_set_reduced[i][1]
    if (source in node_set) and (target in node_set) and nx.has_path(G,source,target):
        dist_node.append(-nx.shortest_path_length(G,source,target))
    else:
        dist_node.append(0)

In [176]:
# number of common neighbors
common_neighbors = []

counter = 0
for i in xrange(len(training_set_reduced)):
#for i in range(1):
    source = training_set_reduced[i][0]
    target = training_set_reduced[i][1]
    
    #add node degree
    common_neighbors.append(len(list(nx.common_neighbors(G,source,target))))
    counter += 1
    if counter % 10000 == True:
        print counter, "training examples processsed"

1 training examples processsed
10001 training examples processsed
20001 training examples processsed
30001 training examples processsed
40001 training examples processsed
50001 training examples processsed
60001 training examples processsed
70001 training examples processsed
80001 training examples processsed
90001 training examples processsed
100001 training examples processsed
110001 training examples processsed
120001 training examples processsed
130001 training examples processsed
140001 training examples processsed
150001 training examples processsed
160001 training examples processsed
170001 training examples processsed
180001 training examples processsed
190001 training examples processsed
200001 training examples processsed
210001 training examples processsed
220001 training examples processsed
230001 training examples processsed
240001 training examples processsed
250001 training examples processsed
260001 training examples processsed


In [177]:
# convert list of lists into array
# documents as rows, unique words as columns (i.e., example as rows, features as columns)
training_features = np.array([title_similarity, group_similarity, category_similarity,reviews_num_similarity,rating_similarity,node_degree,dist_node,common_neighbors]).T

# scale
training_features = preprocessing.scale(training_features)

In [178]:
# convert labels into integers then into column array
labels = [int(element[2]) for element in training_set_reduced]
labels = list(labels)
labels_array = np.array(labels)

### Various Regressors

In [198]:
# LinearSVC balanced
classifier = svm.LinearSVC(class_weight='balanced')
# train
classifier.fit(training_features, labels_array)

LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [246]:
# LinearSVC 
classifier = svm.LinearSVC()
# train
classifier.fit(training_features, labels_array)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [135]:
# SVC
classifier = svm.SVC()
# train
classifier.fit(training_features, labels_array)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [57]:
# Decision Tree regressor
classifier = DecisionTreeRegressor(max_depth=2)
classifier.fit(training_features, labels_array)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [221]:
#linear regression
theta = np.linalg.lstsq(training_features, labels_array)[0]
print theta

[  3.09495048e-02   2.77555756e-17   3.72653250e-03  -9.03532003e-03
   4.50621094e-03   4.16426381e-02  -4.41159090e-01   1.06134825e-01]


### Testing Set

In [179]:
# for easier measurement of testing results
testing_set = training_set

In [180]:
# test
# we need to compute the features for the testing set

title_similarity_test = []
group_similarity_test = []
reviews_num_similarity_test = []
category_similarity_test = []
rating_similarity_test = []
   
counter = 0
for i in xrange(len(testing_set)):
#for i in range(50):
    source = testing_set[i][0]
    target = testing_set[i][1]
    
    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    source_info = [element for element in node_info if element[0]==source][0]
    target_info = [element for element in node_info if element[0]==target][0]
    
	# calculate title TFIDF cos similarity
    source_title_tfidf = features_TFIDF[index_source].toarray()[0]
    target_title_tfidf = features_TFIDF[index_target].toarray()[0]
    
    cos = np.dot(source_title_tfidf,target_title_tfidf) / (np.linalg.norm(np.array(source_title_tfidf))\
       *np.linalg.norm(np.array(target_title_tfidf)))
    title_similarity_test.append(cos)
    
    #group similarity true or false
    group_source = source_info[2]
    group_target = target_info[2]
    
    if group_source == group_target:
        group_similarity_test.append(1)
    else:
        group_similarity_test.append(0)
        
    # number of reviews similarity
    review_source = source_info[4]
    review_target = target_info[4]
    
    reviews_num_similarity_test.append(np.absolute(int(review_source)-int(review_target)))
    
    # same number of words in detailed category
    category_source = source_info[6+int(review_source)*4:]
    category_target = target_info[6+int(review_target)*4:]
    
    category_similarity_test.append(len(set(category_source).intersection(set(category_target))))
    
    # rating similarity
    rating_source = source_info[5]
    rating_target = target_info[5]
    
    rating_similarity_test.append(np.absolute(float(rating_source)-float(rating_target)))
    
    counter += 1
    if counter % 10000 == True:
        print counter, "testing examples processsed"

1 testing examples processsed
10001 testing examples processsed
20001 testing examples processsed
30001 testing examples processsed
40001 testing examples processsed
50001 testing examples processsed
60001 testing examples processsed
70001 testing examples processsed
80001 testing examples processsed
90001 testing examples processsed
100001 testing examples processsed
110001 testing examples processsed
120001 testing examples processsed
130001 testing examples processsed
140001 testing examples processsed
150001 testing examples processsed
160001 testing examples processsed
170001 testing examples processsed
180001 testing examples processsed
190001 testing examples processsed
200001 testing examples processsed
210001 testing examples processsed
220001 testing examples processsed
230001 testing examples processsed
240001 testing examples processsed
250001 testing examples processsed
260001 testing examples processsed
270001 testing examples processsed
280001 testing examples processsed

In [181]:
# eliminate NaN because of bad tokenizations in titles
# some products have trivial title after tokenization
for i in range(len(title_similarity_test)):
    if math.isnan(title_similarity_test[i]):
        title_similarity_test[i] = 0.0

In [182]:
# sum of nodes' degrees 
node_degree_test = []

counter = 0
for i in xrange(len(testing_set)):
#for i in range(1):
    source = testing_set[i][0]
    target = testing_set[i][1]
    
    #add node degree
    source_degree = G.degree(source)
    target_degree = G.degree(target)
    if isinstance(source_degree,dict):
        source_degree = 0
    if isinstance(target_degree,dict):
        target_degree = 0
    node_degree_test.append(source_degree+target_degree)

In [183]:
# distance between two nodes in graph
dist_node_test = []

counter = 0
node_set = G.nodes()
for i in xrange(len(testing_set)):
    source = testing_set[i][0]
    target = testing_set[i][1]
    if (source in node_set) and (target in node_set) and nx.has_path(G,source,target):
        dist_node_test.append(-nx.shortest_path_length(G,source,target))
    else:
        dist_node_test.append(0)

In [184]:
# number of common neighbors
common_neighbors_test = []

counter = 0
for i in xrange(len(testing_set)):
    source = testing_set[i][0]
    target = testing_set[i][1]
    
    #add node degree
    common_neighbors_test.append(len(list(nx.common_neighbors(G,source,target))))
    counter += 1
    if counter % 10000 == True:
        print counter, "training examples processsed"

1 training examples processsed
10001 training examples processsed
20001 training examples processsed
30001 training examples processsed
40001 training examples processsed
50001 training examples processsed
60001 training examples processsed
70001 training examples processsed
80001 training examples processsed
90001 training examples processsed
100001 training examples processsed
110001 training examples processsed
120001 training examples processsed
130001 training examples processsed
140001 training examples processsed
150001 training examples processsed
160001 training examples processsed
170001 training examples processsed
180001 training examples processsed
190001 training examples processsed
200001 training examples processsed
210001 training examples processsed
220001 training examples processsed
230001 training examples processsed
240001 training examples processsed
250001 training examples processsed
260001 training examples processsed
270001 training examples processsed
280001

In [185]:
# convert list of lists into array
# documents as rows, unique words as columns (i.e., example as rows, features as columns)
testing_features = np.array([title_similarity_test, group_similarity_test, category_similarity_test,reviews_num_similarity_test,rating_similarity_test,node_degree_test,dist_node_test,common_neighbors_test]).T

# scale
testing_features = preprocessing.scale(testing_features)

In [247]:
#prediction SVC, Linear SVC
predictions = list(classifier.predict(testing_features))

In [223]:
#prediction Linear regression
result = [int(t[2]) for t in testing_set]
predictions_linear = []
for i in xrange(len(testing_features)):
    r = inner(theta,testing_features[i])
    if r >= 0.46:
        predictions_linear.append(1)
    elif r < 0.46 and r >= -0.15:
        predictions_linear.append(0)
    else:
        predictions_linear.append(-1)
predictions = predictions_linear

### Measurements

In [248]:
# Measurement 1, classification accuracy
result = [int(t[2]) for t in testing_set]
compare = zip(result,predictions)
accuracy = 0
for c in compare:
    if c[0] == c[1]:
        accuracy += 1
accuracy = accuracy*1.0 / len(compare)
print(accuracy)

0.999176986093


In [249]:
# Measurement 2, label 1 classification accuracy
result = [int(t[2]) for t in testing_set]
compare = zip(result,predictions)
accuracy = 0
for c in compare:
    if c[0] == c[1] and c[0] == 1:
        accuracy += 1
print(accuracy*1.0 / (2*len(G.edges())))

0.826439578264


In [254]:
# strictly top5 or top10 according to score Measurement 3,4
# change values of K for training
K = 5
all_rank = zip(testing_set,predictions)
top5_list = []
l = len(G.nodes())
for i in range(l):
    partial_rank = all_rank[i*(l-1):i*(l-1)+l-2]
    partial_rank = sorted(partial_rank,key=lambda x:-x[1])
    partial_rank = partial_rank[:K]
    tmp = [partial_rank[0][0][0]]
    for j in range(K):
        tmp.append(partial_rank[j][0][1])
    top5_list.append(tmp)
top5_list = [t for t in top5_list if len(t) > 1]

In [256]:
# calculate recommendation accuracies
accuracy = []
counter = 0
accuracy_sum = 0
for i in range(len(top5_list)):
    #ground_truth_top5 = testing_list[i]
    #ground_truth_set = set(ground_truth_top5[1:])
    prediction_top5 = top5_list[i]
    prediction_set = set(prediction_top5[1:])
    
    ground_truth_top5 = [element for element in testing_list if element[0] == prediction_top5[0]][0]
    ground_truth_set = set(ground_truth_top5[1:])
    if len(ground_truth_set) == 0:
        accuracy.append(2.0)
    else:
        accuracy_t = len(ground_truth_set.intersection(prediction_set))*1.0\
                     / len(ground_truth_set)
        accuracy.append([ground_truth_top5[0],accuracy_t])
        accuracy_sum += accuracy_t
        counter += 1

In [257]:
accuracy_sum / counter

0.5931714719271625

In [92]:
# all items satisfied (not used)
all_rank = zip(testing_set,predictions)
top5_list = []
l = len(G.nodes())
for i in range(l):
    partial_rank = all_rank[i*(l-1):i*(l-1)+l-2]
    tmp = [partial_rank[0][0][0]]
    partial_rank = [p for p in partial_rank if p[1] == 1.0]
    for j in range(len(partial_rank)):
        tmp.append(partial_rank[j][0][1])
    top5_list.append(tmp)