In [15]:
import pandas as pd
import numpy as np
import collections
import networkx as nx
import matplotlib.pyplot as plt
from seaborn import color_palette, set_style, palplot
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier

In [16]:
# Build the user -> community lookup from the "communities" file.
# Each non-blank line is parsed as "<community>:<id>,<id>,...".
community_dict = {}
communities = set()

# The context manager closes the file handle even if parsing raises.
with open("communities", "r") as community_file:
    for line in community_file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            continue
        community, ids = line.split(':')
        communities.add(community)
        for id_ in map(int, ids.strip('\n').split(',')):
            # A user id that appears under several communities keeps the last one read.
            community_dict[id_] = community

print("There are in total {} communities".format(len(communities)))

There are in total 28 communities


In [17]:
# Construct the list.features dictionary: list_id -> feature.
# list_id is the 0-based line number; each value is the raw line, so it still
# carries its trailing newline exactly as read from the file.
# The context manager makes sure the file handle is closed once the lines are read.
with open("list.features", "r") as feature_file:
    list_dict = dict(enumerate(feature_file))

In [18]:
# Construct the community graph (undirected) from "lists-to-community.txt".
# Each non-blank line holds two integer node ids separated by a single space;
# every such pair becomes one edge.
user_feature_graph = nx.Graph()

# The context manager closes the file handle even if a line fails to parse.
with open("lists-to-community.txt", "r") as edge_file:
    for line in edge_file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            continue
        node1, node2 = (int(part) for part in line.split(' '))
        user_feature_graph.add_edge(node1, node2)

# graph with community and its features

In [19]:
#This part is for Bag of Words
#Entry point: community_to_one_hot()
def get_one_hot_tokens(user_feature_graph, community_dict):
    """Assign a vector position to every graph node that is not a known user.

    Parameters
    ----------
    user_feature_graph : object exposing ``nodes()``
        Graph whose nodes are scanned.
    community_dict : dict
        Mapping whose keys (user ids) are excluded from the result.

    Returns
    -------
    dict
        ``{node: position}`` for each node of the graph that is not a key of
        ``community_dict``. Positions run from 0 to ``len(result) - 1`` and
        follow the iteration order of the intermediate set.
    """
    # Collect every node of the graph ...
    tokens = set()
    tokens.update(user_feature_graph.nodes())

    # ... then drop the ones that are users; what remains are the feature nodes.
    tokens.difference_update(community_dict.keys())

    # Lookup table: which vector index each remaining node increments.
    return {token: position for position, token in enumerate(tokens)}


def one_hot_encoding(user_feature_graph, community_dict, one_hot_index):
    """Build one count vector per user from the user's graph neighbours.

    Parameters
    ----------
    user_feature_graph : object exposing ``nodes()`` and ``neighbors(node)``
        Graph linking users to other nodes.
    community_dict : dict
        Mapping whose keys are the user ids to encode.
    one_hot_index : dict
        ``{node: position}`` lookup telling which vector slot a neighbour
        increments.

    Returns
    -------
    dict
        ``{user: numpy.ndarray}`` with one float vector of length
        ``len(one_hot_index)`` for each user that is a node of the graph.
        Users absent from the graph are left out; neighbours missing from
        ``one_hot_index`` are ignored.
    """
    width = len(one_hot_index)
    graph_nodes = user_feature_graph.nodes()
    user_to_one_hot = {}

    for user in community_dict:
        if user not in graph_nodes:
            continue  # user has no node in the graph, so nothing to encode

        vector = np.zeros(width)
        for neighbor in user_feature_graph.neighbors(user):
            if neighbor not in one_hot_index:
                continue  # neighbour is not one of the indexed tokens
            vector[one_hot_index[neighbor]] += 1
        user_to_one_hot[user] = vector

    return user_to_one_hot


def community_to_one_hot(user_feature_graph, community_dict):
    """Encode every user of ``community_dict`` found in the graph as a count vector.

    Builds the node -> position lookup with ``get_one_hot_tokens`` and passes
    it to ``one_hot_encoding``; the resulting ``{user: vector}`` dictionary is
    returned unchanged.
    """
    # Make the tags in the format of BOW
    return one_hot_encoding(
        user_feature_graph,
        community_dict,
        get_one_hot_tokens(user_feature_graph, community_dict),
    )


In [20]:
#MAIN

# Fixed seed so the train/test split and the random forest give the same
# scores on every Restart & Run All.
SEED = 42

community_one_hot = community_to_one_hot(user_feature_graph, community_dict)

#X= one-hot, Y = community_according_to_label

label = {"archery":0, "athletics":1, "badminton":2, "basketball":3, "beach-volleyball":4,
         "boxing":5, "canoeing":6, "cycling":7, "diving":8, "equestrianism":9,
         "fencing":10, "gymnastics":11, "handball":12, "hockey":13, "judo":14,
         "pentathlon":15, "rowing":16, "sailing":17, "shooting":18, "swimming":19,
         "swimming-sync":20, "tabletennis":21, "taekwondo":22, "tennis":23, "triathlon":24,
         "waterpolo":25, "weightlifting":26, "wrestling":27}

# community_one_hot only contains the users of community_dict that are nodes
# of the graph (in community_dict order), so iterating it directly selects the
# same rows as re-checking graph membership for every user.
X = np.asarray(list(community_one_hot.values()))
y = np.asarray([label[community_dict[user]] for user in community_one_hot])

#train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=SEED
)

#K-NN
knn = KNeighborsClassifier()

#random-forest
rdf = RandomForestClassifier(random_state=SEED)

# For each model print, in order: accuracy, macro-F1, micro-F1.
# Predictions are computed once and shared by both F1 scores.
for model in (knn, rdf):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(model.score(X_test, y_test))
    print(f1_score(y_test, predictions, average='macro'))
    print(f1_score(y_test, predictions, average='micro'))

#anything-else





0.5620915032679739
0.5593443857729572
0.5620915032679739
0.954248366013072
0.9440031678510941
0.954248366013072
