In [1]:
import networkx as nx
from networkx.algorithms import bipartite
import random
from tqdm import tqdm

In [11]:
# Load the user–tag edge list and aggregate repeated (user, tag) pairs.
#
# Each data row is read as whitespace-separated columns:
#   column 0 -> user id, column 1 -> tag id, column 3 -> timestamp
# (column 2 is not used here). Rows with fewer than four columns still raise,
# so malformed input is not silently dropped.
#
# Results:
#   edges_dict : {(user, tag): {'weight': occurrences, 'timestamp': first seen}}
#   user_nodes : set of 'user<id>' labels that passed the id filter
#   tag_nodes  : set of 'tag<id>' labels attached to those users

# Only users whose numeric id is at most this value are kept.
MAX_USER_ID = 10000

edges_dict = {}
user_nodes = set()
tag_nodes = set()
with open('data/datasets/munmun_twitterex_ut/out.munmun_twitterex_ut', "r") as f:
    # The first line is always skipped (presumably a header — confirm against the file).
    next(f, None)
    # Stream the file instead of materialising every line in memory.
    for line in f:
        # split() without arguments tolerates tabs and repeated spaces.
        data = line.split()
        # Blank lines and additional '%' comment lines would otherwise crash int().
        if not data or data[0].startswith('%'):
            continue
        if int(data[0]) > MAX_USER_ID:
            continue

        user = 'user' + data[0]
        tag = 'tag' + data[1]
        user_nodes.add(user)
        tag_nodes.add(tag)

        edge = edges_dict.get((user, tag))
        if edge is None:
            # The timestamp of the first row seen for this pair is kept;
            # later rows for the same pair only increase the weight.
            edges_dict[(user, tag)] = {
                'weight': 1,
                'timestamp': float(data[3])}
        else:
            edge['weight'] += 1

In [17]:
# Number of distinct tag labels collected in `tag_nodes`.
len(tag_nodes)

116771

In [12]:
# Flatten the aggregated edges into (timestamp, (user, tag, weight)) tuples
# so they can be ordered by time in a later step.
edges = [
    (attrs['timestamp'], (src, dst, attrs['weight']))
    for (src, dst), attrs in edges_dict.items()
]

In [13]:
# Order edges chronologically. Sorting on the timestamp alone (not the whole
# tuple) keeps the original relative order of edges that share a timestamp.
edges = sorted(edges, key=lambda x: x[0])

# Earliest 70% of the edges build the feature graph; the remaining 30% are
# the positive labels, keyed by (user, tag).
split_idx = int(len(edges) * 0.7)
feature_edges = [e[1] for e in edges[:split_idx]]
label_edges = {(e[1][0], e[1][1]): 1 for e in edges[split_idx:]}

B = nx.Graph()
# Tag every endpoint with its bipartite side before adding the edges: code that
# selects nodes via d["bipartite"] raises KeyError if the attribute is missing.
# Only endpoints of feature edges are added, so no isolated nodes are created
# and the node/edge counts are the same as with add_weighted_edges_from alone.
B.add_nodes_from((user for user, _, _ in feature_edges), bipartite=0)
B.add_nodes_from((tag for _, tag, _ in feature_edges), bipartite=1)
B.add_weighted_edges_from(feature_edges)


In [14]:
# Size of graph B, displayed as (edge count, node count).
B.number_of_edges(), B.number_of_nodes()

(178788, 96375)

In [22]:
# Users that appear in at least one held-out (label) edge. Sorted so that the
# list order — and therefore nodes[0] — is the same on every kernel run;
# iterating a set of strings is not stable across interpreter sessions.
nodes = sorted({user for user, _tag in label_edges})

# Tag side of the bipartite graph. Membership in `tag_nodes` is used instead of
# d["bipartite"], which raises KeyError when nodes were created implicitly by
# add_weighted_edges_from and therefore carry no attributes.
top_nodes = {n for n in B if n in tag_nodes}

# Projection of B onto the tag nodes.
G = bipartite.projected_graph(B, top_nodes)

In [30]:
# Drop every user that never appears in a label edge (i.e. is not in `nodes`).
# Mutates B in place.
users_to_drop = set(user_nodes).difference(nodes)
B.remove_nodes_from(users_to_drop)

In [31]:
# Current node count of graph B.
B.number_of_nodes()

619066

In [32]:
# Recompute the tag side from the current contents of B. Membership in
# `tag_nodes` is used instead of d["bipartite"], which raises KeyError when
# nodes were created implicitly by add_weighted_edges_from and therefore carry
# no attributes.
top_nodes = {n for n in B if n in tag_nodes}

# Projection of B onto the tag nodes; the cell displays its node count.
G = bipartite.projected_graph(B, top_nodes)
G.number_of_nodes()

530418

In [28]:
# Candidate nodes for the first user in `nodes`: everything reachable through
# a neighbour-of-a-neighbour, excluding the user's own direct neighbours.

# Direct neighbours of the chosen user.
neighbors = set(B.neighbors(nodes[0]))

# Every node adjacent to at least one of those neighbours.
user_neighbors = set()
for tag in neighbors:
    user_neighbors |= set(B.neighbors(tag))

# Union of their neighbourhoods, minus what the chosen user already touches.
possible = set()
for user in user_neighbors:
    possible.update(B.neighbors(user))
possible -= neighbors

In [29]:
# Number of candidate nodes gathered in `possible`.
len(possible)

35377

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split

from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks

In [22]:
# Read the feature table from CSV and use its 'Unnamed: 0' column as the index.
features = (
    pd.read_csv('data/clean_datasets/munmun_twitterex_ut_3.csv')
    .set_index('Unnamed: 0')
)

In [25]:
# Every column except 'label' is a model input.
feature_names = features.columns.tolist()
feature_names.remove('label')

# 70/30 random split with a fixed seed; X is passed as a plain array,
# y as the 'label' column of the frame.
X_train, X_test, y_train, y_test = train_test_split(
    features[feature_names].values,
    features['label'],
    test_size=0.3,
    random_state=0,
)

In [27]:
# Resample the training split with TomekLinks, configured with
# sampling_strategy='majority'; the test split is left untouched.
undersample = TomekLinks(sampling_strategy='majority')
X_train_us, y_train_us = undersample.fit_resample(X=X_train, y=y_train)