In [1]:
import networkx as nx
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
#reading the data
df = pd.read_csv('escorts.txt', delim_whitespace=True, index_col=False)
df

Unnamed: 0,n1,n2,e,timestamp
0,1,1,0,1030831200
1,1,2,1,1030831200
2,1,3,0,1030831200
3,2,1,0,1032645600
4,3,4,0,1034892000
...,...,...,...,...
50627,8491,6621,0,1223589600
50628,9136,6622,1,1223589600
50629,6325,6456,1,1223676000
50630,10105,6623,1,1223676000


In [3]:
#getting rid of the timestamp
df_no_time = df.drop('timestamp', axis=1)
df_no_time.to_csv('escorts.csv', index =False, header=False)



In [4]:
df_no_time.head(50)

Unnamed: 0,n1,n2,e
0,1,1,0
1,1,2,1
2,1,3,0
3,2,1,0
4,3,4,0
5,3,4,0
6,4,5,0
7,5,6,0
8,6,7,1
9,6,8,1


In [5]:
#finding unique number of buyers and escorts
buyers = df.n1.unique().tolist()
escorts = df.n2.unique().tolist()

print(f'numbers of buyers: {len(buyers)}')
print(f'numbers of escorts: {len(escorts)}')


numbers of buyers: 10106
numbers of escorts: 6624


In [6]:
#finding most popular buyers(biggest amount of payments) and escorts(how many times visited)
#we need to observe that the escorts that had less buyers and had rating of 1, will have better scores than the most visited ones
#so we need to normalize it

pop_buyers = df_no_time['n1'].value_counts().to_frame()

pop_escorts = df_no_time['n2'].value_counts().to_frame()

top10 = pop_escorts.head(10)
top10_list = top10['n2'].tolist() #FIX DEN HER

In [7]:
#average rating of top 10 most visited escorts (top 10 degrees)
avg = []
for escort in top10_list:
    most_pop_esc = df_no_time[df_no_time['n2'] == escort]
    ratings = most_pop_esc['e'].tolist()
    avg.append(sum(ratings)/len(ratings))

print(avg)

[0.825, 0.8, 0.0, 0.7373737373737373, 1.0, 1.0, 1.0, -1.0, 0.9090909090909091, -0.5]


In [8]:
G = nx.from_pandas_edgelist(df_no_time, 'n1', 'n2', 'e')

In [39]:
print(G.size()) #unique edge count
print(G.number_of_nodes())
print(df_no_time.shape[0]) #total number of rows (numebr of encounters)
print(pop_buyers['n1'].mean()) #average degree of buyers (how many escorts they rated)
print(pop_escorts['n2'].mean()) #average degreee of escorts (how many times they've been rated)
print(df_no_time['e'].mean()) #average rating (most of them satisfied ;))
print(nx.is_connected(G)) 

39044
16730
50632
5.0100930140510584
7.643719806763285
0.7426923684626323
False


In [46]:
#Largest Connected Component
Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
LCC = G.subgraph(Gcc[0])
print(LCC.size())
print(G.size() - LCC.size()) #how many unique edges are out of the LCC

38540
504


In [10]:
#creating different IDs for buyers(a) and escorts(b), so the nx function could read it
# df_no_time['n1'] = df_no_time['n1'].apply(lambda x: 'a' + x)
df_no_time['n1'] = 'a' + df_no_time['n1'].astype(str)

df_no_time['n2'] = 'b' + df_no_time['n2'].astype(str)
df_no_time.head()

Unnamed: 0,n1,n2,e
0,a1,b1,0
1,a1,b2,1
2,a1,b3,0
3,a2,b1,0
4,a3,b4,0


In [11]:
#projecting the bipartite graph using vectorized projection
G = nx.from_pandas_edgelist(df_no_time, 'n1', 'n2', 'e')
nx.is_bipartite(G)
proj_G = nx.bipartite.weighted_projected_graph(G, nodes = df_no_time.n2.unique()) #is it simple weights or vectorized?

In [12]:
#basic network analysis on projected network
# proj_G.size() #num of edges
print(proj_G.number_of_edges())
print(proj_G.number_of_nodes())

183465
6624
