In [1]:
import networkx as nx
import pandas as pd 
import matplotlib.pyplot as plt

In [22]:
df = pd.read_csv('escorts.txt', delim_whitespace=True, index_col=False)
df

Unnamed: 0,n1,n2,e,timestamp
0,1,1,0,1030831200
1,1,2,1,1030831200
2,1,3,0,1030831200
3,2,1,0,1032645600
4,3,4,0,1034892000
...,...,...,...,...
50627,8491,6621,0,1223589600
50628,9136,6622,1,1223589600
50629,6325,6456,1,1223676000
50630,10105,6623,1,1223676000


In [23]:
df_no_time = df.drop('timestamp', axis=1)
df_no_time.to_csv('escorts.csv', index =False, header=False)



In [24]:
df_no_time.head(50)

Unnamed: 0,n1,n2,e
0,1,1,0
1,1,2,1
2,1,3,0
3,2,1,0
4,3,4,0
5,3,4,0
6,4,5,0
7,5,6,0
8,6,7,1
9,6,8,1


In [43]:
#finding unique number of buyers and escorts
buyers = df.n1.unique().tolist()
escorts = df.n2.unique().tolist()

print(f'numbers of buyers: {len(buyers)}')
print(f'numbers of escorts: {len(escorts)}')


numbers of buyers: 10106
numbers of escorts: 6624


In [28]:
#finding most popular buyers and escorts
#we need to observe that the escorts that had less buyers and had rating of 1, will have better scores than the most visited ones
#so we need to normalize it

pop_buyers = df_no_time['n1'].value_counts().to_frame()

pop_escorts = df_no_time['n2'].value_counts().to_frame()

top10 = pop_escorts.head(10)
top10_list = top10['n2'].tolist() #FIX DEN HER

Unnamed: 0,n1
a2283,134
a244,132
a4205,128
a617,115
a1443,114
...,...
a5051,1
a5048,1
a5046,1
a5045,1


In [7]:
avg = []
for escort in top10_list:
    most_pop_esc = df_no_time[df_no_time['n2'] == escort]
    ratings = most_pop_esc['e'].tolist()
    avg.append(sum(ratings)/len(ratings))

print(avg)

[0.825, 0.8, 0.0, 0.7373737373737373, 1.0, 1.0, 1.0, -1.0, 0.9090909090909091, -0.5]


In [8]:
G = nx.from_pandas_edgelist(df_no_time, 'n1', 'n2', 'e')

In [9]:
nx.is_bipartite(G)

False

In [25]:
#creating different IDs for buyers(a) and escorts(b)
# df_no_time['n1'] = df_no_time['n1'].apply(lambda x: 'a' + x)
df_no_time['n1'] = 'a' + df_no_time['n1'].astype(str)

df_no_time['n2'] = 'b' + df_no_time['n2'].astype(str)
df_no_time.head()

Unnamed: 0,n1,n2,e
0,a1,b1,0
1,a1,b2,1
2,a1,b3,0
3,a2,b1,0
4,a3,b4,0


In [40]:
#projecting the bipartite graph using vectorized projection
G = nx.from_pandas_edgelist(df_no_time, 'n1', 'n2', 'e')
nx.is_bipartite(G)
proj_G = nx.bipartite.weighted_projected_graph(G, nodes = df_no_time.n2.unique())

In [42]:
#basic network analysis on projected network
# proj_G.size() #num of edges
print(proj_G.number_of_edges())
print(proj_G.number_of_nodes())

183465
6624
