In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

In [2]:
# Load the tab-separated signed edge list (columns: FromNodeId, ToNodeId, Sign).
df = pd.read_csv('epinions.txt', delimiter='\t')

In [3]:

# How many positive (1) versus negative (-1) edges does the raw file contain?
df.loc[:, 'Sign'].value_counts()

 1    717667
-1    123705
Name: Sign, dtype: int64

In [4]:
# Show the column names of the loaded edge list.
df.columns

Index(['FromNodeId', 'ToNodeId', 'Sign'], dtype='object')

In [5]:
# Empty directed graph; it is filled with one FromNodeId -> ToNodeId edge per row below.
G = nx.DiGraph()

In [6]:
# Add every row of the edge list as a directed edge u -> v carrying its sign.
# Zipping the three columns avoids DataFrame.iterrows(), which materialises a
# Series for every row and is very slow on an edge list of this size.
G.add_edges_from(
    (u, v, {'sign': s})
    for u, v, s in zip(df['FromNodeId'], df['ToNodeId'], df['Sign'])
)

In [14]:
# Per-node counts of incoming edges, split by sign.
# Any sign other than 1 is counted as negative.
in_pos = {}
in_neg = {}
for node in G.nodes():
    incoming_signs = [attrs['sign'] for _, _, attrs in G.in_edges(node, data=True)]
    positive_count = sum(1 for sign in incoming_signs if sign == 1)
    in_pos[node] = positive_count
    in_neg[node] = len(incoming_signs) - positive_count

In [15]:
# Store the positive in-degree on each node under the key 'in_pos'.
nx.set_node_attributes(G, values=in_pos, name='in_pos')

In [16]:
# Store the negative in-degree on each node under the key 'in_neg'.
nx.set_node_attributes(G, values=in_neg, name='in_neg')

In [17]:
# Per-node counts of outgoing edges, split by sign.
# Any sign other than 1 is counted as negative.
out_pos = {}
out_neg = {}
for node in G.nodes():
    # G[node] maps each successor of `node` to that edge's attribute dict.
    outgoing_signs = [attrs['sign'] for attrs in G[node].values()]
    positive_count = sum(1 for sign in outgoing_signs if sign == 1)
    out_pos[node] = positive_count
    out_neg[node] = len(outgoing_signs) - positive_count

In [18]:
# Store the signed out-degrees on each node under 'out_pos' / 'out_neg'.
for attr_name, per_node in (('out_pos', out_pos), ('out_neg', out_neg)):
    nx.set_node_attributes(G, values=per_node, name=attr_name)

In [19]:
# Embeddedness of an edge (u, v): the number of nodes adjacent to BOTH
# endpoints, ignoring edge direction (the usual definition in signed-link
# prediction).
#
# The previous version intersected (preds(u) | preds(v)) with
# (succs(u) | succs(v)). That counts nodes attached to only one endpoint
# (e.g. w with w->u and u->w) and misses shared neighbours that are linked
# in a single direction (e.g. w->u and w->v).

# Undirected neighbourhood of every node, built once instead of once per edge.
undirected_neighbors = {
    node: set(G.predecessors(node)) | set(G.successors(node))
    for node in G.nodes()
}

embed = {}
for u, v in G.edges():
    shared = undirected_neighbors[u] & undirected_neighbors[v]
    # A self-loop would otherwise let an endpoint count as its own shared neighbour.
    shared -= {u, v}
    embed[(u, v)] = len(shared)

In [20]:
# Store the per-edge embeddedness on the graph under the key "embed".
nx.set_edge_attributes(G, values=embed, name="embed")

In [21]:
# Per-edge degree features for every edge (u, v):
#   dplus_in_v / dminus_in_v   - positive / negative in-degree of the target v
#   dplus_out_u / dminus_out_u - positive / negative out-degree of the source u
#   total_out_u / total_in_v   - sums of the two pairs above
dplus_in_v, dminus_in_v = {}, {}
dplus_out_u, dminus_out_u = {}, {}
total_out_u, total_in_v = {}, {}

# Node -> attribute-dict lookup (holds in_pos / in_neg / out_pos / out_neg).
X = dict(G.nodes(data=True))

for source, target in list(G.edges()):
    edge = (source, target)
    source_attrs = X[source]
    target_attrs = X[target]

    dplus_in_v[edge] = target_attrs['in_pos']
    dminus_in_v[edge] = target_attrs['in_neg']
    dplus_out_u[edge] = source_attrs['out_pos']
    dminus_out_u[edge] = source_attrs['out_neg']
    total_out_u[edge] = dplus_out_u[edge] + dminus_out_u[edge]
    total_in_v[edge] = dplus_in_v[edge] + dminus_in_v[edge]

# Attach every feature to the graph as an edge attribute of the same name.
for attr_name, per_edge in (
    ("dplus_in_v", dplus_in_v),
    ("dminus_in_v", dminus_in_v),
    ("dplus_out_u", dplus_out_u),
    ("dminus_out_u", dminus_out_u),
    ("total_out_u", total_out_u),
    ("total_in_v", total_in_v),
):
    nx.set_edge_attributes(G, per_edge, attr_name)

In [22]:
# Empty feature table; note that this rebinds `df`, replacing the raw edge list.
feature_columns = [
    'edge',
    'dplus_in_v',
    'dminus_in_v',
    'dplus_out_u',
    'dminus_out_u',
    'total_out_u',
    'total_in_v',
    'embed',
    'sign',
]
df = pd.DataFrame(columns=feature_columns)

In [23]:
# Read each edge attribute back from the graph as an {(u, v): value} dict.
(
    dplus_in_v,
    dminus_in_v,
    dplus_out_u,
    dminus_out_u,
    embed,
    total_out_u,
    total_in_v,
    signCol,
) = (
    nx.get_edge_attributes(G, attr_name)
    for attr_name in (
        'dplus_in_v',
        'dminus_in_v',
        'dplus_out_u',
        'dminus_out_u',
        'embed',
        'total_out_u',
        'total_in_v',
        'sign',
    )
)

In [24]:
# Fill the feature table column by column.
# NOTE(review): this assumes every attribute dict lists the edges in the same
# order as dplus_in_v, so that rows line up - confirm if any attribute is sparse.
df['edge'] = dplus_in_v.keys()

per_edge_columns = {
    'dplus_in_v': dplus_in_v,
    'dminus_in_v': dminus_in_v,
    'dplus_out_u': dplus_out_u,
    'dminus_out_u': dminus_out_u,
    'embed': embed,
    'total_out_u': total_out_u,
    'total_in_v': total_in_v,
    'sign': signCol,
}
for column_name, per_edge in per_edge_columns.items():
    df[column_name] = per_edge.values()

# Sanity check: the class balance should match the raw edge list.
df['sign'].value_counts()

 1    717667
-1    123705
Name: sign, dtype: int64

In [25]:
from sklearn.model_selection import train_test_split
# Features: the six degree columns. 'edge' (identifier), 'sign' (target) and
# 'embed' are left out of the feature matrix.
X = df.drop(columns=['edge', 'sign', 'embed'])
y = df['sign']

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

In [27]:
# Logistic-regression baseline: fit on the training split, report hold-out accuracy.
lr = LogisticRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

0.9017679393849354

In [28]:
from sklearn.svm import SVC


In [None]:
# Support-vector classifier with default settings, scored on the hold-out split.
# NOTE(review): scikit-learn documents SVC fit time as scaling at least
# quadratically with the number of samples; on the full training split this
# cell may not finish in reasonable time - confirm before running.
svm = SVC().fit(X_train, y_train)
svm.score(X_test, y_test)

In [None]:
# Multi-layer perceptron, scored on the hold-out split.
# Fixes the misspelled class name (`MLPClassifer`), which raised NameError; the
# imported name is `MLPClassifier`. The seed makes the stochastic training
# reproducible, matching the random_state used for the train/test split.
ann = MLPClassifier(random_state=0)
ann.fit(X_train, y_train)
ann.score(X_test, y_test)

In [14]:

from sklearn.model_selection import KFold

# 10-fold cross-validation of logistic regression on the full feature matrix.
# Requires X and y from the feature-split cell to exist in the kernel.
kf = KFold(n_splits=10)

fold_scores = []
for train_index, test_index in kf.split(X, y):
    # Fold-local names: the loop must not overwrite the hold-out split
    # (X_train / X_test / y_train / y_test) or the fitted `lr` model that
    # other cells rely on.
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    fold_model = LogisticRegression()
    fold_model.fit(X_fold_train, y_fold_train)

    fold_score = fold_model.score(X_fold_test, y_fold_test)
    fold_scores.append(fold_score)
    print(fold_score)

# Mean accuracy across the folds.
np.mean(fold_scores)

NameError: name 'X' is not defined