In [3]:
import IPython
from pathlib import Path
import pandas as pd
import pickle

# Locate this notebook on disk. `__vsc_ipynb_file__` is set in the kernel
# namespace by the VS Code Jupyter extension; outside VS Code the key is
# missing and this lookup raises KeyError.
notebook_name = IPython.extract_module_locals()[1]["__vsc_ipynb_file__"]
print(notebook_name)
# The project root is two levels above the notebook file. The full path is
# kept on purpose: rebuilding it from its last few "/"-separated components
# produces a relative path on POSIX systems, which then resolves against the
# kernel's working directory instead of the real project folder.
root_folder = Path(notebook_name).parent.parent
print(root_folder)

data_path = root_folder / "data" / "asserters_filtered.pkl"
# Load the pickled pd.DataFrame. NOTE(review): unpickling can execute
# arbitrary code, so only load pickle files that come from a trusted source.
data = pd.read_pickle(data_path)

data.head(10)

d:\projects\aalto\hands_on_networks\patents\notebooks\convert_to_gephi_format.ipynb
d:\projects\aalto\hands_on_networks\patents


Unnamed: 0,case_node_id,Case Title,Civil Action #,Venue,Filing Date,included in random sample,DJ,patents,Alleged Infringer,paragraph_id,Patent Asserter,asserter_id,Asserter Category,i_aliases,pa_aliases
10,118532,Fulhorst v. Toyota Motor Corp.,2:00-cv-00071,E.D.Tex.,2000-04-27,0,0,4523178,[Toyota Motor Corporation],4025,George E Fulhorst,68571,9.0,[Toyota Motor Corporation],George E Fulhorst
24,118541,Pitney Bowes v. Stamps.Com Inc,5:00-cv-00262,E.D.Tex.,2000-09-18,0,0,,"[Stamps.com, Inc.]",4039,Pitney Bowes Inc,94702,8.0,"[Stamps.com, Inc.]",Pitney Bowes Inc
31,118547,"Texas Instruments, et al v. Linear Technology",2:01-cv-00003,E.D.Tex.,2001-01-06,0,0,4471292; 4794277; 4893091; 4906863; 5390069,[Linear Technology Corporation],4046,Texas Instruments Incorporated,108803,8.0,[Linear Technology Corporation],Texas Instruments Incorporated
32,118548,Texas Instruments v. Linear Technology,2:01-cv-00004,E.D.Tex.,2001-01-06,1,0,4884674; 5216613; 6039168,[Linear Technology Corporation],4047,Texas Instruments Incorporated,108803,8.0,[Linear Technology Corporation],Texas Instruments Incorporated
33,118549,"Natl Instruments v. MathWorks Inc, The",2:01-cv-00011,E.D.Tex.,2001-01-25,0,0,4901221; 4914568; 5291587; 5301336,"[The MathWorks, Inc.]",4048,National Instruments Corporation,88748,8.0,"[The MathWorks, Inc.]",National Instruments Corporation
34,118549,"Natl Instruments v. MathWorks Inc, The",2:01-cv-00011,E.D.Tex.,2001-01-25,0,0,4901221; 4914568; 5291587; 5301336,"[The MathWorks, Inc.]",4049,National Instruments Corporation,88748,8.0,"[The MathWorks, Inc.]",National Instruments Corporation
37,118552,Paragon Luggage Inc v. Tumi Inc,4:01-cv-00089,E.D.Tex.,2001-03-07,0,0,,"[Tumi, Inc.]",4052,Paragon Luggage Inc,92797,8.0,"[Tumi, Inc.]",Paragon Luggage Inc
38,118553,Paragon Luggage Inc v. Coach Inc,4:01-cv-00090,E.D.Tex.,2001-03-07,1,0,,"[Coach, Inc.]",4053,Paragon Luggage Inc,92797,8.0,"[Coach, Inc.]",Paragon Luggage Inc
40,118555,Paragon Luggage Inc v. Coach Inc,4:01-cv-00098,E.D.Tex.,2001-03-15,0,0,,"[Coach, Inc.]",4055,Paragon Luggage Inc,92797,8.0,"[Coach, Inc.]",Paragon Luggage Inc
48,118562,Texas Instruments v. Tessera Inc,2:01-cv-00163,E.D.Tex.,2001-08-03,0,0,,"[Tessera, Inc.]",4063,Texas Instruments Incorporated,108803,8.0,"[Tessera, Inc.]",Texas Instruments Incorporated


In [4]:

from collections import Counter

import pandas as pd
import networkx as nx

def filter_asserters(data, threshold):
    """Keep only frequently occurring asserters and alleged infringers.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a "pa_aliases" column (one asserter name per row) and an
        "i_aliases" column (an iterable of infringer names per row).
    threshold : int
        Minimum number of occurrences a name needs in order to be kept.

    Returns
    -------
    pd.DataFrame
        A new frame (the input is not modified) in which
        * rows whose asserter occurs fewer than ``threshold`` times are removed,
        * infringers occurring fewer than ``threshold`` times in the remaining
          rows are removed from each "i_aliases" list,
        * rows left without any infringer are removed.
        The original index labels and columns are preserved, also when no row
        survives.
    """
    data_ = data.copy()

    # Count every asserter once, then test membership against a set instead of
    # scanning a list with duplicates for every row.
    count_asserters = Counter(data_["pa_aliases"])
    frequent_asserters = {
        asserter for asserter, count in count_asserters.items() if count >= threshold
    }
    # `.copy()` detaches the selection from `data_`, so assigning the cleaned
    # "i_aliases" column below does not raise SettingWithCopyWarning.
    data_ = data_[data_["pa_aliases"].isin(frequent_asserters)].copy()

    # Infringers are counted only over the rows that survived the asserter filter.
    count_infringers = Counter(
        infringer for aliases in data_["i_aliases"] for infringer in aliases
    )
    frequent_infringers = {
        infringer for infringer, count in count_infringers.items() if count >= threshold
    }
    data_["i_aliases"] = data_["i_aliases"].map(
        lambda aliases: [i for i in aliases if i in frequent_infringers]
    )

    # Comparing the lengths always yields a boolean mask, even for an empty
    # frame, so the columns are kept when nothing is left.
    return data_[data_["i_aliases"].map(len) > 0]

def make_years_data(data, start_year, end_year):
    """Return the rows of ``data`` filed in the window [start_year, end_year).

    Parameters
    ----------
    data : pd.DataFrame
        Must contain an "i_date" column holding datetime-like values.
    start_year : int
        First year included in the window.
    end_year : int
        First year excluded from the window.

    Returns
    -------
    pd.DataFrame
        The matching rows with their original index labels and columns.
        Rows with a missing date are never selected.
    """
    # A single vectorised year extraction replaces two row-wise `.apply`
    # passes. The comparisons always produce a boolean mask, so an empty
    # selection keeps its columns.
    years = pd.to_datetime(data["i_date"]).dt.year
    in_window = (years >= start_year) & (years < end_year)
    return data[in_window]


def make_network(df, min_degree):
    """Build a weighted directed asserter -> infringer graph from ``df``.

    Every row contributes one edge from its "pa_aliases" value to each entry of
    its "i_aliases" list; repeated pairs increase the edge's 'weight' by one.
    Pairs in which either name is missing are skipped. Afterwards all nodes
    whose degree is below ``min_degree`` are removed in a single pass (the
    degrees are evaluated once, before any node is removed), and the node
    counts before and after are printed.
    """
    graph = nx.DiGraph()
    for _, record in df.iterrows():
        asserter = record['pa_aliases']
        for infringer in record['i_aliases']:
            if pd.isna(infringer) or pd.isna(asserter):
                continue
            # Register the infringer first, then the asserter.
            graph.add_nodes_from((infringer, asserter))
            if not graph.has_edge(asserter, infringer):
                graph.add_edge(asserter, infringer, weight=1)
            else:
                graph[asserter][infringer]['weight'] += 1

    # Prune weakly connected nodes.
    print(f"Number of nodes before removing nodes with degree less than {min_degree}: {graph.number_of_nodes()}")
    sparse_nodes = [name for name, deg in graph.degree() if deg < min_degree]
    graph.remove_nodes_from(sparse_nodes)
    print(f"Number of nodes after removing nodes with degree less than {min_degree}: {graph.number_of_nodes()}")
    return graph

In [5]:
import numpy as np

# Keep only asserters and alleged infringers that occur at least 10 times.
filtered_data = filter_asserters(data, threshold=10)

In [16]:
# now we want to download the data, filter it and make a network
# we want to make it to use in gephi, so we need to save the network in a file with .gexf extension


# Introduce dynamic attributes:
# we define a single network and add 'start' and 'end' attributes that set when a node/edge appears and vanishes.

from ast import main


def make_dynamic_network(df, start_year=2000, end_year=2022, window_years=3, min_degree=2):
    """Build one multigraph whose edges carry the time window they belong to.

    The cases in ``df`` are split into consecutive windows of ``window_years``
    years by filing date. For every window a network is built with
    ``make_network`` and each of its edges is added to the result with
    'weight', 'start' and 'end' attributes ('end' is exclusive).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain "Filing Date", "pa_aliases" and "i_aliases" columns.
        The frame is not modified.
    start_year : int, default 2000
        First year of the first window.
    end_year : int, default 2022
        Upper bound for the window boundaries.
    window_years : int, default 3
        Length of each window in years.
    min_degree : int, default 2
        Passed to ``make_network``; nodes with a smaller degree inside a
        window are dropped from that window.

    Returns
    -------
    nx.MultiDiGraph
        One edge per (asserter, infringer, window) combination.

    Notes
    -----
    Window boundaries are ``range(start_year, end_year, window_years)`` and only
    complete consecutive pairs are used, so a trailing partial window is
    skipped. With the defaults the last window is [2018, 2021) and filings from
    2021 are not included. NOTE(review): confirm this is intended.
    """
    # Derive the timestamp column on a new frame instead of writing it into the
    # caller's DataFrame (which may itself be a slice of another frame).
    dated = df.assign(i_date=pd.to_datetime(df["Filing Date"]))

    boundaries = range(start_year, end_year, window_years)
    main_graph = nx.MultiDiGraph()

    for window_start, window_end in zip(boundaries, boundaries[1:]):
        window_data = make_years_data(dated, window_start, window_end)
        window_graph = make_network(window_data, min_degree)
        for source, target, attrs in window_graph.edges(data=True):
            main_graph.add_edge(
                source,
                target,
                weight=attrs['weight'],
                start=window_start,
                end=window_end,
            )

    return main_graph

In [17]:
# Build the time-windowed network from the filtered cases.
G_ = make_dynamic_network(filtered_data)

# Store it in the project's data folder as a .gexf file so it can be opened in Gephi.
network_path = root_folder / "data" / "network_th_10_3years.gexf"
nx.write_gexf(G_, network_path)

Number of nodes before removing nodes with degree less than 2: 409
Number of nodes after removing nodes with degree less than 2: 250
Number of nodes before removing nodes with degree less than 2: 590
Number of nodes after removing nodes with degree less than 2: 390
Number of nodes before removing nodes with degree less than 2: 912
Number of nodes after removing nodes with degree less than 2: 645
Number of nodes before removing nodes with degree less than 2: 1273
Number of nodes after removing nodes with degree less than 2: 1061
Number of nodes before removing nodes with degree less than 2: 1556
Number of nodes after removing nodes with degree less than 2: 1373
Number of nodes before removing nodes with degree less than 2: 1408
Number of nodes after removing nodes with degree less than 2: 1190
Number of nodes before removing nodes with degree less than 2: 1079
Number of nodes after removing nodes with degree less than 2: 843


In [None]:
# make dynamic network with Stinger
