## Notebook Plan

Read in Patent data by year and clean to necessary features. This is necessary to actually get the data to run any of the other functions

In [1]:
import requests
import json
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize

import matplotlib as mplib
import matplotlib.pyplot as plt
from datetime import datetime
import copy
import ast
from collections import Counter
import itertools
from ast import literal_eval

import time
from collections import Counter

%matplotlib inline

## Functions

These functions take the json-esque patent data and clean. Functions take number of cities by patents (assigned and invented), associated classifications, assignee types and any normalizations

In [2]:
#Output a directory for each patent's city and ipc section matched with its index number so that we can use the 
# index number to extract rows from dataframe for each city and ipc section
#
# The df to be passed in, is the patent data for a full year. 


def get_directory(df):
    for i in range(len(df)):
        ipc = literal_eval(df['IPCs'][i])
        for j in range(len(ipc)):  
            sec = ipc[j]['ipc_section']
            try:
                asgn = literal_eval(df['assignees'][i])
                for k in range(len(asgn)):
                    yield (i, ((asgn[k]['assignee_city'], asgn[k]['assignee_state']), sec))
            except SyntaxError:
                asgn = df.iloc[i]['assignees']
                for k in range(asgn.count('city')):
                    yield (i, ((asgn.split(':')[2+k*3].split(',')[0][3:-1], asgn[k]['assignee_state']), sec))

In [3]:
#Output a directory for each patent's city and ipc section matched with its index number so that we can use the 
# index number to extract rows from dataframe for each city and ipc section
#
# The df to be passed in, is the patent data for a full year.

def get_inv_directory(curr_df):
    df = curr_df.reset_index()
    for i in range(len(df)):
        ipc = literal_eval(df['IPCs'][i])
        for j in range(len(ipc)):  
            sec = ipc[j]['ipc_section']
            try:
                asgn = literal_eval(df['inventors'][i])
                for k in range(len(asgn)):
                    yield (i, ((asgn[k]['inventor_city'], asgn[k]['inventor_state']), sec))
            except SyntaxError:
                asgn = df.iloc[i]['inventors']
                for k in range(asgn.count('city')):
                    yield (i, ((asgn.split(':')[2+k*3].split(',')[0][3:-1],asgn[k]['inventor_state']) , sec))

In [4]:
#Output a list of tuples with unique values (city or ipc section) and corresponding number of patents
#directory: directory returned from above two functions
#num: 0-city, 1-ipc section
def count_unique(directory, num):
    counts = {}
    for tup in directory:
        counts[tup[1][num]] = counts.get(tup[1][num], 0) + 1
    return sorted(counts.items(), key=lambda x: x[1], reverse=True)

In [5]:
#Returns a dataframe containing only the information passed in. Hence the iloc
#df: compete dataframe with all patents
#directory: directory we get from above
#num: 0-city, 1-ipc section
#string: city or ipc section that we want to refine
def df_by_index(df, directory, num, string):
    return df.iloc[list(map(lambda x: x[0], filter(lambda x: x[1][num]==string, directory)))]

In [6]:
#count sections for the called city
def count_sections(curr_city, city_directory, section):
    temp_dir = [x for x in city_directory if x[1][0] == curr_city]
    temp = count_unique(temp_dir, 1)
    to_return = [x[1] for x in temp if x[0] == section]
    if len(to_return) == 1:
        return(to_return[0])
    else:
        return(0)

#To find IPC section distribution for inventor cities
def count_cities_by_section(df, city_directory):
    sections = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
    for i in sections:
        df[i] = df.city.apply(lambda x: count_sections(x, city_directory, i))
    df['sum'] = df.sum(axis=1)

In [7]:
# Get the assignee type for each patent
def get_assignee_type(x):
    to_get = literal_eval(x)
    assignee_type = to_get[0]['assignee_type']
    return(assignee_type)

In [8]:
#Get IPC counts for each patent
def count_IPCs(x, sections):
    to_return = literal_eval(x)
    for i in range(len(to_return)):
        sections.append(to_return[i]['ipc_section'])

In [9]:
#This function creates a dictionary that contains the top 10 Assignee cities
# Each key is a city. Within the city key is another dictionary containing IPC dist, location and inventor cities
def create_dict_of_cities(cities, dict_to_add, curr_df, assignee_dir, inventor_dir, year):
    start_total = time.time()
    for i in range(len(cities)):
        
        #dict_city is overarching dictionary for the current Assignee city
        #dict_inv is the dictionary for inventions from current Assignee city
        dict_city = {}
        dict_to_add[i] = dict_city
        city = cities[i][0]
        state = cities[i][1]
        dict_city['City'] = city
        dict_city['State'] = state
        dict_city['city_state'] = cities[i]
        dict_city['Year'] = year
        
        
        
        #get dataframe for current Assignee city
        df_temp = df_by_index(curr_df, assignee_dir, 0, cities[i])
        df_temp.reset_index(inplace = True)
        
        

        df_temp['assignee_type'] = df_temp['assignees'].apply(lambda x: get_assignee_type(x))
        for z in ['2','3','4','5','6','7','8','9']:
            key = "assignee_type_" + str(z)
            try:
                dict_city[key] = len(df_temp.loc[df_temp['assignee_type'] == z])
            except:
                dict_city[key] = 0
        
        #add total number of patents for a city
        num_patents = len(df_temp)
        dict_city["Patents"] = num_patents

        #Get directories and cities and sections for current city
        temp_directory = sorted(list(set(get_directory(df_temp))), key=lambda x: x[0])
        temp_sections = count_unique(temp_directory, 1)
        
        #Get inventor info for current city
        df_inv = df_by_index(curr_df, inventor_dir, 0, cities[i])
        num_inv_patents = len(df_inv)
        dict_city['inventor_patents'] = num_inv_patents
        
        #create patent_type_dict
        patent_types = ['utility', 'design', 'reissue', 'plant', 'statutory invention registration']
        values = df_temp.patent_type.value_counts()
        for j in patent_types:
            key = "pt_" + str(j)
            try:
                dict_city[key] = values[j]/num_patents
            except:
                dict_city[key] = 0
                
        
        section_names = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
        sections = []
        inv_sections = []
        placeholder = df_temp.IPCs.apply(lambda x: count_IPCs(x, sections))
        placeholder = df_inv.IPCs.apply(lambda x: count_IPCs(x, inv_sections))
        total = len(sections)
        total_inv = len(inv_sections)
        count = Counter(sections)
        inv_count = Counter(inv_sections)
        for k in section_names:
            key = "assignee_IPC_" + str(k)
            try:
                dict_city[key] = count[k]/total
            except:
                dict_city[key] = 0
            
            inv_key = 'inventor_IPC_' + str(k)
            try:
                dict_city[inv_key] = inv_count[k]/total_inv
            except:
                dict_city[inv_key] = 0
        
        
        #find number of inventor city mentions for curr_city
        try:
            dict_city['Inv_to_Assignee_ratio'] = num_inv_patents / num_patents
        except:
            dict_city['Inv_to_Assignee_ratio'] = 0
        
        #find total times patents have been cited in assignee and inventor city
        dict_city['assignee_pats_cited'] = int(df_temp.patent_num_foreign_citations.sum() + df_temp.patent_num_us_application_citations.sum())
        dict_city['inventor_pats_cited'] = int(df_inv.patent_num_foreign_citations.sum() + df_inv.patent_num_us_application_citations.sum())
        try:
            dict_city['assignee_pats_cited_ratio'] = dict_city['assignee_pats_cited'] / num_patents
        except:
            dict_city['assignee_pats_cited_ratio'] = 0
        try:
            dict_city['inventor_pats_cited_ratio'] = dict_city['inventor_pats_cited'] / num_inv_patents
        except:
            dict_city['inventor_pats_cited_ratio'] = 0

        if i % 200 == 0:
            print(time.time())
            print(str(i) + " of " + str(len(cities)))
            
    end_total = time.time()
    print("Total time : " + str(end_total-start_total))
        

## Data (and performance)

In [10]:
directory_dict = {}
top_cities_by_year = {}
top_sections_by_year = {}
inv_directory_dict = {}
inv_top_cities_by_year = {}

In [11]:
beginning_year = 1976
end_year = 2015

In [12]:
cities_df = []
for year in range(beginning_year, end_year):
    try:
        file_name = '/Volumes/Rohun Backups/Datasets/Capstone/Patents_by_year/patents_' + str(year) + '.csv'
        df_curr = pd.read_csv(file_name)
        print("Year: " + str(year) + " Number of patents: " + str(len(df_curr)))
        
        
        dict_dir = sorted(list(set(get_directory(df_curr))), key=lambda x: x[0])
        print('done with dict dir')
        dict_inv_dir = sorted(list(set(get_inv_directory(df_curr))), key=lambda x: x[0])
        print('done with inv dir')
        top_cities = count_unique(dict_dir, 0)
        print('done with top cities')
        top_inv_cities = count_unique(dict_inv_dir, 0)
        print("Dictionaries loaded")
        
        
        assignees_dict = {}
        top_cities = top_cities[:1000]
        top_cities_names = [x[0] for x in top_cities]
        create_dict_of_cities(top_cities_names, assignees_dict, df_curr, dict_dir, dict_inv_dir, year)

        for key in assignees_dict:
            temp = json_normalize(assignees_dict[key])
            cities_df.append(temp)
        print(len(cities_df))
        del df_curr
        del dict_dir
        del dict_inv_dir
        del top_cities
        del top_inv_cities
        del assignees_dict
        print("Done with year: " + str(year))
        print(" ")
        
        
    except:
        print("Year " + str(year) + " not loaded yet.")
        print(" ")
    
    

Year: 1976 Number of patents: 34796
done with dict dir
done with inv dir
done with top cities
Dictionaries loaded


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


1555876925.124196
0 of 1000
1555876943.0165231
200 of 1000
1555876957.040902
400 of 1000
1555876969.36683
600 of 1000
1555876982.013901
800 of 1000
Total time : 69.83654403686523
1000
Done with year: 1976
 
Year: 1977 Number of patents: 34141
done with dict dir
done with inv dir
done with top cities
Dictionaries loaded
1555877007.6553779
0 of 1000
1555877024.211686
200 of 1000
1555877036.186683
400 of 1000
1555877047.9153001
600 of 1000
1555877059.530944
800 of 1000
Total time : 64.15569710731506
2000
Done with year: 1977
 
Year: 1978 Number of patents: 33356
done with dict dir
done with inv dir
done with top cities
Dictionaries loaded
1555877083.936128
0 of 1000
1555877101.081428
200 of 1000
1555877113.6261618
400 of 1000
1555877125.9063098
600 of 1000
1555877138.064165
800 of 1000
Total time : 66.96303701400757
3000
Done with year: 1978
 
Year: 1979 Number of patents: 32939
done with dict dir
done with inv dir
done with top cities
Dictionaries loaded
1555877163.267225
0 of 1000
15558

In [13]:
patent_data_cleaned = pd.concat(cities_df, ignore_index = True)

In [14]:
patent_data_cleaned.to_csv('patents_data_cleaned_all_years_new.csv')