## Notebook Plan

Adding a couple of features to cleaned data

1. Read in the previously cleaned data and add the following features:
    1. Inventor Rank - The rank of city based on the total patents invented in the city for said year
    2. Normalized assigned/invented patent citations


In [1]:
import pandas as pd
import numpy as np

## Go to below markdown

In [2]:
df = pd.read_csv('patents_data_cleaned_all_years_new.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,City,Inv_to_Assignee_ratio,Patents,State,Year,assignee_IPC_A,assignee_IPC_B,assignee_IPC_C,assignee_IPC_D,...,inventor_IPC_G,inventor_IPC_H,inventor_patents,inventor_pats_cited,inventor_pats_cited_ratio,pt_design,pt_plant,pt_reissue,pt_statutory invention registration,pt_utility
0,0,New York,0.055923,4077,NY,1976,0.121036,0.157771,0.280063,0.042229,...,0.111111,0.079365,228,64,0.280702,0.035075,0.0,0.008585,0.0,0.955359
1,1,Washington,0.02184,1511,DC,1976,0.055236,0.136544,0.160407,0.012815,...,0.142857,0.160714,33,9,0.272727,0.000662,0.0,0.0,0.0,0.998676
2,2,Pittsburgh,0.296154,1300,PA,1976,0.024614,0.216699,0.28668,0.015444,...,0.139219,0.156197,385,141,0.366234,0.005385,0.0,0.005385,0.0,0.987692
3,3,Chicago,0.305112,1252,IL,1976,0.136054,0.242282,0.179487,0.048142,...,0.122625,0.136442,382,122,0.319372,0.0623,0.0,0.015176,0.0,0.922524
4,4,Stamford,0.118511,1021,CT,1976,0.077848,0.200633,0.191139,0.041772,...,0.232044,0.099448,121,46,0.380165,0.045054,0.0,0.006856,0.0,0.94809


In [4]:
df['Rank'] = df['Unnamed: 0'].apply(lambda x: x % 1000)

In [16]:
def get_assignee_annual_sum(df_curr):
    patent_total = df_curr.Patents.sum()
    return(patent_total)
def get_inventor_annual_sum(df_curr):
    patent_total = df_curr.inventor_patents.sum()
    return(patent_total)

In [23]:
df['assignee_patent_annual_sum'] = 0
df['inventor_patent_annual_sum'] = 0

In [24]:
assignee_totals = []
inventor_totals = []
for year in range(1976, 2015):
    assignee_totals.append(get_assignee_annual_sum(df.loc[df.Year == year]))
    inventor_totals.append(get_inventor_annual_sum(df.loc[df.Year == year]))

    
df.assignee_patent_annual_sum = df.Year.apply(lambda x: assignee_totals[(x-1976)])
df.inventor_patent_annual_sum = df.Year.apply(lambda x: inventor_totals[(x-1976)])

In [26]:
df['inventor_patents_perc'] = df.inventor_patents / df.inventor_patent_annual_sum
df['assignee_patents_perc'] = df.Patents / df.assignee_patent_annual_sum

In [27]:
df.head()

Unnamed: 0.1,Unnamed: 0,City,Inv_to_Assignee_ratio,Patents,State,Year,assignee_IPC_A,assignee_IPC_B,assignee_IPC_C,assignee_IPC_D,...,pt_design,pt_plant,pt_reissue,pt_statutory invention registration,pt_utility,Rank,assignee_patent_annual_sum,inventor_patent_annual_sum,inventor_patents_perc,assignee_patents_perc
0,0,New York,0.055923,4077,NY,1976,0.121036,0.157771,0.280063,0.042229,...,0.035075,0.0,0.008585,0.0,0.955359,0,36052,26340,0.008656,0.113087
1,1,Washington,0.02184,1511,DC,1976,0.055236,0.136544,0.160407,0.012815,...,0.000662,0.0,0.0,0.0,0.998676,1,36052,26340,0.001253,0.041912
2,2,Pittsburgh,0.296154,1300,PA,1976,0.024614,0.216699,0.28668,0.015444,...,0.005385,0.0,0.005385,0.0,0.987692,2,36052,26340,0.014617,0.036059
3,3,Chicago,0.305112,1252,IL,1976,0.136054,0.242282,0.179487,0.048142,...,0.0623,0.0,0.015176,0.0,0.922524,3,36052,26340,0.014503,0.034728
4,4,Stamford,0.118511,1021,CT,1976,0.077848,0.200633,0.191139,0.041772,...,0.045054,0.0,0.006856,0.0,0.94809,4,36052,26340,0.004594,0.02832


In [29]:
df['inventor_rank'] = 0

In [46]:
final_rankings = []
for year in range(1976, 2015):
    rank_inventors = df.inventor_patents.loc[df.Year == year]
    rank_inventors = np.argsort(rank_inventors)
    rank_inventors = 1000 - rank_inventors
    final_rankings.append(rank_inventors)
final_rankings = [y for x in final_rankings for y in x]
len(final_rankings)

39000

In [None]:
df['inventor_rank'] = final_rankings

In [3]:
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,City,Inv_to_Assignee_ratio,Patents,State,Year,assignee_IPC_A,assignee_IPC_B,assignee_IPC_C,...,pt_plant,pt_reissue,pt_statutory invention registration,pt_utility,Rank,assignee_patent_annual_sum,inventor_patent_annual_sum,inventor_patents_perc,assignee_patents_perc,inventor_rank
0,0,0,New York,0.055923,4077,NY,1976,0.121036,0.157771,0.280063,...,0.0,0.008585,0.0,0.955359,0,36052,26340,0.008656,0.113087,11
1,1,1,Washington,0.02184,1511,DC,1976,0.055236,0.136544,0.160407,...,0.0,0.0,0.0,0.998676,1,36052,26340,0.001253,0.041912,217
2,2,2,Pittsburgh,0.296154,1300,PA,1976,0.024614,0.216699,0.28668,...,0.0,0.005385,0.0,0.987692,2,36052,26340,0.014617,0.036059,1
3,3,3,Chicago,0.305112,1252,IL,1976,0.136054,0.242282,0.179487,...,0.0,0.015176,0.0,0.922524,3,36052,26340,0.014503,0.034728,2
4,4,4,Stamford,0.118511,1021,CT,1976,0.077848,0.200633,0.191139,...,0.0,0.006856,0.0,0.94809,4,36052,26340,0.004594,0.02832,34


In [4]:
#Create a new column for normalized citation numbers
# assignee_pats_cited and inventor_pats_cited

In [7]:
final_assignee_citations = []
final_inventor_citations = []
for year in range(1976, 2015):
    assignee_citations = df.assignee_pats_cited.loc[df.Year == year]
    inventor_citations = df.inventor_pats_cited.loc[df.Year == year]
    sum_assignee = assignee_citations.sum()
    sum_inventor = inventor_citations.sum()
    assignee_citations = [i/sum_assignee for i in assignee_citations]
    inventor_citations = [i/sum_inventor for i in inventor_citations]
    final_assignee_citations.append(assignee_citations)
    final_inventor_citations.append(inventor_citations)

In [14]:
final_assignee_citations = [y for x in final_assignee_citations for y in x]
final_inventor_citations = [y for x in final_inventor_citations for y in x]

In [16]:
df['assignee_pats_citations_normalized'] = final_assignee_citations
df['inventor_pats_citations_normalized'] = final_inventor_citations

In [17]:
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,City,Inv_to_Assignee_ratio,Patents,State,Year,assignee_IPC_A,assignee_IPC_B,assignee_IPC_C,...,pt_statutory invention registration,pt_utility,Rank,assignee_patent_annual_sum,inventor_patent_annual_sum,inventor_patents_perc,assignee_patents_perc,inventor_rank,assignee_pats_citations_normalized,inventor_pats_citations_normalized
0,0,0,New York,0.055923,4077,NY,1976,0.121036,0.157771,0.280063,...,0.0,0.955359,0,36052,26340,0.008656,0.113087,11,0.110902,0.006969
1,1,1,Washington,0.02184,1511,DC,1976,0.055236,0.136544,0.160407,...,0.0,0.998676,1,36052,26340,0.001253,0.041912,217,0.025431,0.00098
2,2,2,Pittsburgh,0.296154,1300,PA,1976,0.024614,0.216699,0.28668,...,0.0,0.987692,2,36052,26340,0.014617,0.036059,1,0.039576,0.015354
3,3,3,Chicago,0.305112,1252,IL,1976,0.136054,0.242282,0.179487,...,0.0,0.922524,3,36052,26340,0.014503,0.034728,2,0.036265,0.013285
4,4,4,Stamford,0.118511,1021,CT,1976,0.077848,0.200633,0.191139,...,0.0,0.94809,4,36052,26340,0.004594,0.02832,34,0.019938,0.005009


In [19]:
df.to_csv('patents_data_cleaned_all_years_new.csv')