## Notebook Plan

This notebook will create the scoring metrics used to measure innovation in our selected cities.

1. Create a new metric to measure the patent classification spread in a city
2. Create normalized (to 1) scores for each city. The metrics for these scores come from literature defined in the Progress Report. Contain patents, citations and classifications.

In [1]:
import requests
import json
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize

import matplotlib as mplib
import matplotlib.pyplot as plt
from datetime import datetime
import copy
import ast
from collections import Counter
import itertools
from ast import literal_eval
import pandas_profiling
import time

from sklearn.cluster import KMeans
from sklearn import preprocessing



%matplotlib inline

### Read in Patent data and drop non-US based cities

In [2]:
df = pd.read_csv('patents_data_cleaned_all_years_new.csv')

In [3]:
df = df.loc[~df.State.isna()]

In [4]:
df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'], axis = 1, inplace = True)
print(df.shape)
df.head()

(38593, 48)


Unnamed: 0,City,Inv_to_Assignee_ratio,Patents,State,Year,assignee_IPC_A,assignee_IPC_B,assignee_IPC_C,assignee_IPC_D,assignee_IPC_E,...,pt_statutory invention registration,pt_utility,Rank,assignee_patent_annual_sum,inventor_patent_annual_sum,inventor_patents_perc,assignee_patents_perc,inventor_rank,assignee_pats_citations_normalized,inventor_pats_citations_normalized
0,New York,0.055923,4077,NY,1976,0.121036,0.157771,0.280063,0.042229,0.023391,...,0.0,0.955359,0,36052,26340,0.008656,0.113087,11,0.110902,0.006969
1,Washington,0.02184,1511,DC,1976,0.055236,0.136544,0.160407,0.012815,0.013699,...,0.0,0.998676,1,36052,26340,0.001253,0.041912,217,0.025431,0.00098
2,Pittsburgh,0.296154,1300,PA,1976,0.024614,0.216699,0.28668,0.015444,0.028958,...,0.0,0.987692,2,36052,26340,0.014617,0.036059,1,0.039576,0.015354
3,Chicago,0.305112,1252,IL,1976,0.136054,0.242282,0.179487,0.048142,0.030874,...,0.0,0.922524,3,36052,26340,0.014503,0.034728,2,0.036265,0.013285
4,Stamford,0.118511,1021,CT,1976,0.077848,0.200633,0.191139,0.041772,0.005063,...,0.0,0.94809,4,36052,26340,0.004594,0.02832,34,0.019938,0.005009


In [6]:
df.columns

Index(['City', 'Inv_to_Assignee_ratio', 'Patents', 'State', 'Year',
       'assignee_IPC_A', 'assignee_IPC_B', 'assignee_IPC_C', 'assignee_IPC_D',
       'assignee_IPC_E', 'assignee_IPC_F', 'assignee_IPC_G', 'assignee_IPC_H',
       'assignee_pats_cited', 'assignee_pats_cited_ratio', 'assignee_type_2',
       'assignee_type_3', 'assignee_type_4', 'assignee_type_5',
       'assignee_type_6', 'assignee_type_7', 'assignee_type_8',
       'assignee_type_9', 'city_state', 'inventor_IPC_A', 'inventor_IPC_B',
       'inventor_IPC_C', 'inventor_IPC_D', 'inventor_IPC_E', 'inventor_IPC_F',
       'inventor_IPC_G', 'inventor_IPC_H', 'inventor_patents',
       'inventor_pats_cited', 'inventor_pats_cited_ratio', 'pt_design',
       'pt_plant', 'pt_reissue', 'pt_statutory invention registration',
       'pt_utility', 'Rank', 'assignee_patent_annual_sum',
       'inventor_patent_annual_sum', 'inventor_patents_perc',
       'assignee_patents_perc', 'inventor_rank',
       'assignee_pats_citations_norm

### Create Classifcation Spread for both assigned and invented patents

In [7]:
#Spread of patents across classifications
# Higher return value is bad for innovation. 
def IPC_spread(row):
    assignee_vals = [row['assignee_IPC_A'], row['assignee_IPC_B'], row['assignee_IPC_C'], row['assignee_IPC_D'],
                     row['assignee_IPC_E'], row['assignee_IPC_F'], row['assignee_IPC_G'], row['assignee_IPC_H']]
    max_val = max(assignee_vals)
    min_val = min(assignee_vals)
    return(max_val - min_val)

#Spread of patents across classifications
# Higher return value is bad for innovation. 
def IPC_inv_spread(row):
    inventor_vals = [row['inventor_IPC_A'], row['inventor_IPC_B'], row['inventor_IPC_C'], row['inventor_IPC_D'],
                     row['inventor_IPC_E'], row['inventor_IPC_F'], row['inventor_IPC_G'], row['inventor_IPC_H']]
    max_inv_val = max(inventor_vals)
    min_inv_val = min(inventor_vals)
    return(max_inv_val - min_inv_val)

In [8]:
df['IPC_assig_spread'] = df.apply(lambda x: IPC_spread(x), axis = 1)
df['IPC_inv_spread'] = df.apply(lambda x: IPC_inv_spread(x), axis = 1)

### Create 7 scores that will be used to analyze innovation

In [9]:
min_max_scaler = preprocessing.MinMaxScaler()
scored_df = []

In [10]:
for year in range(1976,2015):
    temp_df = df.loc[df.Year == year]
    
    # Inventor Citations
    temp_df['Score1'] = temp_df['inventor_pats_citations_normalized']

    # Inventor and Assignee Citations
    temp_df['Score2'] = temp_df['inventor_pats_citations_normalized'] + temp_df['assignee_pats_citations_normalized']

    # Score 1 with classification spread
    temp_df['Score3'] = temp_df['Score1'] / (temp_df['IPC_inv_spread'])

    # Score 2 with classification spread
    temp_df['Score4'] = temp_df['Score2'] / (temp_df['IPC_inv_spread'] + temp_df['IPC_assig_spread'])

    # Number of invented patents
    temp_df['Score5'] = temp_df['inventor_patents']

    # Number of assigned patents
    temp_df['Score6'] = min_max_scaler.fit_transform(temp_df[['Patents']])


    temp_df = temp_df[~temp_df.isin([np.nan, np.inf, -np.inf]).any(1)]
    temp_df.dropna(inplace = True)
    temp_df['Score1'] = min_max_scaler.fit_transform(temp_df[['Score1']])
    temp_df['Score2'] = min_max_scaler.fit_transform(temp_df[['Score2']])
    temp_df['Score3'] = min_max_scaler.fit_transform(temp_df[['Score3']])
    temp_df['Score4'] = min_max_scaler.fit_transform(temp_df[['Score4']])
    temp_df['Score5'] = min_max_scaler.fit_transform(temp_df[['Score5']])

    # Total patents 
    temp_df['Score7'] = temp_df['Score5'] + temp_df['Score6']
    
    
    temp_df = temp_df[['city_state', 'Year', 'Score1', 'Score2', 'Score3', 'Score4', 'Score5', 'Score6', 'Score7']]
    scored_df.append(temp_df)
    del temp_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-d

In [11]:
final_df = pd.concat(scored_df)

In [12]:
final_df.head()

Unnamed: 0,city_state,Year,Score1,Score2,Score3,Score4,Score5,Score6,Score7
0,"('New York', 'NY')",1976,0.453901,1.0,0.385934,1.0,0.408273,1.0,1.408273
1,"('Washington', 'DC')",1976,0.06383,0.224064,0.063153,0.245674,0.057554,0.370152,0.427706
2,"('Pittsburgh', 'PA')",1976,1.0,0.466017,0.64672,0.395511,0.690647,0.31836,1.009008
3,"('Chicago', 'IL')",1976,0.865248,0.420378,0.811358,0.487232,0.685252,0.306578,0.99183
4,"('Stamford', 'CT')",1976,0.326241,0.21165,0.188132,0.156708,0.215827,0.249877,0.465705


In [20]:
final_df['City'] = final_df.city_state.apply(lambda x: x.split("'")[1])
final_df['State'] = final_df.city_state.apply(lambda x: x.split("'")[3])
final_df['city_state'] = final_df['City'] + "_" + final_df['State']
final_df.head()

Unnamed: 0,city_state,Year,Score1,Score2,Score3,Score4,Score5,Score6,Score7,City,State
0,New York_NY,1976,0.453901,1.0,0.385934,1.0,0.408273,1.0,1.408273,New York,NY
1,Washington_DC,1976,0.06383,0.224064,0.063153,0.245674,0.057554,0.370152,0.427706,Washington,DC
2,Pittsburgh_PA,1976,1.0,0.466017,0.64672,0.395511,0.690647,0.31836,1.009008,Pittsburgh,PA
3,Chicago_IL,1976,0.865248,0.420378,0.811358,0.487232,0.685252,0.306578,0.99183,Chicago,IL
4,Stamford_CT,1976,0.326241,0.21165,0.188132,0.156708,0.215827,0.249877,0.465705,Stamford,CT


### Write to file

In [21]:
final_df.to_csv('cities_with_scores.csv')