In [1]:
# Sample USPTO to test coding with limited memory

# 09-23-19
# Marcelo sugeriu Reservoir Sampling

# 09-18-19
# Renato Kogeyama

In [2]:
import csv
import pandas as pd
import random
import itertools
import sys
import numpy as np


from sklearn import linear_model
from sklearn import naive_bayes
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plot
from sklearn.metrics import roc_curve, auc
from scipy import stats
import datetime

In [3]:
# Input CSV that the sampling step reads.
# Previously referenced inputs, kept for reference only:
#   citation_file = r"patent_citation.csv"
#   patents_file = "patent.csv"
fname = 'df_w_pat_cit_190926.csv'

# Presumably the file name for the sampled subset; it is not used by the
# cells visible here -- TODO confirm.
sample_df = 'sample_df_w_pat_cit_190926.csv'

In [4]:
# Dataset sample
# The logic is inverted: instead of listing the rows to load, build the list
# of rows to skip and hand that list to pd.read_csv.

SAMPLE_SEED = 42  # fixed seed so that Restart & Run All draws the same sample

# Count the lines inside a context manager so the file handle is closed.
with open(fname) as source_file:
    n = sum(1 for line in source_file) - 1  # number of records in file (excludes header)
s = 1000000  # desired sample size

# When the file holds no more than s records nothing is skipped;
# random.sample raises ValueError for a negative sample size.
n_skip = max(n - s, 0)

# Candidate rows start at 1, so the 0-indexed header is never skipped.
skip = sorted(random.Random(SAMPLE_SEED).sample(range(1, n + 1), n_skip))
df = pd.read_csv(fname, skiprows=skip)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Convert the categorical columns into dummy (indicator) columns

categorical_columns = ['type', 'kind', 'country']
df = pd.get_dummies(df, columns=categorical_columns)

In [None]:
# Dataset size

# DataFrame.shape is (number of rows, number of columns)
nrow, ncol = df.shape

print('# of rows:', nrow)
print('# of columns:', ncol)

# of rows: 999899
# of columns: 28


In [None]:
# df3 - outlier removal
#
# Keep the rows whose citation_id and num_claims values are both less than
# three standard deviations above their column mean. The filter is one-sided:
# only unusually large values are dropped.

outlier_columns = ['citation_id', 'num_claims']

# Compute the statistics for the two filtered columns only. Aggregating the
# whole frame is wasted work and raises TypeError on pandas >= 2.0 if any
# column is non-numeric.
mean = df[outlier_columns].mean(axis=0)
std_dev = df[outlier_columns].std(axis=0)

threshold = 3 * std_dev

# Both conditions use statistics of the unfiltered frame, so one combined mask
# selects the same rows as applying the two filters one after the other.
within_threshold = (
    (df.citation_id - mean.citation_id < threshold.citation_id)
    & (df.num_claims - mean.num_claims < threshold.num_claims)
)

# Boolean indexing returns a new frame; df itself is left untouched.
df3 = df[within_threshold]

df3.shape

In [None]:
# Check the Dataset
def uniqueColValues(auxdf):
    """Print one summary line per column of ``auxdf``.

    Each line has the form ``<column name> | <number of distinct values> | <dtype>``.

    Parameters
    ----------
    auxdf : pandas.DataFrame
        Frame whose columns are summarised.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    for label in auxdf:
        series = auxdf[label]
        distinct_count = len(series.unique())
        print(f"{series.name} | {distinct_count} | {series.dtype}")
        
# Print name, number of distinct values and dtype for every column of df.
uniqueColValues(df)

In [None]:
matplotlib inline #exibe os graficos

plot.subplot(1, 3, 1)
plot.title('# of citations \n with outliers')
plot.xlabel('citations') 
plot.ylabel('claims') 
plot.scatter(df['citation_id'], df['num_claims'], s=12)

plot.subplot(1, 3, 2)
plot.title('# of citations \n no outliers')
plot.xlabel('citations') 
plot.ylabel('claims') 
plot.scatter(df3['citation_id'], df3['num_claims'], s=1)

plot.subplot(1, 3, 3)
plot.title('# of citations \n histogram \n no outliers')
df3.citation_id.hist()

plot.tight_layout()

In [None]:
# # Naive Bayes
def test_naive_bayes(dataset):
    """Fit a multinomial Naive Bayes classifier and measure its hold-out error.

    The feature columns listed in ``chosenColumns`` are used to predict the
    ``citation_id`` column. The data is split 70/30 into train and test sets
    with a fixed ``random_state``, so the split is the same on every call for
    the same input.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain every column named in ``chosenColumns`` as well as
        ``citation_id``; a missing column raises ``KeyError``.

    Returns
    -------
    tuple
        ``(testSize, errorNB, errorNBPCT, hitRateNBPCT)``: the number of test
        rows, the number of misclassified test rows (a plain ``int``), the
        error rate as a percentage truncated to an integer, and
        ``100 - errorNBPCT``.
    """
    chosenColumns = ['num_claims', 'year', 'type_plant',
           'type_reissue', 'type_statutory invention registration', 'type_utility',
           'kind_A', 'kind_B1', 'kind_B2', 'kind_E', 'kind_E1', 'kind_H', 'kind_P',
           'kind_P2', 'kind_P3']

    # DataFrame.as_matrix was deprecated in pandas 0.23 and removed in 1.0;
    # column selection followed by .values works on old and new versions.
    myX = dataset[chosenColumns].values
    # Taken as a 1-D vector so no ravel() is needed for fit() or the comparison.
    myY = dataset['citation_id'].values

    xTrain, xTest, yTrain, yTest = train_test_split(myX, myY, train_size=0.7, random_state=3)
    testSize = yTest.shape[0]

    # The target is a count, so a multinomial distribution was preferred over
    # GaussianNB.
    # ComplementNB adapts MultinomialNB to heavily imbalanced datasets, but it
    # is not available in scikit-learn 0.19.
    nb = naive_bayes.MultinomialNB()

    nb.fit(xTrain, yTrain)

    yPredNB = nb.predict(xTest)  # predicting test data

    # Computing the error: one vectorised comparison. np.sum over a generator
    # is deprecated and produced a 1-element array here instead of a number.
    errorNB = int(np.sum(yPredNB != yTest))
    errorNBPCT = int(100*errorNB/testSize)
    hitRateNBPCT = 100 - errorNBPCT
    return (testSize, errorNB, errorNBPCT, hitRateNBPCT)
   

In [None]:
# Naive Bayes on the complete dataset

testSize, errorNB, errorNBPCT, hitRateNBPCT = test_naive_bayes(df)

# Each tuple is printed as one line, its items separated by single spaces.
report_lines = [
    ("----------Naive Bayes----------",),
    (int(errorNB), "misclassified data out of", testSize),
    ("Error PCT: ", errorNBPCT, '%'),
    ("Hit Rate:  ", hitRateNBPCT, '%'),
]
for report_line in report_lines:
    print(*report_line)

In [None]:
# Naive Bayes on the dataset without outliers (df3)

testSize, errorNB, errorNBPCT, hitRateNBPCT = test_naive_bayes(df3)

# Each tuple is printed as one line, its items separated by single spaces.
report_lines = [
    ("----------Naive Bayes----------",),
    (int(errorNB), "misclassified data out of", testSize),
    ("Error PCT: ", errorNBPCT, '%'),
    ("Hit Rate:  ", hitRateNBPCT, '%'),
]
for report_line in report_lines:
    print(*report_line)