In [73]:
import pystan
import pandas as pd
import numpy as np
from scipy import stats
import os

In [131]:
# Project root = parent of the current working directory.
path = os.path.abspath('..')
# Build the CSV path with os.path.join so the notebook also runs on
# Linux/macOS (the hard-coded '\\' separators only worked on Windows).
data = pd.read_csv(os.path.join(path, 'data', 'data.csv'))

In [132]:
# Columns that should be handled as categorical rather than numeric.
category = [
    'Type', 'Name', 'Breed1', 'Gender',
    'Color1', 'Color2', 'Color3',
    'MaturitySize', 'FurLength',
    'Vaccinated', 'Dewormed', 'Sterilized', 'Health',
]
# Cast every listed column to the pandas "category" dtype in one call.
data = data.astype({col: "category" for col in category})

In [133]:
# Dummy-encode the categorical columns with pd.get_dummies, then replace the
# original columns by their indicator columns. The remaining original columns
# keep their order and the indicator columns are appended after them.
categorical_cols = [
    'Type', 'Name', 'Breed1', 'Gender',
    'Color1', 'Color2', 'Color3',
    'MaturitySize', 'FurLength',
    'Vaccinated', 'Dewormed', 'Sterilized', 'Health',
]
df1 = pd.get_dummies(data[categorical_cols])
data = pd.concat((data.drop(columns=categorical_cols), df1), axis=1)

In [134]:
from sklearn.model_selection import train_test_split

# Predictors: every column except the target.
x = data.drop(columns=['AdoptionSpeed'])
# Shift the labels 0..4 to 1..5 (Stan's categorical_logit expects outcomes
# coded 1..K).
y = data[['AdoptionSpeed']].replace(list(range(5)), list(range(1, 6)))
# 75/25 split with a fixed random_state so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [135]:
# Print "<name> <shape>" for each of the four split pieces.
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(split_name, split.shape)

X_train (11244, 284)
X_test (3749, 284)
y_train (11244, 1)
y_test (3749, 1)


In [87]:
# Stan program for a multinomial (softmax) logistic classifier.
#   data:       N training rows (y, x) with D predictors and K classes, plus
#               N2 rows (x_test) to predict.
#   parameters: beta, a K x D coefficient matrix with a normal(0, 5) prior on
#               every entry.
#   generated:  output[i], one class drawn per posterior draw for x_test[i].
# Changes from the previous version of this cell:
#   * N only needs to be >= 1 (the old lower bound of 1000 rejected any
#     smaller dataset for no modelling reason);
#   * sizes and labels are range-checked when the data is loaded;
#   * `output` is an integer array, matching what categorical_logit_rng returns;
#   * the source string has its own name instead of being overwritten by the
#     compiled model.
m_logit_code = '''
data {
  int<lower=1> K;                 // number of outcome classes
  int<lower=1> N;                 // number of training rows
  int<lower=1> N2;                // number of rows to predict
  int<lower=1> D;                 // number of predictors
  int<lower=1, upper=K> y[N];     // training labels, coded 1..K
  vector[D] x[N];                 // training predictors
  vector[D] x_test[N2];           // predictors of the rows to predict
}

parameters {
  matrix[K, D] beta;              // one coefficient row per class
}

model {
  // prior for beta (all K*D entries at once)
  to_vector(beta) ~ normal(0, 5);

  // likelihood of outcome
  for (i in 1:N)
    y[i] ~ categorical_logit(beta * x[i]); // softmax
}

generated quantities {
  // predicted class for each row of x_test
  int<lower=1, upper=K> output[N2];
  for (i in 1:N2)
    output[i] = categorical_logit_rng(beta * x_test[i]);
}
'''
# Compile once; the compiled model is reused by every sampling call below.
m_logit = pystan.StanModel(model_code=m_logit_code)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_c4a4e092e16b4480764975468afe633c NOW.
  tree = Parsing.p_module(s, pxd, full_module_name)


In [88]:
# Test: Stan data that fits on the training rows and generates predictions
# ('x_test') for the held-out rows.
n_train, n_features = X_train.shape
dat_logit = dict(
    N=n_train,
    K=5,
    D=n_features,
    y=y_train.AdoptionSpeed,
    x=X_train,
    N2=X_test.shape[0],
    x_test=X_test,
)

In [89]:
# Sample the posterior with 4 chains of 1000 iterations each.
# The fixed seed makes the draws, and therefore the predictions and the
# accuracies reported below, reproducible across runs.
result_logit = m_logit.sampling(data=dat_logit, iter=1000, chains=4, seed=42)



In [90]:
# Read the draws of `output` directly from the fit object: one row per
# posterior draw, one column per row of x_test. The fit returned by
# .sampling() has no ['args']['sample_file'] entry (that is the shape of the
# dict returned by .vb(), and no CSV is written unless sample_file is passed),
# so the previous CSV round-trip could not work here.
output_draws = np.asarray(result_logit.extract(pars=['output'])['output'])

# Predicted class = most frequent draw per column. np.ravel flattens the
# result whether scipy returns shape (1, N2) or (N2,).
pred_logit = np.ravel(stats.mode(output_draws, axis=0)[0])
pred_logit_test = pred_logit.astype(int).tolist()

In [91]:
# Train: same fit as above, but predictions ('x_test') are generated for the
# training rows themselves so the training accuracy can be measured.
n_train, n_features = X_train.shape
dat_logit = dict(
    N=n_train,
    K=5,
    D=n_features,
    y=y_train.AdoptionSpeed,
    x=X_train,
    N2=n_train,
    x_test=X_train,
)

In [94]:
# Sample the posterior with 4 chains of 1000 iterations each.
# (Removed the stray ", ," that made this line a SyntaxError.)
# The fixed seed makes the draws and the reported accuracy reproducible.
result_logit = m_logit.sampling(data=dat_logit, iter=1000, chains=4, seed=42)



In [97]:
# Read the draws of `output` directly from the fit object: one row per
# posterior draw, one column per row of x_test (here: the training rows).
# The fit returned by .sampling() has no ['args']['sample_file'] entry (that
# is the shape of the dict returned by .vb()), so the previous CSV round-trip
# could not work here.
output_draws = np.asarray(result_logit.extract(pars=['output'])['output'])

# Predicted class = most frequent draw per column. np.ravel flattens the
# result whether scipy returns shape (1, N2) or (N2,).
pred_logit = np.ravel(stats.mode(output_draws, axis=0)[0])
pred_logit_train = pred_logit.astype(int).tolist()

In [104]:
from sklearn.metrics import accuracy_score

# Share of rows whose predicted class equals the true label, per split.
acc_test = accuracy_score(y_test, pred_logit_test)
acc_train = accuracy_score(y_train, pred_logit_train)
print("Test accuracy of Logistic Classification is", acc_test)
print("Train accuracy of Logistic Classification is", acc_train)

Test accuracy of Logistic Classification is 0.3078154174446519
Train accuracy of Logistic Classification is 0.35645677694770544


## Export to csv

In [127]:
# Save the test-set predictions as a single-column CSV under <path>/result.
# os.path.join keeps the path valid on non-Windows systems as well.
df = pd.DataFrame(pred_logit_test)
df.to_csv(os.path.join(path, 'result', 'logistic_des.csv'), index=False)

## Without description

In [117]:
# Remove the 50 columns at positions 7..56 of `data`.
# NOTE(review): presumably these are the description-derived features (per the
# section heading) -- confirm; a positional slice silently breaks if the
# column layout changes.
data2 = data.drop(columns=data.columns[7:57])

In [122]:
from sklearn.model_selection import train_test_split

# Predictors: every column of the reduced frame except the target.
x = data2.drop(columns=['AdoptionSpeed'])
# Shift the labels 0..4 to 1..5 (Stan's categorical_logit expects outcomes
# coded 1..K).
y = data2[['AdoptionSpeed']].replace(list(range(5)), list(range(1, 6)))
# Same 75/25 split and random_state as for the full feature set.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [123]:
# Print "<name> <shape>" for each of the four split pieces.
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(split_name, split.shape)

X_train (11244, 234)
X_test (3749, 234)
y_train (11244, 1)
y_test (3749, 1)


In [124]:
# Test: fit on the training rows and generate predictions ('x_test') for the
# held-out rows.
dat_logit = {
  'N': X_train.shape[0],
  'K': 5,
  'D': X_train.shape[1],
  'y': y_train.AdoptionSpeed,
  'x': X_train,
  'N2': X_test.shape[0],
  'x_test': X_test
}

# Fixed seed so the draws and the reported accuracy are reproducible.
result_logit = m_logit.sampling(data=dat_logit, iter=1000, chains=4, seed=42)

# Read the draws of `output` directly from the fit object (one row per
# posterior draw, one column per row of x_test). The fit returned by
# .sampling() has no ['args']['sample_file'] entry, so the previous CSV
# round-trip could not work here.
output_draws = np.asarray(result_logit.extract(pars=['output'])['output'])

# Predicted class = most frequent draw per column. np.ravel flattens the
# result whether scipy returns shape (1, N2) or (N2,).
pred_logit = np.ravel(stats.mode(output_draws, axis=0)[0])
pred_logit_test2 = pred_logit.astype(int).tolist()



In [125]:
# Train: same fit, but predictions ('x_test') are generated for the training
# rows themselves so the training accuracy can be measured.
dat_logit = {
  'N': X_train.shape[0],
  'K': 5,
  'D': X_train.shape[1],
  'y': y_train.AdoptionSpeed,
  'x': X_train,
  'N2': X_train.shape[0],
  'x_test': X_train
}

# Fixed seed so the draws and the reported accuracy are reproducible.
result_logit = m_logit.sampling(data=dat_logit, iter=1000, chains=4, seed=42)

# Read the draws of `output` directly from the fit object (one row per
# posterior draw, one column per row of x_test). The fit returned by
# .sampling() has no ['args']['sample_file'] entry, so the previous CSV
# round-trip could not work here.
output_draws = np.asarray(result_logit.extract(pars=['output'])['output'])

# Predicted class = most frequent draw per column. np.ravel flattens the
# result whether scipy returns shape (1, N2) or (N2,).
pred_logit = np.ravel(stats.mode(output_draws, axis=0)[0])
pred_logit_train2 = pred_logit.astype(int).tolist()



In [126]:
from sklearn.metrics import accuracy_score

# Share of rows whose predicted class equals the true label, per split.
acc_test2 = accuracy_score(y_test, pred_logit_test2)
acc_train2 = accuracy_score(y_train, pred_logit_train2)
print("Test accuracy of Logistic Classification is", acc_test2)
print("Train accuracy of Logistic Classification is", acc_train2)

Test accuracy of Logistic Classification is 0.3835689517204588
Train accuracy of Logistic Classification is 0.37513340448239063


In [128]:
# Save the test-set predictions as a single-column CSV under <path>/result.
# os.path.join keeps the path valid on non-Windows systems as well.
df = pd.DataFrame(pred_logit_test2)
df.to_csv(os.path.join(path, 'result', 'logistic_nodes.csv'), index=False)

## KNN

Without description

In [130]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours baseline on the current train/test split.
knn = KNeighborsClassifier(n_neighbors=5)
# scikit-learn expects a 1-D label vector; passing the (n, 1) DataFrame
# triggered a DataConversionWarning on every fit.
knn.fit(X_train, y_train.values.ravel())
y_pred_test = knn.predict(X_test)
y_pred_train = knn.predict(X_train)
print("Test accuracy of Knn Classification is", accuracy_score(y_test, y_pred_test))
print("Train accuracy of Knn Classification is", accuracy_score(y_train, y_pred_train))

  This is separate from the ipykernel package so we can avoid doing imports until


Test accuracy of Knn Classification is 0.3083488930381435
Train accuracy of Knn Classification is 0.5381536819637139


With description

In [136]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Rebuild the split from the full frame `data`. Without this, a top-to-bottom
# run reuses the X_train/X_test left over from `data2` and this cell would
# just repeat the previous KNN result.
x = data.drop(columns=['AdoptionSpeed'])
y = data[['AdoptionSpeed']].replace([0, 1, 2, 3, 4], [1, 2, 3, 4, 5])
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# 5-nearest-neighbours baseline on the full feature set.
knn = KNeighborsClassifier(n_neighbors=5)
# scikit-learn expects a 1-D label vector; passing the (n, 1) DataFrame
# triggered a DataConversionWarning on every fit.
knn.fit(X_train, y_train.values.ravel())
y_pred_test = knn.predict(X_test)
y_pred_train = knn.predict(X_train)
print("Test accuracy of Knn Classification is", accuracy_score(y_test, y_pred_test))
print("Train accuracy of Knn Classification is", accuracy_score(y_train, y_pred_train))

  This is separate from the ipykernel package so we can avoid doing imports until


Test accuracy of Knn Classification is 0.3134169111763137
Train accuracy of Knn Classification is 0.5403770900035575
