## K Nearest Neigbors Classifiers
Created: 4/13/2020 Last Modified: 4/13/2020
Author: Francisco Cid

In [3]:
#Library Imports
from utils import intake_data
from imblearn import over_sampling
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import warnings
from sklearn.preprocessing import StandardScaler 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import accuracy_score

### Importing clean and upsampled data 

In [4]:
data_x, data_y = intake_data()

TEST


### Feature Reduction

Several feature reduction methods were tested. Sequential Forward Selection 
was chosen since it gave the highest accuracy. 

From http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/ : 
"In a nutshell, SFAs remove or add one feature at the time based on the classifier performance until a feature subset of the desired size k is reached."

In [5]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
warnings.simplefilter('ignore')
knn = KNeighborsClassifier()
sfs = sfs(knn, k_features=14, forward=True, floating=False,
          verbose=2, scoring='accuracy', cv=5)
sfs.fit(data_x, data_y)
feat_cols = list(sfs.k_feature_idx_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    0.3s finished

[2020-04-17 22:02:49] Features: 1/14 -- score: 0.6102745098039215[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    0.2s finished

[2020-04-17 22:02:50] Features: 2/14 -- score: 0.6141960784313725[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:    0.2s finished

[2020-04-17 22:02:50] Features: 3/14 -- score: 0.6141960784313725[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

Feature reduction through sequential forward selection found that the following columns should be kept: 

In [6]:
data_x = data_x.iloc[:, feat_cols]

###### K Nearest Neighbors
K nearest neighbors is a supervised learning algorithm that stores all available cases and classifies new cases based on a similarity measure (e.g. distance functions) 

###### Hyperparameters

The following is straight from scikit-learn documentation:

***n_neighbors*** int, optional (default = 5)
Number of neighbors to use by default for kneighbors queries.

***weightsstr*** optional (default = ‘uniform’)
weight function used in prediction.
For uniform weights, all points in each neighborhood are weighted equally.

***metricstring***  default ‘minkowski’
The distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric.

In [7]:
print('Data Shape:',data_x.shape)
#crate a scaler
scaler = StandardScaler()

#create a KNN classifier
knn = KNeighborsClassifier()

#create a pipeline that does scaling, then KNN
pipe = Pipeline(steps=[('scaler', scaler),('knn', knn)])

#Set up the parameters you want to tune for each of your pipeline steps
#Parameters of pipelines can be set using ‘__’ separated parameter names:
param_grid = {
    'knn__n_neighbors': list(range(1, 30)),  #find the best value of k
}

# pass the pipeline and the parameters into a GridSearchCV with a 5-fold cross validation
gs = GridSearchCV(pipe, param_grid, cv=5)
# call fit() on the GridSearchCV and pass in the unscaled data (X_values, Y_values)
gs.fit(data_x, data_y)
# print out the best_score_ and best_params_ from the GridSearchCV
print("Best Score:",gs.best_score_)
print("Chosen parameters:", gs.best_params_)

#Printing final accuracy
cross_val_score(gs, data_x, data_y, cv=5)
y_predict = cross_val_predict(gs, data_x, data_y, cv=5)
avg_acc = accuracy_score(data_y, y_predict)
print('Average Accuracy:', avg_acc)

Data Shape: (254, 14)
Best Score: 0.764
Chosen parameters: {'knn__n_neighbors': 1}
Average Accuracy: 0.7440944881889764


### Printing classification report and confusion matrix

In [13]:
#Confusion Matrix Function
#Create a confusion matrix
#Inputs:
#Y_Actual = Actual (true) labels
#Y_Predicted = Predicted labels from a classifier
#classifier = Name of classifier as a string for plt title
def create_cm (Y_Actual, Y_Predicted, classifier=None):
    try: 
        import seaborn as sn
    except:
        print('create_cm(Y_Actual, Y_Predicted, classifier) has a dependency on the library seaborn.')
        print('This library was not found, please install and try again.')
        return()
    if type(classifier) != str:
        print('Please input the name of the classifier for plt title as a str')
        return()

            
    data = {'Actual': Y_Actual, 'Predicted':Y_Predicted}
    df = pd.DataFrame(data, columns=['Actual', 'Predicted'])
    df['Predicted'].replace(0, 'Alive', inplace=True)
    df['Predicted'].replace(1, 'Dead', inplace=True)
    df['Actual'].replace(0, 'Alive', inplace=True)
    df['Actual'].replace(1, 'Dead', inplace=True)
    cm = pd.crosstab(df['Actual'], df['Predicted'],
                     rownames=['Actual'], colnames=['Predicted'])                                    
    #Print a formatted confusion matrix using a heatmap
    sn.heatmap(cm, annot=True)
    plt_title = 'Confusion Matrix for ' + classifier
    plt.title('Confusion Matrix for K Nearest Neighbors')
    plt.show()

In [17]:
print(classification_report(data_y, y_predict))
create_cm(data_y, y_predict)

              precision    recall  f1-score   support

         0.0       0.79      0.66      0.72       127
         1.0       0.71      0.83      0.76       127

    accuracy                           0.74       254
   macro avg       0.75      0.74      0.74       254
weighted avg       0.75      0.74      0.74       254

Please input the name of the classifier for plt title as a str


()