# Try to load models

In [1]:
import pickle
from sklearn import metrics
import pandas as pd
import numpy as np

In [2]:
models = pickle.load(open('models/models_trained.pkl', 'rb'))

In [3]:
models.keys()

dict_keys(['columns', 'feature_selection', 'imputer', 'standard_scaler', 'dimensionality_reduction', 'data', 'model_train'])

In [4]:
models['feature_selection']

{'method': 'ANOVA F-Score',
 'scaler': SelectKBest(k=4),
 'best_k': 4,
 'best_features_': ['Pregnancies', 'Glucose', 'BMI', 'Age']}

In [5]:
models['dimensionality_reduction']

{'method': 'PCA', 'best_k': 3, 'scaler': PCA(n_components=3)}

In [6]:
X_test = models['data']['after_preprocessing']['X_test']
y_test = models['data']['after_preprocessing']['y_test']

In [7]:
model = models['model_train']['model']
y_pred = model.predict(X_test)

In [8]:
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.91      0.71      0.80       150
           1       0.62      0.88      0.73        81

    accuracy                           0.77       231
   macro avg       0.77      0.79      0.76       231
weighted avg       0.81      0.77      0.78       231



# Test model with data inferences

In [9]:
data_inference = pd.read_csv('dataset/diabetes.csv')

In [10]:
X,y = data_inference.drop('Outcome',axis=1), data_inference['Outcome']

In [19]:
X.loc[0], y.loc[0]

(Pregnancies                   6.000
 Glucose                     148.000
 BloodPressure                72.000
 SkinThickness                35.000
 Insulin                       0.000
 BMI                          33.600
 DiabetesPedigreeFunction      0.627
 Age                          50.000
 Name: 0, dtype: float64,
 1)

In [11]:
models['columns'] = dict(models['columns'])
invalid_columns = models['columns']['invalid']
numerical_columns = models['columns']['numerical']
retained_columns = models['feature_selection']['best_features_']

numerical_imputer = models['imputer']['numerical']
scaler = models['standard_scaler']['scaler']
pca_scaler = models['dimensionality_reduction']['scaler']

In [12]:
retained_columns, invalid_columns

(['Pregnancies', 'Glucose', 'BMI', 'Age'],
 ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'])

In [13]:
def inputs(df_sample_input):
    #feature selection
    df_sample_input = df_sample_input.loc[:,retained_columns]
    
    # imputer for invalid and missing values
    for invalid_col in retained_columns:
        df_sample_input[invalid_col].replace({0:np.nan}, inplace=True)
    df_sample_input[retained_columns] = numerical_imputer.transform(df_sample_input[retained_columns])
    
    #scaling the features
    df_sample_input = scaler.transform(df_sample_input)
    
    #dimensionality reduction
    X_inputs = pca_scaler.transform(df_sample_input)
    
    return X_inputs

In [14]:
X_input = inputs(data_inference)

In [15]:
y_pred_knn = model.predict(X_input)

In [16]:
print(metrics.classification_report(y.to_numpy(),y_pred_knn))

              precision    recall  f1-score   support

           0       0.90      0.72      0.80       500
           1       0.62      0.85      0.71       268

    accuracy                           0.76       768
   macro avg       0.76      0.78      0.75       768
weighted avg       0.80      0.76      0.77       768

