In [323]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import urllib 
from flask import Flask, request, jsonify
from markupsafe import escape
import json     

import json 
import dill
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_auc_score,roc_curve, precision_recall_curve
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

import sklearn.datasets
import sklearn
# Show estimators and pipelines as HTML diagrams when they are displayed in the notebook.
sklearn.set_config(display='diagram')

import dill
# Map the name 'ClassType' to the built-in `type` in dill's private reverse type map.
# NOTE(review): presumably a workaround for "KeyError: 'ClassType'" during (un)pickling —
# confirm it is still required with the installed dill version, since this is a private attribute.
dill._dill._reverse_typemap['ClassType'] = type

## Шаг 1

In [324]:
# Load the raw dataset from the working directory and preview its first three rows.
df_train = pd.read_csv(filepath_or_buffer="diabetes_prediction_dataset.csv")
df_train.head(n=3)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0


In [325]:
# For the users' convenience, recode the gender and smoking_history columns
# to 0/1 flags right away. `replace` leaves any value not listed here untouched.
binary_encodings = {
    'gender': {'Female': 0, 'Male': 1, 'Other': 0},
    'smoking_history': {'No Info': 0, 'never': 0, 'not current': 0, 'current': 1, 'ever': 1, 'former': 1},
}
for column_name, encoding in binary_encodings.items():
    df_train[column_name] = df_train[column_name].replace(encoding)

In [326]:
# Name of the label column and the list of model input columns.
target = 'diabetes'
features = [
    'blood_glucose_level',
    'age',
    'hypertension',
    'heart_disease',
    'HbA1c_level',
    'bmi',
    'gender',
    'smoking_history',
]

In [327]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
# Note that the full frame (label column included) is passed as X.
X_train, X_test, y_train, y_test = train_test_split(
    df_train,
    df_train['diabetes'],
    test_size=0.33,
    random_state=42,
)

# Persist the four parts without the index column: test files first, then train files.
for file_name, part in (
    ("X_test.csv", X_test),
    ("y_test.csv", y_test),
    ("X_train.csv", X_train),
    ("y_train.csv", y_train),
):
    part.to_csv(file_name, index=None)

In [328]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that extracts one column from a data frame.

    The column is looked up with single-bracket indexing (``X[key]``), so the
    result is whatever that lookup returns for the configured key.
    """
    def __init__(self, key):
        # Column label to extract in `transform`.
        self.key = key

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X):
        selected = X[self.key]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that extracts one column from a data frame.

    Intended for numeric columns: the column is looked up with a one-element
    list (``X[[key]]``), which is the form the scaler in this notebook is fed.
    """
    def __init__(self, key):
        # Column label to extract in `transform`.
        self.key = key

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X):
        wanted = [self.key]
        return X[wanted]
    
class SampleImputer(BaseEstimator, TransformerMixin):
    """
    Transformer that selects one column and fills its missing values with a constant.

    Parameters
    ----------
    key : column label to select from the input frame.
    value : replacement passed to ``fillna`` for missing entries.
    """
    def __init__(self, key, value):
        self.key = key
        self.value = value

    def fit(self, X, y=None):
        # Nothing is learned from the data; the fill value is fixed at construction.
        return self

    def transform(self, X):
        """Return column `key` of `X` with missing values replaced by `value`.

        The filled column is returned as a new object; `X` itself is left
        untouched, so the caller's frame is not modified as a side effect of
        running the pipeline.
        """
        return X[self.key].fillna(self.value)
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encoder built on ``pd.get_dummies`` with a fixed output layout.

    `fit` records the dummy columns produced for the fitting data; `transform`
    returns exactly those columns in that order, adding an all-zero column for
    every recorded category that is absent from the data being transformed.
    Dummy columns not seen during `fit` are not part of the output.
    """
    def __init__(self, key):
        self.key = key
        # Dummy column names recorded by `fit`.
        self.columns = []

    def fit(self, X, y=None):
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns)
        return self

    def transform(self, X):
        encoded = pd.get_dummies(X, prefix=self.key)
        # Snapshot of the columns produced for this input, taken before any are added.
        produced = set(encoded.columns)

        for expected in self.columns:
            if expected in produced:
                continue
            encoded[expected] = 0
        return encoded[self.columns]

In [329]:
# Columns that are each selected and standardized on their own branch.
continuous_columns = [
    'blood_glucose_level',
    'age',
    'hypertension',
    'heart_disease',
    'HbA1c_level',
    'bmi',
    'gender',
    'smoking_history',
]

# One (name, pipeline) pair per column: pick the column, then scale it.
final_transformers = []
for cont_col in continuous_columns:
    cont_transformer = Pipeline(steps=[
        ('selector', NumberSelector(key=cont_col)),
        ('scaler', StandardScaler()),
    ])
    final_transformers.append((cont_col, cont_transformer))

# Concatenate the outputs of all per-column branches side by side.
feats = FeatureUnion(transformer_list=final_transformers)

feature_processing = Pipeline(steps=[('feats', feats)])

In [330]:
# Feature preparation followed by a random forest classifier.
# The forest is seeded (like the train/test split) so that refitting on a fresh
# kernel reproduces the same model and therefore the same reported metrics.
pipeline = Pipeline([
    ('features', feats),
    ('random_forest_model', RandomForestClassifier(random_state=42)),
])
pipeline.fit(X_train, y_train)

In [331]:
# Serialize the fitted pipeline to disk with dill.
with open("rf_diabetes.dill", mode="wb") as f:
    dill.dump(pipeline, f)

## Шаг 2

In [332]:
# Read the held-out split back from the CSV files written earlier in the notebook.
X_test, y_test = (
    pd.read_csv(csv_name) for csv_name in ("X_test.csv", "y_test.csv")
)

In [333]:
# Preview the first three rows of the reloaded test frame.
X_test.head(n=3)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,13.0,0,0,0,20.82,5.8,126,0
1,0,3.0,0,0,0,21.0,5.0,145,0
2,1,63.0,0,0,1,25.32,3.5,200,0


In [334]:
# Restore the fitted pipeline from disk.
# Unpickling executes code from the file, so only load files you created or trust.
with open('rf_diabetes.dill', mode='rb') as in_strm:
    pipeline = dill.load(in_strm)

In [335]:
# Class-probability estimates for every row of the test frame.
predictions = pipeline.predict_proba(X_test)

In [351]:
# F-beta weighting: beta > 1 gives recall more weight than precision.
b = 1.2
precision, recall, thresholds = precision_recall_curve(y_test, predictions[:, 1])

# F-beta at every point of the curve. Where precision and recall are both 0 the
# formula is 0/0 = NaN, and np.argmax would return the position of that NaN,
# so such points are scored as 0 instead.
b_squared = b ** 2
fbeta_denominator = b_squared * precision + recall
fscore = np.divide(
    (1 + b_squared) * (precision * recall),
    fbeta_denominator,
    out=np.zeros_like(fbeta_denominator, dtype=float),
    where=fbeta_denominator > 0,
)
# locate the index of the largest f score
# precision/recall hold one more element than thresholds (the closing
# precision=1, recall=0 point has no threshold), so it is left out of the search
# to keep `thresholds[ix]` in range.
ix = np.argmax(fscore[:-1])
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.59, F-Score=0.773, Precision=0.966, Recall=0.679


In [352]:
# Area under the ROC curve, using the second probability column as the score.
roc_auc_score(y_true=y_test, y_score=predictions[:, 1])

0.9614661054393043

## Шаг 3

In [338]:
# Reload the held-out split from disk so this step starts from the saved CSV files.
X_test, y_test = (
    pd.read_csv(csv_name) for csv_name in ("X_test.csv", "y_test.csv")
)

In [339]:
def get_prediction(x):
    """
    Request a prediction for one patient from the local prediction service.

    The eight input values are sent as a JSON object in a POST request to
    http://127.0.0.1:5000/predict and the 'predictions' entry of the JSON
    response is returned.

    Parameters
    ----------
    x : iterable of exactly eight values, in this order:
        age, gender, hypertension, heart_disease, smoking_history, bmi,
        HbA1c_level, blood_glucose_level.

    Returns
    -------
    The value stored under the 'predictions' key of the service's JSON reply.

    Raises
    ------
    ValueError
        If `x` does not unpack into exactly eight values.
    urllib.error.URLError
        If the service cannot be reached.
    KeyError
        If the reply has no 'predictions' key.
    """
    # A bare `import urllib` does not guarantee that the `request` submodule is loaded.
    import urllib.request

    age, gender, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level = x

    # Only values that are actually unpacked from `x` are sent.
    # NOTE(review): the former 'gender_Other' / 'smoking_history_not_current' entries
    # referred to names that were never defined (NameError on every call) —
    # confirm the service does not expect those keys.
    body = {'age': age,
            'gender': gender,
            'hypertension': hypertension,
            'heart_disease': heart_disease,
            'smoking_history': smoking_history,
            'bmi': bmi,
            'HbA1c_level': HbA1c_level,
            'blood_glucose_level': blood_glucose_level
            }
    # numpy scalars (e.g. values taken from a DataFrame row) are not JSON
    # serializable; unwrap them to plain Python numbers first.
    body = {name: (value.item() if hasattr(value, 'item') else value)
            for name, value in body.items()}

    myurl = "http://127.0.0.1:5000/predict"
    jsondataasbytes = json.dumps(body).encode('utf-8')   # needs to be bytes
    # Supplying `data` makes this a POST; urllib fills in Content-Length for bytes bodies.
    req = urllib.request.Request(
        myurl,
        data=jsondataasbytes,
        headers={'Content-Type': 'application/json; charset=utf-8'},
    )
    print (jsondataasbytes)
    # The context manager closes the connection even if reading or parsing fails.
    with urllib.request.urlopen(req) as response:
        return json.loads(response.read())['predictions']

In [340]:
# # Пробный запуск Flask

# app = Flask(__name__)

# @app.route("/a")
# def hello():
#     return "Hello World!"

# if __name__ == '__main__':
#     app.run()