## Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
import warnings
warnings.filterwarnings('ignore')

## Voted Classifier Function

In [137]:
def votedclf(X,y,models=[]):
    X_train,X_test, y_train, y_test= train_test_split(X,y,test_size=0.25)
    tm=[]
    
    for model in models:
        print("Using: ", model)
        model.fit(X_train, y_train)
        tm.append(model)
        print("Train Score:",model.score(X,y),"\nTest Score:" ,model.score(X_test,y_test),'\n\n')
        
    print("Voted Accuracies:")
    for data in [[X_train,y_train],[X_test,y_test]]: 
        predictions=[]
        polled_preds=[]

        for i in range(data[0].shape[0]):
            preds=[]
            for model in tm:
                preds.append(model.predict([data[0][i]])[0])
            predictions.append(preds)

        for pred in predictions:
            count_0=0
            count_1=0
            for el in pred:
                if el==0:
                    count_0=count_0+1
                else:
                    count_1=count_1+1

            if count_0>count_1:
                polled_preds.append(0)
            else:
                polled_preds.append(1)

        polled_preds=np.array(polled_preds)

        diff=data[1]-polled_preds

        wrong=0

        for el in diff:
            if el!=0:
                wrong=wrong+1
        acc=(data[1].shape[0]-wrong)/data[1].shape[0]  
        
        print(acc)
    
    
    return tm
                

    

## Loading Data

In [3]:
data=pd.read_csv("./Data/data_newFeatures.csv")

In [7]:
data.head()

Unnamed: 0,G<105,P<4,BP<48,ST<12,IN<50,BMI<21,G<125_IN<125,G<105_AGE<30,G<105_BMI<30,G<100_BP<48,...,P/A,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.032258,1.0,85.0,66.0,29.0,88.5,26.6,0.351,31.0,0
1,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.047619,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.166667,5.0,116.0,74.0,20.0,111.0,25.6,0.201,30.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.344828,10.0,115.0,65.0,39.0,130.0,35.3,0.134,29.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.133333,4.0,110.0,92.0,40.5,119.0,37.6,0.191,30.0,0


### Target Separation

In [12]:
target=data.iloc[:,-1].values

In [130]:
models=[RandomForestClassifier(n_estimators=50),LogisticRegression(C=10000), KNeighborsClassifier(n_neighbors=5), SVC(C=100000, degree=4)]

### Testing with classic data

In [138]:
votedclf(data.iloc[:,:-1].values, target, models)

Using:  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Train Score: 0.9440104166666666 
Test Score: 0.7760416666666666 


Using:  LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Train Score: 0.7721354166666666 
Test Score

[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=50,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False),
 LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=N

## Data Scaling 

In [139]:
data_scaled=MinMaxScaler((0,1)).fit_transform(data.iloc[:,:-1].values)

In [140]:
votedclf(data_scaled, target, models)

Using:  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Train Score: 0.94140625 
Test Score: 0.765625 


Using:  LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Train Score: 0.7708333333333334 
Test Score: 0.72916666666666

[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=50,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False),
 LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=N

## Train Test Split

In [232]:
X_train, X_test, y_train, y_test= train_test_split(data.iloc[:,:-1].values, data.iloc[:,-1].values, test_size=0.25)

## Scaler Object

In [233]:
scaler=MinMaxScaler((0,1))

In [239]:
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)

### KNN

In [240]:
clf=LogisticRegression(penalty='none',max_iter=100000)
clf.fit(X_train,y_train)
print(clf.score(X_test,y_test),clf.score(X_train,y_train))

0.7916666666666666 0.7881944444444444


## Exporting Models

In [238]:
import joblib

In [242]:
joblib.dump(scaler,"./Models/scaler.model")
joblib.dump(clf,"./Models/clf.model")

['./Models/clf.model']