In [1]:
import pandas as pd
import requests
import io
import numpy as np

#觀察資料

In [2]:
url = "https://github.com/jerrywu2013/Tensorflow_Data/raw/master/pima-indians-diabetes.csv"
s = requests.get(url).content
diabetes = pd.read_csv(io.StringIO(s.decode('utf-8')))

In [3]:
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Number_pregnant        768 non-null    int64  
 1   Glucose_concentration  768 non-null    float64
 2   Blood_pressure         768 non-null    float64
 3   Triceps                768 non-null    float64
 4   Insulin                768 non-null    float64
 5   BMI                    768 non-null    float64
 6   Pedigree               768 non-null    float64
 7   Age                    768 non-null    int64  
 8   Class                  768 non-null    int64  
 9   Group                  768 non-null    object 
dtypes: float64(6), int64(3), object(1)
memory usage: 60.1+ KB


In [4]:
diabetes.head()

Unnamed: 0,Number_pregnant,Glucose_concentration,Blood_pressure,Triceps,Insulin,BMI,Pedigree,Age,Class,Group
0,6,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,50,1,B
1,1,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,31,0,C
2,8,0.919598,0.52459,0.0,0.0,0.347243,0.253629,32,1,B
3,1,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,21,0,B
4,0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,33,1,C


In [5]:
#懷孕次數(Number_pregnant)
#口服葡萄糖耐量試驗中血漿葡萄糖濃度為2小時(Glucose_concentration)
#舒張壓(Blood_pressure (mm Hg))
#三頭肌皮膚褶皺厚度(Triceps (mm))
#2小時血清胰島素(Insulin (mu U/ml))
#體重指數（體重kg /（身高m）^ 2）(BMI (weight in kg/(height in m)^2))
#糖尿病家族函數(Pedigree)
#年齡(Age (years))
#分類(Class variable (0 or 1)) - 1為陽性 0為陰性
#群組(A,B,C)

#資料清理

In [6]:
cols_to_norm = ['Number_pregnant', 'Glucose_concentration', 'Blood_pressure', 'Triceps',
       'Insulin', 'BMI', 'Pedigree']

In [7]:
diabetes[cols_to_norm] = diabetes[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [8]:
diabetes.head()

Unnamed: 0,Number_pregnant,Glucose_concentration,Blood_pressure,Triceps,Insulin,BMI,Pedigree,Age,Class,Group
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,50,1,B
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,31,0,C
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,32,1,B
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,21,0,B
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,33,1,C


#模型函式

In [9]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [10]:
cols_to_norm = ['Number_pregnant', 'Glucose_concentration', 'Blood_pressure', 'Triceps', 'Insulin', 'BMI', 'Pedigree','Age']

In [11]:
df_data = pd.DataFrame(diabetes[cols_to_norm])
df_target = pd.DataFrame(diabetes['Class'])
df = pd.concat([df_data,df_target],axis=1)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df_data, df_target,
                                                    test_size = 0.3,
                                                    random_state=42)

In [13]:
knn = KNeighborsClassifier()
knn.fit(X_train,y_train.values.ravel())
knn.score(X_train,y_train)

0.7914338919925512

In [14]:
knn.predict(X_test)

array([1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1], dtype=int64)

In [15]:
knn_score = knn.score(X_train,y_train)

In [16]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
#決策樹

In [17]:
dtc = tree.DecisionTreeClassifier()
dtc.fit(X_train, y_train.values.ravel())
dtc.score(X_train,y_train)

1.0

In [18]:
dtc_score = dtc.score(X_train,y_train)

In [19]:
from sklearn.ensemble import RandomForestClassifier
#隨機森林

In [20]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train.values.ravel())
rf.score(X_train,y_train)

1.0

In [21]:
rf_score = rf.score(X_train,y_train)

In [22]:
from sklearn.naive_bayes import GaussianNB
#高斯貝氏分類器

In [23]:
gnb = GaussianNB()
gnb.fit(X_train, y_train.values.ravel())
gnb.score(X_train,y_train)

0.7672253258845437

In [24]:
gnb_score = gnb.score(X_train,y_train)

In [25]:
from sklearn.neural_network import MLPClassifier
#多層感知神經網路分類器

In [26]:
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20, 8), random_state=1)
mlp.fit(X_train, y_train.values.ravel())
mlp.score(X_train,y_train)

0.7821229050279329

In [27]:
mlp_score = mlp.score(X_train,y_train)

In [28]:
from sklearn.linear_model import LogisticRegression
#羅吉斯迴歸

In [29]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train.values.ravel())
logreg.score(X_train,y_train)

0.7616387337057728

In [30]:
logreg_score = logreg.score(X_train,y_train)

In [31]:
from sklearn.svm import SVC
#支援向量機

In [32]:
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train.values.ravel())
svm_classifier.score(X_train,y_train)

0.6797020484171322

In [33]:
svm_score = svm_classifier.score(X_train,y_train)

In [34]:
print("決策樹:",dtc_score,"\n隨機森林:",rf_score,"\n單純貝氏分類器:",
      gnb_score,"\n類神經網路;",mlp_score,"\n羅吉斯迴歸:",logreg_score,'\nK-近鄰演算法:',knn_score,"\n支援向量機:",svm_score)

決策樹: 1.0 
隨機森林: 1.0 
單純貝氏分類器: 0.7672253258845437 
類神經網路; 0.7821229050279329 
羅吉斯迴歸: 0.7616387337057728 
K-近鄰演算法: 0.7914338919925512 
支援向量機: 0.6797020484171322
