# 1. 資料匯入

In [53]:
import pandas as pd
pd.set_option("display.max_columns",10) #設定pandas最多顯示出10個欄位資訊
df = pd.read_csv("StudentsPerformance.csv",encoding = "big5") #15000筆
df.head()
# 資料來源：https://www.kaggle.com/spscientist/students-performance-in-exams

Unnamed: 0,gender,race/ethnicity,parental level of education,pedu,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,college,standard,none,72,72,74
1,female,group C,some college,college,standard,completed,69,90,88
2,female,group B,master's degree,master,standard,none,90,95,93
3,male,group A,associate's degree,college,free/reduced,none,47,57,44
4,male,group C,some college,college,standard,none,76,78,75


# 2.預處理

In [54]:
#delete race
del df['race/ethnicity']
df.head()

Unnamed: 0,gender,parental level of education,pedu,lunch,test preparation course,math score,reading score,writing score
0,female,bachelor's degree,college,standard,none,72,72,74
1,female,some college,college,standard,completed,69,90,88
2,female,master's degree,master,standard,none,90,95,93
3,male,associate's degree,college,free/reduced,none,47,57,44
4,male,some college,college,standard,none,76,78,75


In [55]:
#delete parental level of education
del df['parental level of education']
df.head()

Unnamed: 0,gender,pedu,lunch,test preparation course,math score,reading score,writing score
0,female,college,standard,none,72,72,74
1,female,college,standard,completed,69,90,88
2,female,master,standard,none,90,95,93
3,male,college,free/reduced,none,47,57,44
4,male,college,standard,none,76,78,75


## One-hot Encoding

In [56]:
df_gender = pd.get_dummies(df['gender'])  # gender transform
df_gender.sample(5)

Unnamed: 0,female,male
727,0,1
876,0,1
51,0,1
878,1,0
951,1,0


In [57]:
df_lunch = pd.get_dummies(df['lunch'])  # gender transform
df_lunch.sample(5)

Unnamed: 0,free/reduced,standard
998,0,1
968,0,1
200,0,1
272,1,0
734,1,0


In [58]:
df_preparation = pd.get_dummies(df['test preparation course'])  # test preparation course transform
df_preparation.sample(5)

Unnamed: 0,completed,none
895,0,1
408,1,0
158,1,0
682,0,1
474,1,0


In [59]:
df_ml0 = pd.merge(df, df_gender,left_index=True,right_index=True)  # 合併
df_ml0.sample(5)

Unnamed: 0,gender,pedu,lunch,test preparation course,math score,reading score,writing score,female,male
920,male,high school,free/reduced,none,69,70,67,0,1
113,female,college,standard,none,51,58,54,1,0
335,female,college,free/reduced,none,61,68,66,1,0
304,female,college,standard,completed,74,75,83,1,0
340,male,high school,free/reduced,none,58,61,52,0,1


In [60]:
df_ml1 = pd.merge(df_ml0, df_lunch,left_index=True,right_index=True)  # 合併
df_ml1.sample(5)

Unnamed: 0,gender,pedu,lunch,test preparation course,math score,...,writing score,female,male,free/reduced,standard
564,male,college,free/reduced,none,48,...,46,0,1,1,0
846,male,master,standard,completed,91,...,85,0,1,0,1
927,female,high school,free/reduced,completed,65,...,71,1,0,1,0
411,male,college,standard,completed,84,...,78,0,1,0,1
33,male,college,standard,none,40,...,38,0,1,0,1


In [61]:
df_ml = pd.merge(df_ml1,df_preparation,left_index=True,right_index=True)  # 合併
df_ml.sample(5)

Unnamed: 0,gender,pedu,lunch,test preparation course,math score,...,male,free/reduced,standard,completed,none
868,male,college,free/reduced,completed,78,...,1,1,0,1,0
615,female,high school,standard,none,60,...,0,0,1,0,1
833,female,high school,standard,completed,77,...,0,0,1,1,0
382,male,master,free/reduced,none,79,...,1,1,0,0,1
379,male,college,standard,none,66,...,1,0,1,0,1


### LabelEncoding

In [62]:
df_ml['pedu_rank'] = df_ml['pedu'].replace({'high school':1,'college':2,'master':3})    # 將學歷低, 中, 高轉換成數字1, 2, 3
df_ml.sample(5)

Unnamed: 0,gender,pedu,lunch,test preparation course,math score,...,free/reduced,standard,completed,none,pedu_rank
662,female,college,free/reduced,none,55,...,1,0,0,1,2
433,female,high school,free/reduced,none,47,...,1,0,0,1,1
606,female,college,standard,none,85,...,0,1,0,1,2
182,female,high school,standard,none,50,...,0,1,0,1,1
466,female,college,free/reduced,none,26,...,1,0,0,1,2


In [63]:
from sklearn.model_selection import train_test_split

X = df_ml[['female','male','pedu_rank','free/reduced','standard','math score','reading score','writing score']]
y = df_ml['completed']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)

In [64]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# 2. 單一分類器

### 2.1 單純貝式分析

In [65]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
gnb = GaussianNB()
gnb.fit(X_train, y_train)
print(metrics.classification_report(y_test, gnb.predict(X_test)))

             precision    recall  f1-score   support

          0       0.69      0.74      0.72       188
          1       0.51      0.45      0.48       112

avg / total       0.62      0.63      0.63       300



### 2.2 Decision tree

In [66]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

tree = DecisionTreeClassifier(criterion='gini',max_depth=5)
tree.fit(X_train, y_train)
print(metrics.classification_report(y_test, tree.predict(X_test)))

             precision    recall  f1-score   support

          0       0.67      0.89      0.76       188
          1       0.58      0.25      0.35       112

avg / total       0.64      0.65      0.61       300



### 2.3 LogisticRegression

In [67]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

lr = LogisticRegression()
lr.fit(X_train, y_train)
print(metrics.classification_report(y_test, lr.predict(X_test)))

             precision    recall  f1-score   support

          0       0.72      0.86      0.78       188
          1       0.65      0.43      0.52       112

avg / total       0.69      0.70      0.68       300



### 2.4 KNN

In [68]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)
print(metrics.classification_report(y_test, knn.predict(X_test)))

             precision    recall  f1-score   support

          0       0.65      0.92      0.76       188
          1       0.57      0.18      0.27       112

avg / total       0.62      0.64      0.58       300



### 2.5 SVC

In [69]:
from sklearn.svm import SVC

svc = SVC(C=1.0, kernel="rbf")
svc.fit(X_train, y_train)
print(metrics.classification_report(y_test, svc.predict(X_test)))

             precision    recall  f1-score   support

          0       0.69      0.93      0.79       188
          1       0.71      0.30      0.42       112

avg / total       0.70      0.69      0.65       300



# 3. VotingClassifier

In [70]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

clf1 = DecisionTreeClassifier(max_depth=5)
clf2 = KNeighborsClassifier(n_neighbors=2)
clf3 = SVC(kernel='rbf',probability=True) #probability要設成True(同時計算每個分類的機率)，classification_report才能work

eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2),('svc', clf3)], voting='soft', weights=[3, 1, 1])  # weights可以調整
eclf.fit(X_train, y_train)
print(metrics.classification_report(y_test, eclf.predict(X_test)))

             precision    recall  f1-score   support

          0       0.69      0.86      0.76       188
          1       0.59      0.34      0.43       112

avg / total       0.65      0.67      0.64       300



  if diff:


# 4. Bagging

#### OOB

In [71]:
from sklearn.ensemble import BaggingClassifier

bagc = BaggingClassifier(n_estimators=100, oob_score=True)   # 100顆tree
bagc.fit(X,y)
print("oob_score(accuary):",bagc.oob_score_)

oob_score(accuary): 0.676


In [72]:
from sklearn.ensemble import BaggingClassifier

bagc = BaggingClassifier(n_estimators=100)
bagc.fit(X_train, y_train)
print(metrics.classification_report(y_test, bagc.predict(X_test)))

             precision    recall  f1-score   support

          0       0.72      0.82      0.77       188
          1       0.61      0.46      0.53       112

avg / total       0.68      0.69      0.68       300



# 5. 隨機森林(Random Forest)

#### OOB

In [73]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, criterion='gini', max_features='auto', oob_score=True)
rfc.fit(X,y)
print("oob_score(accuary):",rfc.oob_score_)

oob_score(accuary): 0.667


In [74]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
print(metrics.classification_report(y_test, rfc.predict(X_test)))

             precision    recall  f1-score   support

          0       0.70      0.87      0.78       188
          1       0.63      0.37      0.46       112

avg / total       0.67      0.68      0.66       300



# 6. AdaBoost

In [75]:
from sklearn.ensemble import AdaBoostClassifier
adb = AdaBoostClassifier(n_estimators=100)
adb.fit(X_train, y_train)
print(metrics.classification_report(y_test, adb.predict(X_test)))

             precision    recall  f1-score   support

          0       0.70      0.84      0.77       188
          1       0.60      0.40      0.48       112

avg / total       0.66      0.68      0.66       300



## 7. Stacking

需要安裝 mlxtend: 
請安裝套件 pip install mlxtend

website: http://rasbt.github.io/mlxtend/

StackingClassifier: http://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/

StackingRegressor: http://rasbt.github.io/mlxtend/user_guide/regressor/StackingRegressor/

In [76]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier

clf1 = KNeighborsClassifier()
clf2 = RandomForestClassifier()
clf3 = GaussianNB()
clf4 = LogisticRegression()
meta_clf = SVC()
stacking_clf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4], meta_classifier=meta_clf)

clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train, y_train)
clf4.fit(X_train, y_train)
stacking_clf.fit(X_train, y_train)

print('RNN Score:',clf1.score(X_test, y_test))
print('RF Score:',clf2.score(X_test, y_test))
print('GNB Score:',clf3.score(X_test, y_test))
print('Logistic Score:',clf4.score(X_test, y_test))
print('Stacking Score:',stacking_clf.score(X_test, y_test))

RNN Score: 0.6866666666666666
RF Score: 0.67
GNB Score: 0.6333333333333333
Logistic Score: 0.7
Stacking Score: 0.6533333333333333


## 8. XGBoost

需要安裝XGBoost:
請安裝套件 conda install -c anaconda py-xgboost

In [77]:

import xgboost as xgb
xgbc = xgb.XGBClassifier()   # 若是迴歸問題, 則是xgbr = xgb.XGBRegressor()
xgbc.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [78]:
xgbc.score(X_test, y_test)

  if diff:


0.71