In [None]:
import pandas as pd
import numpy as np
%matplotlib inline 
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [None]:
df = pd.read_csv("/content/datauntukklasifikasi.csv",sep=",")
df = df.drop(columns =['Unnamed: 0'])
x = df.drop(columns = ["lulus_4_tahun"])
y = df["lulus_4_tahun"]

#Handling Imbalance Data

In [None]:
#Random Over Sampling
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

x = df.drop(columns = ["lulus_4_tahun"]).values
y = df["lulus_4_tahun"].values

ros = RandomOverSampler(random_state=0)
x_ros, y_ros = ros.fit_resample(x,y)

print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_ros))

Original dataset shape Counter({0: 589, 1: 181})
Resample dataset shape Counter({0: 589, 1: 589})


In [None]:
#Random Under Sampling
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

x = df.drop(columns = ["lulus_4_tahun"]).values
y = df["lulus_4_tahun"].values

rus = RandomUnderSampler(random_state=0)
x_rus, y_rus = rus.fit_resample(x,y)

print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_rus))

Original dataset shape Counter({0: 589, 1: 181})
Resample dataset shape Counter({0: 181, 1: 181})


In [None]:
#SMOTE
from imblearn.over_sampling import SMOTE, ADASYN

smote = SMOTE()

x = df.drop(columns = ["lulus_4_tahun"]).values
y = df["lulus_4_tahun"].values

# fit predictor and target variable
x_smote, y_smote = smote.fit_resample(x, y)

print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_smote))

Original dataset shape Counter({0: 589, 1: 181})
Resample dataset shape Counter({0: 589, 1: 589})


#Check Classification Model ( without tuning)

In [None]:
# imbalance data
models = [GaussianNB(), DecisionTreeClassifier(), SVC(),RandomForestClassifier(),LogisticRegression()]
names = ["Naive Bayes", "Decision Tree", "SVM","Random Forest","Logistic Regression"]
for model, name in zip(models, names):
    print(name)
    for score in ["accuracy","precision","recall","f1","roc_auc"]:
        print(score, ":", cross_val_score(model, x, y,scoring=score, cv=10).mean())
    print(" ")

Naive Bayes
accuracy : 0.8883116883116884
precision : 0.7274947760219408
recall : 0.9777777777777779
f1 : 0.8219303065017349
roc_auc : 0.9828625235404896
 
Decision Tree
accuracy : 1.0
precision : 1.0
recall : 1.0
f1 : 1.0
roc_auc : 1.0
 
SVM
accuracy : 0.7415584415584415


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


precision : 0.08535885167464115
recall : 0.15555555555555556
f1 : 0.10336912254720473
roc_auc : 0.9289077212806027
 
Random Forest
accuracy : 1.0
precision : 1.0
recall : 1.0
f1 : 1.0
roc_auc : 1.0
 
Logistic Regression
accuracy : 0.964935064935065
precision : 0.8992048670062252
recall : 1.0
f1 : 0.9402367218908573
roc_auc : 0.9847457627118643
 


In [None]:
#ros
models = [GaussianNB(), DecisionTreeClassifier(), SVC(),RandomForestClassifier(),LogisticRegression()]
names = ["Naive Bayes", "Decision Tree", "SVM","Random Forest","Logistic Regression"]
for model, name in zip(models, names):
    print(name)
    for score in ["accuracy","precision","recall","f1","roc_auc"]:
        print(score, ":", cross_val_score(model, x_ros, y_ros,scoring=score, cv=10).mean())
    print(" ")

Naive Bayes
accuracy : 0.9313414457482254
precision : 0.8882181264993649
recall : 0.9983050847457626
f1 : 0.9379931032063947
roc_auc : 0.9928939365422144
 
Decision Tree
accuracy : 1.0
precision : 1.0
recall : 1.0
f1 : 1.0
roc_auc : 1.0
 
SVM
accuracy : 0.8280819933362306
precision : 0.8067280746539597
recall : 1.0
f1 : 0.876990728854657
roc_auc : 0.9383560015453348
 
Random Forest
accuracy : 1.0
precision : 1.0
recall : 1.0
f1 : 1.0
roc_auc : 1.0
 
Logistic Regression
accuracy : 0.9771041576126323
precision : 0.9604022132146364
recall : 1.0
f1 : 0.978787681914359
roc_auc : 0.9850043091065785
 


In [None]:
#RUS
models = [GaussianNB(), DecisionTreeClassifier(), SVC(C=1,gamma =1,kernel='rbf'),RandomForestClassifier(),LogisticRegression()]
names = ["Naive Bayes", "Decision Tree", "SVM","Random Forest","Logistic Regression"]
for model, name in zip(models, names):
    print(name)
    for score in ["accuracy","precision","recall","f1","roc_auc"]:
        print(score, ":", cross_val_score(model, x_rus, y_rus,scoring=score, cv=10).mean())
    print(" ")

In [None]:
#SMOTE
models = [GaussianNB(), DecisionTreeClassifier(), SVC(C=1,gamma =1,kernel='rbf'),RandomForestClassifier(),LogisticRegression()]
names = ["Naive Bayes", "Decision Tree", "SVM","Random Forest","Logistic Regression"]
for model, name in zip(models, names):
    print(name)
    for score in ["accuracy","precision","recall","f1","roc_auc"]:
        print(score, ":", cross_val_score(model, x_smote, y_smote,scoring=score, cv=10).mean())
    print(" ")

Naive Bayes
accuracy : 0.929646530493988
precision : 0.8894346499523114
recall : 0.9915254237288135
f1 : 0.9358626536216578
roc_auc : 0.9929811092729992
 
Decision Tree
accuracy : 1.0
precision : 1.0
recall : 1.0
f1 : 1.0
roc_auc : 1.0
 
SVM
accuracy : 0.9991525423728813
precision : 1.0
recall : 0.9983050847457626
f1 : 0.9991452991452991
roc_auc : 1.0
 
Random Forest
accuracy : 1.0
precision : 1.0
recall : 1.0
f1 : 1.0
roc_auc : 1.0
 
Logistic Regression
accuracy : 0.9771041576126323
precision : 0.9604022132146364
recall : 1.0
f1 : 0.978787681914359
roc_auc : 0.9855214018960069
 


### since from this evaluation we found out that SMOTE present us a higher result, thus we are continueing using x_smote and y_smote as the variable in the next step

#Identify the Best Parameter ( Hyperparameter Tuning )

## Hyperparameter Tuning for Naive Bayes

In [None]:
a = np.logspace(0,-9, num=10)

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold

cv_method = RepeatedStratifiedKFold(n_splits=5, 
                                    n_repeats=3, 
                                    random_state=999)

In [None]:
from sklearn.preprocessing import PowerTransformer
naivebayes = GaussianNB()
naivebayes.fit(x_smote,y_smote)

params_NB = {'var_smoothing': a}

gs_NB = GridSearchCV(naivebayes,param_grid=params_NB, cv=cv_method,verbose=1, scoring='accuracy')

#Data_transformed = PowerTransformer().fit_transform(x_smote)

gs_NB.fit(x_smote, y_smote)

Fitting 15 folds for each of 10 candidates, totalling 150 fits


GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=999),
             estimator=GaussianNB(),
             param_grid={'var_smoothing': array([1.e+00, 1.e-01, 1.e-02, 1.e-03, 1.e-04, 1.e-05, 1.e-06, 1.e-07,
       1.e-08, 1.e-09])},
             scoring='accuracy', verbose=1)

In [None]:
gs_NB.best_params_

{'var_smoothing': 1e-06}

## Hyperparameter Tuning for Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeRegressor
reg_decision_model=DecisionTreeRegressor()
reg_decision_model.fit(x_smote,y_smote)

parameters={"splitter":["best","random"],
            "max_depth" : [1,3,5],
            "min_samples_split":[2,3,4,5],
            "min_samples_leaf":[1,2,3],
            "min_weight_fraction_leaf":[0.0,0.1,0.2,0.3],
            "min_impurity_decrease":[0.0,0.1,0.2,0.3,0.5],
            "ccp_alpha":[0.0,0.1,0.2,0.3,0.5]}
tuning_model=GridSearchCV(reg_decision_model,param_grid=parameters,scoring='neg_mean_squared_error',cv=3,verbose=3)
tuning_model.fit(x_smote,y_smote)
tuning_model.best_params_

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[CV 2/3] END ccp_alpha=0.3, max_depth=5, min_impurity_decrease=0.2, min_samples_leaf=2, min_samples_split=5, min_weight_fraction_leaf=0.2, splitter=random;, score=-0.254 total time=   0.0s
[CV 3/3] END ccp_alpha=0.3, max_depth=5, min_impurity_decrease=0.2, min_samples_leaf=2, min_samples_split=5, min_weight_fraction_leaf=0.2, splitter=random;, score=-0.562 total time=   0.0s
[CV 1/3] END ccp_alpha=0.3, max_depth=5, min_impurity_decrease=0.2, min_samples_leaf=2, min_samples_split=5, min_weight_fraction_leaf=0.3, splitter=best;, score=-0.491 total time=   0.0s
[CV 2/3] END ccp_alpha=0.3, max_depth=5, min_impurity_decrease=0.2, min_samples_leaf=2, min_samples_split=5, min_weight_fraction_leaf=0.3, splitter=best;, score=-0.254 total time=   0.0s
[CV 3/3] END ccp_alpha=0.3, max_depth=5, min_impurity_decrease=0.2, min_samples_leaf=2, min_samples_split=5, min_weight_fraction_leaf=0.3, splitter=best;, score=-0.562 total time=   0

{'ccp_alpha': 0.0,
 'max_depth': 3,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'splitter': 'best'}

In [None]:
tuning_model.best_params_

{'ccp_alpha': 0.0,
 'max_depth': 3,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'splitter': 'best'}

## Hypeparameter Tuning SVM

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [1, 10, 100, 1000],
			'gamma': [0.1,0.2,0.3,0.4,0.5,1],
			'kernel': ['rbf','linear']}

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

grid.fit(x_smote, y_smote)

Fitting 5 folds for each of 104 candidates, totalling 520 fits
[CV 1/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.996 total time=   0.0s
[CV 2/5] END ........C=1, gamma=0.1, kernel=rbf;, score=1.000 total time=   0.0s
[CV 3/5] END ........C=1, gamma=0.1, kernel=rbf;, score=1.000 total time=   0.0s
[CV 4/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.974 total time=   0.0s
[CV 5/5] END ........C=1, gamma=0.1, kernel=rbf;, score=1.000 total time=   0.0s
[CV 1/5] END .....C=1, gamma=0.1, kernel=linear;, score=0.941 total time=   0.1s
[CV 2/5] END .....C=1, gamma=0.1, kernel=linear;, score=0.992 total time=   0.0s
[CV 3/5] END .....C=1, gamma=0.1, kernel=linear;, score=1.000 total time=   0.0s
[CV 4/5] END .....C=1, gamma=0.1, kernel=linear;, score=0.962 total time=   0.0s
[CV 5/5] END .....C=1, gamma=0.1, kernel=linear;, score=0.991 total time=   0.1s
[CV 1/5] END ........C=1, gamma=0.2, kernel=rbf;, score=0.996 total time=   0.0s
[CV 2/5] END ........C=1, gamma=0.2, kernel=rb

In [None]:
# best parameter after tuning
print(grid.best_params_)
 
# SVM model after tuning
print(grid.best_estimator_)

## Hyperparameter Tuning for Random Forest Classifier

In [None]:
# Define Parameters
max_depth=[2, 8, 16]
n_estimators = [64, 128, 256]
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators)

# Build the grid search
dfrst = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
grid = GridSearchCV(estimator=dfrst, param_grid=param_grid, cv = 5)
grid_results = grid.fit(x_smote, y_smote)

# Summarize the results in a readable format
print("Best: {0}, using {1}".format(grid_results.cv_results_['mean_test_score'], grid_results.best_params_))
results_df = pd.DataFrame(grid_results.cv_results_)
results_df

Best: [0.99220779 0.9987013  0.9974026  1.         0.9987013  0.9987013
 0.9987013  1.         1.        ], using {'max_depth': 8, 'n_estimators': 64}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.090719,0.004398,0.009036,0.000289,2,64,"{'max_depth': 2, 'n_estimators': 64}",0.974026,1.0,1.0,0.987013,1.0,0.992208,0.01039,9
1,0.183003,0.005738,0.018218,0.003604,2,128,"{'max_depth': 2, 'n_estimators': 128}",0.993506,1.0,1.0,1.0,1.0,0.998701,0.002597,4
2,0.358089,0.00434,0.031303,0.002829,2,256,"{'max_depth': 2, 'n_estimators': 256}",1.0,0.987013,1.0,1.0,1.0,0.997403,0.005195,8
3,0.091913,0.003265,0.009552,0.000325,8,64,"{'max_depth': 8, 'n_estimators': 64}",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
4,0.180164,0.00722,0.016201,0.000955,8,128,"{'max_depth': 8, 'n_estimators': 128}",1.0,1.0,1.0,1.0,0.993506,0.998701,0.002597,4
5,0.348188,0.009472,0.032422,0.004237,8,256,"{'max_depth': 8, 'n_estimators': 256}",1.0,1.0,1.0,1.0,0.993506,0.998701,0.002597,4
6,0.090283,0.00205,0.009087,0.000158,16,64,"{'max_depth': 16, 'n_estimators': 64}",1.0,1.0,1.0,1.0,0.993506,0.998701,0.002597,4
7,0.179471,0.004569,0.015706,0.0003,16,128,"{'max_depth': 16, 'n_estimators': 128}",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1
8,0.356757,0.003369,0.03265,0.004143,16,256,"{'max_depth': 16, 'n_estimators': 256}",1.0,1.0,1.0,1.0,1.0,1.0,0.0,1


## Hyperparameter Tuning for Log Regression

In [None]:
from sklearn.model_selection import GridSearchCV

model = LogisticRegression()

# define parameter values
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty = ['none', 'l1', 'l2', 'elasticnet']
c_values = [100, 10, 1.0, 0.1, 0.001]

# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=5, scoring='accuracy', error_score=0)
grid_result = grid_search.fit(x_smote, y_smote)

# summarize results
print(f"Best score on training data: {grid_result.best_score_} using {grid_result.best_params_}")

#Checking the best classification model ( after tuning )

In [None]:
models = [GaussianNB(var_smoothing = 1e-05), DecisionTreeClassifier(ccp_alpha= 0.0, max_depth = 3, min_impurity_decrease = 0.0,min_samples_leaf = 1,min_samples_split = 2, min_weight_fraction_leaf= 0.0, splitter ='best'), SVC(C=1,gamma =0.2,kernel='rbf'),RandomForestClassifier(max_depth =  2, n_estimators = 64),LogisticRegression(C =100, penalty =  'none', solver = 'lbfgs')]
names = ["Naive Bayes", "Decision Tree", "SVM","Random Forest","Logistic Regression"]
for model, name in zip(models, names):
    print(name)
    for score in ["accuracy","precision","recall","f1","roc_auc"]:
        print(score, ":", cross_val_score(model, x_smote, y_smote,scoring=score, cv=10).mean())
    print(" ")

Naive Bayes
accuracy : 0.9491163262349703
precision : 0.9393917436411409
recall : 0.9694623027469316
f1 : 0.9521685067112353
roc_auc : 0.9932971104220943
 
Decision Tree
accuracy : 1.0
precision : 1.0
recall : 1.0
f1 : 1.0
roc_auc : 1.0
 
SVM
accuracy : 0.9991525423728813
precision : 1.0
recall : 0.9983050847457626
f1 : 0.9991452991452991
roc_auc : 1.0
 
Random Forest
accuracy : 1.0
precision : 1.0
recall : 1.0
f1 : 0.9913385826771653
roc_auc : 1.0
 
Logistic Regression


  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ign

accuracy : 0.9991452991452991


  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"


precision : 0.9983050847457626


  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"


recall : 1.0


  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"


f1 : 0.9991452991452991
roc_auc : 1.0
 


  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "Setting penalty='none' will ignore the C and l1_ratio parameters"


# Classifier Based on User Input

In [None]:
import pandas as pd

random = RandomForestClassifier(max_depth =  2, n_estimators = 64)
random.fit(x_smote,y_smote)
tree = DecisionTreeClassifier(ccp_alpha= 0.0, max_depth = 3, min_impurity_decrease = 0.0,min_samples_leaf = 1,min_samples_split = 2, min_weight_fraction_leaf= 0.0, splitter ='best')
tree.fit(x_smote, y_smote)
svm = SVC(C=1,gamma =1,kernel='rbf')
svm.fit(x_smote,y_smote)

print("Hey, ini adalah CHATEKSADA Classifier")
print("Classifier ini berbasis model klasifikasi Decison Tree, Random Forest, dan juga Support Machine Vector")
print(" ")
#cek = int(input("Jumlah pengecekan:"))
userinput = pd.DataFrame(index = range(1),columns = ["Nama","SKS","IPK","dum_grade"])


Nama = input("Nama: ",)
userinput.loc[0,"Nama"] = Nama
SKS = input("SKS: ",)
userinput.loc[0,"SKS"] = SKS
IPK = input("IPK: ",)
userinput.loc[0,"IPK"] = IPK
Grade = input("Jumlah nilai D/E/F: ",)
userinput.loc[0,"dum_grade"] = Grade

data = userinput.copy()
data = data.drop(columns =['Nama'])
randompredict = random.predict(data)
treepredict = tree.predict(data)
svmpredict = svm.predict(data)

print(" ")

if svmpredict == 1 and treepredict == 1 and randompredict == 1:
  print(userinput.loc[0,'Nama'],"telah mengambil",userinput.loc[0,'SKS'],"SKS, dengan IPK",userinput.loc[0,'IPK'],",dan tidak ada nilai dibawah C, maka diprediksi LULUS DALAM 4 TAHUN / KURANG")
else:
  print(userinput.loc[0,'Nama'],"diprediksi LULUS LEBIH DARI 4 TAHUN dikarenakan tidak memenuhi kriteria kelulusan")

Hey, ini adalah CHATEKSADA Classifier
Classifier ini berbasis model klasifikasi Decison Tree, Random Forest, dan juga Support Machine Vector
 
Nama: e
SKS: 145
IPK: 4
Jumlah nilai D/E/F: 3
 
e diprediksi LULUS LEBIH DARI 4 TAHUN dikarenakan tidak memenuhi kriteria kelulusan


  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
