## Prepare and Train

In [19]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd

### Label Encoding
Assign an integer for each class.

In [15]:
df = pd.read_csv("final_Iris.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,4,5,5.0,3.6,1.4,0.2,Iris-setosa


Remove unnecessary columns.

In [16]:
df.drop(["Unnamed: 0","Id"],axis=1,inplace=True)
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [18]:
le = LabelEncoder()
df["Species"] = le.fit_transform(df["Species"])

### Split dataset into train and test

In [20]:
# Retrieve particular value with ".iloc".
# Species contains classes values therefore don't include it.
# Give classes as second variable of function.
X_train, X_test, Y_train, Y_test = train_test_split(df.iloc[:,:-1],df.iloc[:,-1],test_size=0.2,random_state=0) 

In [21]:
Y_train.value_counts()

2    44
0    39
1    37
Name: Species, dtype: int64

In [22]:
Y_test.value_counts()

1    13
0    11
2     6
Name: Species, dtype: int64

## Create Model

In [23]:
import xgboost as xgb

In [24]:
# Adjust objective according to the task.
# This is multiple classification with softmax. Some objectives:
# "reg:logistic" is logistic regression.
# "binary:logistic" is logistic regression for binary classification, the output is probability.
classifier = xgb.XGBClassifier(objective="multiclass:softmax",num_class=3)

### Train the model

In [25]:
classifier.fit(X_train,Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_class=3, num_parallel_tree=1,
              objective='multi:softprob', predictor='auto', random_state=0, ...)

### Get predictions from test set

In [26]:
predictions = classifier.predict(X_test)
predictions

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0], dtype=int64)

In [28]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [29]:
accuracy_score(Y_test,predictions)

1.0

In [30]:
confusion_matrix(Y_test,predictions)

array([[11,  0,  0],
       [ 0, 13,  0],
       [ 0,  0,  6]], dtype=int64)