In [7]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris


#load the data from sklearn.datasets
data     = load_iris()


#divide the data into the input 'X' and the labels 'y'
X        = data['data'] #the observations
y        = data['target'] #the label

#load the data in a pandas dataframe
df          = pd.DataFrame(X, columns=['sepal length', 'sepal width', 'petal length', 'petal width'])
df['class'] = [data['target_names'][idx] for idx in y]

#split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split( 
                                        #TODO: enter the appropriate values to partition the data, 
                                        #and ensure your results are reproducible
                                        #and obtain stratified samples
                                                    X,              #the input features
                                                    y,              #the label
                                                    test_size=0.3,  #set aside 25% of the data as the test set
                                                    random_state=7, #reproduce the results
                                                    stratify=y      #preserve the distribution of the labels
                                        ) 
df.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [8]:
X_train = X_train.copy()
X_test  = X_test.copy()

In [11]:
# #encode the categorical feature
# encoder = OneHotEncoder(sparse=False, handle_unknown='ignore') #drop one of the encoded gender columns
# encoder.fit(X_train[['class']])
# #enc.categories_ #view the categories

# X_train['class'] = encoder.transform(X_train[['class']])
 

In [1]:
#normalize the numeric features
scaler = StandardScaler()
#scaler.fit(X_train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])

scaler.fit(X_train)


# X_train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']] = scaler.transform(X_train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]) 
X_train = scaler.transform(X_train) 


#scale the training data

NameError: name 'StandardScaler' is not defined

In [18]:
#view the prepared data
# X_train.head(10)

#initialize the classifier

svm = SVC() #the default kernel is rbf
svm.fit(X_train, y_train) #fit the data

SVC()

In [20]:
#predict the labels for the test set
y_pred   = svm.predict(X_test)

print('The predicted BMI is: {}'.format(y_pred))

The predicted BMI is: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0
 2 0 2 0 0 0 0 0]


In [21]:
#evaluate the prediction accuracy
print('The accuracy of the model is: {}'.format(svm.score(X_test, y_test)))

The accuracy of the model is: 0.26666666666666666


In [23]:
#get the classification report
target_names = ['Iris Setosa', 'Iris Virginica', 'Iris Versicolor']
print(classification_report(y_test, y_pred, target_names = target_names))

                 precision    recall  f1-score   support

    Iris Setosa       0.29      0.80      0.42        15
 Iris Virginica       0.00      0.00      0.00        15
Iris Versicolor       0.00      0.00      0.00        15

       accuracy                           0.27        45
      macro avg       0.10      0.27      0.14        45
   weighted avg       0.10      0.27      0.14        45



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
#setup the parameter grid to try values of c ranging from 1 to 10, and values for gamma ranging from 1 to 5.
params = {'C': np.arange(1, 10),
         'gamma': np.arange(1, 5)}

svm = SVC()

#initialize Grid Search with Cross Validation
svm_grid = GridSearchCV(estimator = svm, param_grid = params, cv=5)
svm_grid.fit(X_train, y_train)

print(svm_grid.best_params_) #print the best parameters
print(svm_grid.best_score_)  #print the best score

{'C': 1, 'gamma': 2}
0.9523809523809523


In [25]:
params = {'C': np.arange(1, 10),
         'gamma': np.arange(1, 5)}

svm = SVC()

#initialize Grid Search with Cross Validation and return the training score
svm_grid = GridSearchCV(estimator = svm, param_grid = params, cv=5, return_train_score = True) 
svm_grid.fit(X_train, y_train)

print(svm_grid.best_params_)
print(svm_grid.best_score_)

{'C': 1, 'gamma': 2}
0.9523809523809523


In [26]:
#display the cv results for the train and test set
cv_results = pd.DataFrame(svm_grid.cv_results_)
cv_results = cv_results.sort_values('mean_test_score', ascending=False)
cv_results[['mean_train_score', 'std_train_score', 'mean_test_score', 'std_test_score']].head()

Unnamed: 0,mean_train_score,std_train_score,mean_test_score,std_test_score
4,0.995238,0.005832,0.952381,0.030117
1,0.992857,0.005832,0.952381,0.030117
0,0.985714,0.008909,0.942857,0.035635
2,1.0,0.0,0.942857,0.035635
5,1.0,0.0,0.942857,0.035635


In [None]:
# the highest mean_train_score is 1.0 and mean_test_score is 0.952381. 
# The SVC does not appear to overfit or underfit the data. This demonstrates low bias and low variance.

