In [2]:
import boto3
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'ryan-greiner-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'predictive_analytics/Iris.csv'
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading CSV file
iris = pd.read_csv(file_content_stream)
iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
## frequency table ##
iris['Species'].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Species, dtype: int64

In [6]:
## Feature Engineering ##
iris['species_numb'] = np.where(iris['Species'] == 'Iris-virginica', 1,
                                np.where(iris['Species'] == 'Iris-versicolor', 2, 3))

In [7]:
## Define variables ##
X = iris.drop(columns = ['Id', 'Species', 'species_numb'], axis = 1)
Y = iris['species_numb']

## Split Data ##
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, stratify = Y)

In [9]:
## Scale Data ##
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Random Forest

In [14]:
## Model ##
rf = OneVsRestClassifier(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3)).fit(X_train, Y_train)

## Predictions ##
rf_pred = rf.predict_proba(X_test)

## Labels ##
rf_pred = np.argmax(rf_pred, axis = 1) + 1
print(classification_report(rf_pred, Y_test))

              precision    recall  f1-score   support

           1       1.00      0.91      0.95        11
           2       0.90      1.00      0.95         9
           3       1.00      1.00      1.00        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



In [15]:
## Model ##
svm = OneVsRestClassifier(estimator = SVC(kernel = 'rbf', C = .1, probability = True)).fit(X_train, Y_train)

## Predictions ##
svm_pred = svm.predict_proba(X_test)

## Labels ##
svm_pred = np.argmax(svm_pred, axis = 1) + 1
print(classification_report(svm_pred, Y_test))

              precision    recall  f1-score   support

           1       1.00      0.91      0.95        11
           2       0.90      1.00      0.95         9
           3       1.00      1.00      1.00        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



In [None]:
## Both models gave the same results 