# Train a Simple Regression Model with SKLearn (Locally)
#### Use the Iris dataset to predict Sepal Length for Flowers

In [89]:
# Import Dataset from sklearn
from sklearn.datasets import load_iris# Load Iris Data
import pandas as pd

# Fetch the Iris bunch, then wrap its feature matrix and its integer class
# labels in two separate DataFrames.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
target_df = pd.DataFrame(iris.target, columns=['species'])

def converter(specie):
    """Translate an Iris class code into its species name.

    A value equal to 0 yields 'setosa' and a value equal to 1 yields
    'versicolor'; any other value falls through to 'virginica'.
    """
    # Checked in order, mirroring a two-branch comparison chain.
    for code, name in ((0, 'setosa'), (1, 'versicolor')):
        if specie == code:
            return name
    return 'virginica'
    
# Swap the integer class codes for their species names.
target_df['species'] = target_df['species'].apply(converter)

# Place the species column next to the measurement columns and preview the
# first rows of the combined frame.
iris_df = pd.concat(objs=[iris_df, target_df], axis='columns')
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


#### Use a regression model because the target (sepal length) is a continuous, real-valued output. Think of simple linear regression from statistics.

In [90]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Swap the string species labels back to the integer class codes so that
# every column in the frame is numeric.
iris_df = iris_df.drop(columns='species')
target_df = pd.DataFrame(iris.target, columns=['species'])
iris_df = pd.concat([iris_df, target_df], axis=1)

# Sepal length is the regression target; every remaining column is a feature.
y = iris_df['sepal length (cm)']
X = iris_df.drop(columns='sepal length (cm)')

# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=101
)

# Create the estimator and fit it on the training split. The fit call stays
# the cell's last expression so the fitted estimator is displayed.
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

#### Predict Sepal Length on one data point

In [92]:
# Row 6 is the reference sample. Keep it in a variable so the "actual" value
# printed below is read from the data instead of being typed in by hand.
# (A bare `iris_df.loc[6]` in the middle of a cell displays nothing.)
sample_row = iris_df.loc[6]

# Feature values for the sample, spelled out explicitly; the literals are
# meant to mirror row 6. Every value is a one-element list so each column
# contributes exactly one row.
d = {'sepal width (cm)': [3.4],
     'petal length (cm)': [1.4],
     'petal width (cm)': [0.3],
     'species': [0]}

test_df = pd.DataFrame(data=d)
pred = lr.predict(test_df)

print('Predicted Sepal Length (cm):', pred[0])
print('Actual Sepal Length (cm):', sample_row['sepal length (cm)'])

Predicted Sepal Length (cm): 4.88749921150266
Actual Sepal Length (cm): 4.6


### Persist model in .pkl format (Serialization)

In [93]:
import os
import pickle

# Serialize the fitted estimator to disk. The context manager guarantees the
# file is flushed and closed even if pickling raises.
filename = "linear-regression-model.pkl"
with open(filename, 'wb') as model_file:
    pickle.dump(lr, model_file)

print('Model Persisted.')

Model Persisted.


### Load persisted model into memory and make a prediction (Deserialization)

In [96]:
# Read the pickled estimator back from disk; the context manager closes the
# file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open(filename, 'rb') as model_file:
    load_lr_model = pickle.load(model_file)
print('Model loaded into memory.')

Model loaded into memory.


In [95]:
# Row 6 is the reference sample. Keep it in a variable so the "actual" value
# printed below is read from the data instead of being typed in by hand.
# (A bare `iris_df.loc[6]` in the middle of a cell displays nothing.)
sample_row = iris_df.loc[6]

# Feature values for the sample, spelled out explicitly; the literals are
# meant to mirror row 6. Every value is a one-element list so each column
# contributes exactly one row.
d = {'sepal width (cm)': [3.4],
     'petal length (cm)': [1.4],
     'petal width (cm)': [0.3],
     'species': [0]}

# Predict with the estimator that was reloaded from the pickle file.
test_df = pd.DataFrame(data=d)
pred = load_lr_model.predict(test_df)

print('Predicted Sepal Length (cm):', pred[0])
print('Actual Sepal Length (cm):', sample_row['sepal length (cm)'])

Predicted Sepal Length (cm): 4.88749921150266
Actual Sepal Length (cm): 4.6


In [120]:
# The same sample passed as a bare nested list instead of a DataFrame: one
# row, with the values in the column order used for training
# (sepal width, petal length, petal width, species code).
test_array = [
    [3.4, 1.4, 0.3, 0],
]
pred = load_lr_model.predict(test_array)

print('Predicted Sepal Length (cm):', pred[0])
print('Actual Sepal Length (cm):', 4.6)

Predicted Sepal Length (cm): 4.88749921150266
Actual Sepal Length (cm): 4.6


In [121]:
# Bundle the pickled model into a gzip-compressed tar archive named
# model.tar.gz; -v echoes each file as it is added.
!tar -czvf model.tar.gz ./linear-regression-model.pkl

./linear-regression-model.pkl
