<a href="https://colab.research.google.com/github/sadullahmath/Exercise/blob/master/Train_and_Deploy_an_Income_Predictor_Model_Using_Flask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Train and Deploy an Income Predictor Model Using Flask

In [0]:
import pandas as pd
import joblib
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [0]:
#Import the pandas, pickle, joblib, and RandomForestClassifier packages from sklearn.ensemble, as well as train_test_split from sklearn.model_selection:

In [0]:
file_url = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter18/Dataset/phpMawTba.csv'

In [0]:
df = pd.read_csv(file_url)

In [0]:
df = pd.read_csv(file_url)

In [0]:
# Extract the 'class' response variable using the .pop() method and save it into a variable called y:

In [0]:
y = df.pop('class')

In [0]:
#Create a list called cat_columns containing only the columns of type 'object' using the dtype attribute and print its content

In [6]:
cat_columns = [col for col in df.columns if df[col].dtype == 'object']
cat_columns

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'sex',
 'native-country']

In [0]:
# Split the df and y DataFrames into training and test sets using the train_test_split function with the parameters test_size=0.33 and random_state=8:

In [0]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.33, random_state=8)

In [0]:
column_categories = {}

In [0]:
# Iterate through cat_columns and populate the dictionary with the column name and the list of categories using the .astype() method and the .cat.categories attribute:

In [0]:
for col in cat_columns:
  column_categories[col] = X_train[col].astype('category').cat.categories

In [0]:
#Save column_categories and cat_columns into files called categories_data.pkl and categorical_columns.pkl respectively using the pickle.dump() method:

In [0]:
pickle.dump(column_categories, open("categories_data.pkl", "wb"))
pickle.dump(cat_columns, open("categorical_columns.pkl", "wb"))

In [0]:
#Create a function called apply_categories that takes a DataFrame and a dictionary as inputs and will import CategoricalDtype from pandas.api.types, iterate through this dictionary, 
# and convert each column (keys) with the list of categories (values) using the .astype() method and CategoricalDtype:

In [0]:
def apply_categories(input_df, cat_dict):
  from pandas.api.types import CategoricalDtype
  for col, cat in cat_dict.items():
    input_df[col] = input_df[col].astype(CategoricalDtype(categories=cat))
  return input_df

In [0]:
# Apply this function on X_train and column_categories and save the result in a new DataFrame called X_train_cat. Print the data type of its columns using the .dtypes attribute:

In [12]:
X_train_cat = apply_categories(X_train, column_categories)
X_train_cat.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


age                  int64
workclass         category
fnlwgt               int64
education         category
education-num        int64
marital-status    category
occupation        category
relationship      category
sex               category
capital-gain         int64
capital-loss         int64
hours-per-week       int64
native-country    category
dtype: object

In [0]:
#Perform one-hot encoding on the categorical columns using the .get_dummies() method and save the result into a new variable called X_train_final:

In [0]:
X_train_final = pd.get_dummies(X_train_cat, columns=cat_columns)

In [0]:
# Instantiate a RandomForestClassifier with random_state=8 and train it with the training sets using the .fit() method. 
# Save the model into a file called model.pkl using the joblib.dump() method:

In [14]:
rf_model = RandomForestClassifier(random_state=8)
rf_model.fit(X_train_final, y_train)
joblib.dump(rf_model, "model.pkl")

['model.pkl']

In [0]:
# Import the socket, threading, requests, json, and numpy packages, the Flask class, and the jsonify and request functions from the flask package:

In [0]:
import socket
import threading
import requests
import json
from flask import Flask, jsonify, request
import numpy as np

In [0]:
# Create a new Flask app and save it into a variable called app:

In [0]:
app = Flask(__name__)

In [0]:
# Load the pre-trained model from the model.pkl file using joblib.load() and save it into a variable called trained_model. 
# Load the saved dictionary from categories_data.pkl using pickle.load() and save it into a variable called var_means:

In [0]:
trained_model = joblib.load("model.pkl")
var_means = pickle.load(open("categories_data.pkl", "rb"))
cat_cols = pickle.load(open("categorical_columns.pkl", "rb"))

In [0]:
# Create an API endpoint for the api path that accepts only POST requests and will call a function called predict().
# This function will read the JSON received using the request.get_json() method, transform it into a DataFrame, 
# apply the apply_categories() function on it with var_means, perform one-hot encoding with .get_dummies(), 
# predict the outcome with trained_model, convert the prediction from a numpy array to a string with array2string(), and then convert to JSON with jsonify():

In [0]:
@app.route('/api', methods=['POST']) 
def predict(): 
  data = request.get_json() 
  df_test = pd.DataFrame(data, index=[0]) 
  df_test_clean = apply_categories(df_test, var_means) 
  df_test_final = pd.get_dummies(df_test_clean, columns=cat_cols) 
  prediction = trained_model.predict(df_test_final) 
  str_pred = np.array2string(prediction) 
  return jsonify(str_pred) 

In [0]:
# Create a new thread for running your Flask app using the threading.
# Thread method with the following parameters: target=app.run, kwargs={'host':'0.0.0.0','port':80}:

In [19]:
flask_thread = threading.Thread(target=app.run, kwargs={'host':'0.0.0.0','port':80})
flask_thread.start()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://0.0.0.0:80/ (Press CTRL+C to quit)


In [0]:
# Select the first record of X_test and convert it into JSON format using the .to_json() method:

In [20]:
record = X_test.iloc[0,].to_json()
record

'{"age":51,"workclass":" Private","fnlwgt":106151,"education":" 11th","education-num":7,"marital-status":" Divorced","occupation":" Transport-moving","relationship":" Own-child","sex":" Male","capital-gain":0,"capital-loss":0,"hours-per-week":40,"native-country":" United-States"}'

In [0]:
# Create a dictionary called headers with the following key-value pairs: 'content-type': 'application/json', 'Accept-Charset': 'UTF-8'. 
# Extract into a new variable called ip_address the IP address of the host using the socket.gethostname() and socket.gethostbyname() methods:

In [0]:
headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'}
ip_address = socket.gethostbyname(socket.gethostname())

In [0]:
# Send an HTTP POST request to the server using the requests.post() method with the HTTP URL to the endpoint, using record and headers as its parameters, and print its .text attribute:

In [22]:
r = requests.post(f"http://{ip_address}/api", data=record, headers=headers)
r.text

172.28.0.2 - - [11/May/2020 21:38:55] "[37mPOST /api HTTP/1.1[0m" 200 -


'"[\' <=50K\']"\n'

In [0]:
# From the output, we observe that the POST request was successful: the server returned the code 200. 
# We received the prediction from the model for the record we sent, and it has predicted the person has an income below the 50k mark.