In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_union
from scipy.sparse import hstack
from sklearn import metrics
from gensim.models.keyedvectors import KeyedVectors
import warnings
import joblib # for deserialization saved models 


In [2]:
from mlserving import ServingApp
from mlserving.predictors import RESTPredictor

import joblib # for deserialization saved models 


class MyPredictor(RESTPredictor):
    def __init__(self):
        # Loading a saved model
        self.gloveModel = joblib.load('gloveModel.pkl')
        self.vectorizer = joblib.load('vectorizer.pkl')
        self.feature_names = self.vectorizer.get_feature_names()
        self.lr_insult = joblib.load('lr_insult.pkl')
        self.lr_obscene = joblib.load('lr_obscene.pkl')
        self.lr_threat = joblib.load('lr_threat.pkl')
        self.lr_toxic = joblib.load('lr_toxic.pkl')
        self.lr_identity_hate = joblib.load('lr_identity_hate.pkl')

    def get_word_weight(self, text):
        """Returns a dictionary where keys are the words of the text and values are their weights."""
        tfidf_matrix = self.vectorizer.transform([text]).todense()
        feature_index = tfidf_matrix[0,:].nonzero()[1]
        tfidf_scores = zip([self.feature_names[i] for i in feature_index], [tfidf_matrix[0, x] for x in feature_index])
        return dict(tfidf_scores)
        
    def num_upper(self, text):
        """Returns the number of capital letters in a string."""
        num = 0
        for i in text:
            if i.isupper():
                num += 1
        return num
    
    def weighted_vector_mean(self, text):
        """Gets the weighted vector mean of a sentence by averaging the word vectors according to Tfidf weights."""
        sentence_vects = []
        sentence_weights = []
        words = text.split(" ")
        words = [word for word in words if word in self.gloveModel.wv.vocab]

        text_dict = self.get_word_weight(text)
        total = sum(text_dict.values())
        text_dict = {key:(val/total) for key,val in text_dict.items()}

        for word in words:
            sentence_vects.append(self.gloveModel[word])               # get word vectors
            if word.lower() in text_dict.keys():
                sentence_weights.append(text_dict[word.lower()])   # get weights of words
            else:
                sentence_weights.append(0)

        if len(sentence_vects) > 0:
            return np.transpose(sentence_vects) @ sentence_weights / len(sentence_vects)
        else:
            return np.zeros(300)

    def create_df(self, text):
        txt = text
        d = {'text': [txt]}
        df = pd.DataFrame(data=d)
        return df
    
    def generate_features(self, df):
        dfc = df.copy()
        # Cleaning text
        dfc['text'] = dfc['text'].str.replace(r"[(\.),(\|)!:='&(\*)(\")]", "")
        dfc['text'] = dfc['text'].str.replace("\n", "")

        # Getting length
        dfc['len'] = dfc['text'].apply(len) - dfc['text'].str.count(" ")
        len_min = 0
        len_max = 127
        dfc['len'] = (dfc['len'].values - len_min) / (len_max - len_min)

        # Getting proportion of caps
        dfc['caps'] = dfc['text'].apply(self.num_upper)
        dfc['proportion of caps'] = dfc['caps'] / dfc['len']

        # Accounting for division by 0
        dfc['proportion of caps'] = dfc['proportion of caps'].fillna(0)

        # Adding the 300D vector means, weighted by Tfidf weights
        dfc['vector mean'] = dfc['text'].apply(self.weighted_vector_mean)
        tmp = pd.DataFrame(dfc['vector mean'].tolist())
        dfc = dfc.join(tmp)
        dfc = dfc.drop(['vector mean', 'text', 'caps'], axis=1)
        return dfc
    
    def pre_process(self, input_data, req):
        text = input_data['features']
        df_text = self.create_df(text)
        dfc_text = self.generate_features(df_text)
        t_text = df_text['text']
        t_vector = self.vectorizer.transform(t_text)
        final_testing = hstack([t_vector, dfc_text[['len', 'proportion of caps']]])
        return final_testing
#         return input_data['features']

    def predict(self, processed_data, req):
        print(type(processed_data))
        print(processed_data.shape)
        # print(self.lr_obscene.predict(final_testing))
        # print(self.lr_toxic.predict(final_testing))
        # print(self.lr_identity_hate.predict(final_testing))
        # print(self.lr_threat.predict(final_testing))
        # print(self.lr_insult.predict(final_testing))
        return self.model.polarity_scores(processed_data)

    def post_process(self, prediction, req):
        return {'results': prediction}



In [None]:
app = ServingApp()
app.add_inference_handler('/api/v1/predict', MyPredictor())
app.run()



[2020-11-30 20:19:40,904] - INFO - Running development server on: http://0.0.0.0:5000/


127.0.0.1 - - [30/Nov/2020 21:11:35] "OPTIONS /api/v1/predict HTTP/1.1" 200 0
127.0.0.1 - - [30/Nov/2020 21:11:35] "POST /api/v1/predict HTTP/1.1" 500 515


<class 'scipy.sparse.coo.coo_matrix'>
(1, 357956)


127.0.0.1 - - [30/Nov/2020 21:11:36] "POST /api/v1/predict HTTP/1.1" 500 515


<class 'scipy.sparse.coo.coo_matrix'>
(1, 357956)
