In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_union
from scipy.sparse import hstack
from sklearn import metrics
from gensim.models.keyedvectors import KeyedVectors
import warnings
import joblib  # for deserializing saved models


In [11]:
from mlserving import ServingApp
from mlserving.predictors import RESTPredictor

import joblib  # for deserializing saved models


class MyPredictor(RESTPredictor):
    """REST predictor that flags a piece of text for five abuse categories.

    The request payload is expected to carry the raw text under the
    ``'features'`` key.  The text is turned into a sparse feature row
    (``vectorizer2`` TF-IDF features + scaled length + proportion of capital
    letters) and fed to five independent classifiers, one per category.
    """

    # Dimensionality of the zero vector returned when no word has an embedding.
    EMBEDDING_DIM = 300
    # Bounds used to min-max scale the non-space character count.
    # NOTE(review): presumably the min/max observed on the training set -- confirm.
    LEN_MIN = 0
    LEN_MAX = 127

    def __init__(self):
        """Load the serialized embedding model, vectorizers and classifiers from disk."""
        # NOTE(security): joblib.load unpickles its input and can therefore run
        # arbitrary code.  Only load artifacts that come from a trusted source.
        self.gloveModel = joblib.load('gloveModel.pkl')
        self.vectorizer1 = joblib.load('vectorizer.pkl')
        self.vectorizer2 = joblib.load('vectorizer2.pkl')
        # scikit-learn 1.2 removed get_feature_names(); prefer the replacement
        # and fall back to the old name for artifacts used with older releases.
        if hasattr(self.vectorizer1, 'get_feature_names_out'):
            self.feature_names = list(self.vectorizer1.get_feature_names_out())
        else:
            self.feature_names = self.vectorizer1.get_feature_names()
        self.lr_insult = joblib.load('lr_insult.pkl')
        self.lr_obscene = joblib.load('lr_obscene.pkl')
        self.lr_threat = joblib.load('lr_threat.pkl')
        self.lr_toxic = joblib.load('lr_toxic.pkl')
        self.lr_identity_hate = joblib.load('lr_identity_hate.pkl')

    def get_word_weight(self, text):
        """Returns a dictionary where keys are the words of the text and values are their weights.

        Only vocabulary terms of ``vectorizer1`` with a non-zero TF-IDF score
        for ``text`` appear in the result.
        """
        # Read the non-zero entries straight from the sparse row instead of
        # densifying a vector as wide as the whole vocabulary.
        row = self.vectorizer1.transform([text]).tocoo()
        return {
            self.feature_names[col]: score
            for col, score in zip(row.col, row.data)
            if score != 0
        }

    def num_upper(self, text):
        """Returns the number of capital letters in a string."""
        return sum(1 for char in text if char.isupper())

    def weighted_vector_mean(self, text):
        """Gets the weighted vector mean of a sentence by averaging the word vectors according to Tfidf weights.

        Words are obtained by splitting ``text`` on single spaces; words without
        an embedding are ignored.  Each remaining word vector is weighted by the
        word's share of the sentence's total TF-IDF score (0 when the lowercased
        word has no TF-IDF entry), and the weighted sum is divided by the number
        of embedded words.  Returns a zero vector of ``EMBEDDING_DIM`` entries
        when no word has an embedding.
        """
        # NOTE(review): `.wv.vocab` is the gensim 3.x API -- confirm the gensim
        # version used to load gloveModel.pkl.
        vocab = self.gloveModel.wv.vocab
        words = [word for word in text.split(" ") if word in vocab]
        if not words:
            return np.zeros(self.EMBEDDING_DIM)

        weights = self.get_word_weight(text)
        total = sum(weights.values())
        # An empty `weights` dict never reaches the division, so total == 0 is safe.
        normalized = {token: weight / total for token, weight in weights.items()}

        vectors = [self.gloveModel[word] for word in words]
        # TF-IDF keys are looked up lowercased, while the embedding lookup above
        # uses the word exactly as written.
        word_weights = [normalized.get(word.lower(), 0) for word in words]
        return np.transpose(vectors) @ word_weights / len(vectors)

    def create_df(self, text):
        """Wrap a single text in a one-row DataFrame with a ``'text'`` column."""
        return pd.DataFrame(data={'text': [text]})

    def generate_features(self, df):
        """Derive the hand-crafted features for every row of ``df``.

        ``df`` must have a ``'text'`` column and is not modified.  The returned
        frame has the columns ``'len'`` (min-max scaled count of non-space
        characters), ``'proportion of caps'`` and one column per dimension of
        the TF-IDF weighted embedding mean.
        """
        dfc = df.copy()
        # Cleaning text.  regex=True must be explicit: since pandas 2.0 the
        # default is a literal match, which would leave the punctuation in place.
        dfc['text'] = dfc['text'].str.replace(r"[(\.),(\|)!:='&(\*)(\")]", "", regex=True)
        dfc['text'] = dfc['text'].str.replace("\n", "", regex=False)

        # Getting length (non-space characters), scaled with the fixed bounds
        dfc['len'] = dfc['text'].str.len() - dfc['text'].str.count(" ")
        dfc['len'] = (dfc['len'].values - self.LEN_MIN) / (self.LEN_MAX - self.LEN_MIN)

        # Getting proportion of caps
        # NOTE(review): the denominator is the *scaled* length, not the raw
        # character count -- presumably this mirrors how the classifiers were
        # trained; confirm before changing.
        dfc['caps'] = dfc['text'].apply(self.num_upper)
        dfc['proportion of caps'] = dfc['caps'] / dfc['len']

        # Accounting for division by 0 (empty text gives 0 / 0 -> NaN)
        dfc['proportion of caps'] = dfc['proportion of caps'].fillna(0)

        # Adding the 300D vector means, weighted by Tfidf weights.  The helper
        # frame reuses dfc's index so the join stays row-aligned even when the
        # caller passes a frame without a default RangeIndex.
        dfc['vector mean'] = dfc['text'].apply(self.weighted_vector_mean)
        embedding_cols = pd.DataFrame(dfc['vector mean'].tolist(), index=dfc.index)
        dfc = dfc.join(embedding_cols)
        dfc = dfc.drop(['vector mean', 'text', 'caps'], axis=1)
        return dfc

    def pre_process(self, input_data, req):
        """Turn the request payload into the sparse feature row the classifiers consume.

        Reads the raw text from ``input_data['features']`` and returns the
        ``vectorizer2`` TF-IDF features of the raw (uncleaned) text stacked
        with the ``'len'`` and ``'proportion of caps'`` columns.  ``req`` is
        unused.
        """
        text = input_data['features']
        # Echoes every submitted text to stdout.
        print(text)
        df_text = self.create_df(text)
        dfc_text = self.generate_features(df_text)
        t_vector = self.vectorizer2.transform(df_text['text'])
        # Hand hstack a plain ndarray rather than a DataFrame.
        extra_features = dfc_text[['len', 'proportion of caps']].values
        return hstack([t_vector, extra_features])

    def predict(self, processed_data, req):
        """Run the five classifiers on ``processed_data``.

        Returns a dict with one boolean per category (``True`` when that
        classifier predicts class 1) and ``"Count"``, the number of categories
        flagged.  ``req`` is unused.
        """
        labelled_models = (
            ("Obscenity", self.lr_obscene),
            ("Toxicity", self.lr_toxic),
            ("Identity Hate", self.lr_identity_hate),
            ("Threat", self.lr_threat),
            ("Insult", self.lr_insult),
        )
        # bool(...) keeps plain Python booleans in the response payload.
        results = {
            label: bool(model.predict(processed_data)[0] == 1)
            for label, model in labelled_models
        }
        results["Count"] = sum(results.values())
        return results

    def post_process(self, prediction, req):
        """Return the prediction dict unchanged."""
        return prediction



In [12]:
# Create the serving application, register the predictor on its REST route,
# then start the server (the recorded run kept serving until interrupted).
app = ServingApp()
toxicity_predictor = MyPredictor()
app.add_inference_handler('/api/v1/predict', toxicity_predictor)
app.run()



[2020-12-02 22:03:15,681] - INFO - Running development server on: http://0.0.0.0:5000/
[2020-12-02 22:03:15,681] - INFO - Running development server on: http://0.0.0.0:5000/
[2020-12-02 22:03:15,681] - INFO - Running development server on: http://0.0.0.0:5000/
[2020-12-02 22:03:15,681] - INFO - Running development server on: http://0.0.0.0:5000/
[2020-12-02 22:03:15,681] - INFO - Running development server on: http://0.0.0.0:5000/
[2020-12-02 22:03:15,681] - INFO - Running development server on: http://0.0.0.0:5000/


127.0.0.1 - - [02/Dec/2020 22:03:30] "OPTIONS /api/v1/predict HTTP/1.1" 200 0


Heyy How are you?


127.0.0.1 - - [02/Dec/2020 22:03:37] "POST /api/v1/predict HTTP/1.1" 200 109
127.0.0.1 - - [02/Dec/2020 22:03:58] "OPTIONS /api/v1/predict HTTP/1.1" 200 0


You're a gay retard, go kill yourself


127.0.0.1 - - [02/Dec/2020 22:04:04] "POST /api/v1/predict HTTP/1.1" 200 107
127.0.0.1 - - [02/Dec/2020 22:04:41] "OPTIONS /api/v1/predict HTTP/1.1" 200 0


You're a gay retard, go kill yourself. Fuck you you ugly piece of shit.


127.0.0.1 - - [02/Dec/2020 22:04:46] "POST /api/v1/predict HTTP/1.1" 200 106
127.0.0.1 - - [02/Dec/2020 22:05:10] "OPTIONS /api/v1/predict HTTP/1.1" 200 0


You're a gay retard, go kill yourself. Fuck you you ugly piece of shit.


127.0.0.1 - - [02/Dec/2020 22:05:16] "POST /api/v1/predict HTTP/1.1" 200 106
127.0.0.1 - - [02/Dec/2020 22:07:29] "OPTIONS /api/v1/predict HTTP/1.1" 200 0


Gay ugly piece of shit kill yourself


127.0.0.1 - - [02/Dec/2020 22:07:35] "POST /api/v1/predict HTTP/1.1" 200 106
127.0.0.1 - - [02/Dec/2020 22:08:09] "OPTIONS /api/v1/predict HTTP/1.1" 200 0


Gay ugly


127.0.0.1 - - [02/Dec/2020 22:08:15] "POST /api/v1/predict HTTP/1.1" 200 107
127.0.0.1 - - [02/Dec/2020 22:08:59] "OPTIONS /api/v1/predict HTTP/1.1" 200 0


gay ugly


127.0.0.1 - - [02/Dec/2020 22:09:05] "POST /api/v1/predict HTTP/1.1" 200 107
127.0.0.1 - - [02/Dec/2020 22:10:32] "OPTIONS /api/v1/predict HTTP/1.1" 200 0


ugly gay


127.0.0.1 - - [02/Dec/2020 22:10:38] "POST /api/v1/predict HTTP/1.1" 200 107
127.0.0.1 - - [02/Dec/2020 22:11:22] "OPTIONS /api/v1/predict HTTP/1.1" 200 0


I will murder you you piece of shit bastard


127.0.0.1 - - [02/Dec/2020 22:11:28] "POST /api/v1/predict HTTP/1.1" 200 106
127.0.0.1 - - [02/Dec/2020 22:11:37] "OPTIONS /api/v1/predict HTTP/1.1" 200 0


ugly gay


127.0.0.1 - - [02/Dec/2020 22:11:43] "POST /api/v1/predict HTTP/1.1" 200 107
127.0.0.1 - - [02/Dec/2020 22:13:52] "OPTIONS /api/v1/predict HTTP/1.1" 200 0


ugly gay


127.0.0.1 - - [02/Dec/2020 22:13:58] "POST /api/v1/predict HTTP/1.1" 200 107
127.0.0.1 - - [02/Dec/2020 22:15:56] "OPTIONS /api/v1/predict HTTP/1.1" 200 0


ugly gay


127.0.0.1 - - [02/Dec/2020 22:16:02] "POST /api/v1/predict HTTP/1.1" 200 107
127.0.0.1 - - [02/Dec/2020 22:19:41] "OPTIONS /api/v1/predict HTTP/1.1" 200 0


ugly gay piece of shit go kill yourself


127.0.0.1 - - [02/Dec/2020 22:19:47] "POST /api/v1/predict HTTP/1.1" 200 106


KeyboardInterrupt: 