In [63]:
# This is inference step to execute the hosted model with custom input 

In [60]:
def input_fn(input_data, content_type):
    """Parse input data payload

    We currently only take csv input. Since we need to process both labelled
    and unlabelled data we first determine whether the label column is present
    by looking at how many columns were provided.
    """
    if content_type == 'text':
        # Read the raw input data as CSV.
        df = pd.read_csv(StringIO(input_data), 
                         header=None)

        if len(df.columns) == len(feature_columns_names) + 1:
            # This is a labelled example, includes the ring label
            df.columns = feature_columns_names + [label_column]
        elif len(df.columns) == len(feature_columns_names):
            # This is an unlabelled example.
            df.columns = feature_columns_names

        return df
    else:
        raise ValueError("{} not supported by script!".format(content_type))

In [3]:
import pandas as pd

In [61]:
# Provide the text input for which infernece needs to be performed
text = "credit card was stolen"

In [77]:
text = "my loan application is not approved"

In [78]:
# Pre process the input text 
!pip install spacy
!python -m spacy download en_core_web_sm

In [79]:
# Function to trim the text 
import re, nltk, spacy, string
pd.options.mode.chained_assignment = None  
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [80]:
#Function to Lemmatize the text
import en_core_web_sm
nlp = en_core_web_sm.load()
def lemmatizer(text):        
    sent = []
    doc = nlp(text)
    for word in doc:
        sent.append(word.lemma_)
    return " ".join(sent)

In [81]:
text_clean = clean_text(text)

In [82]:
text_clean = clean_text(text_clean)

In [83]:
#Chunking in NLP is a process to take small pieces of information and group them into large units. The primary use of Chunking is making groups of "noun phrases.
#Here we are using only noun, singular as we have already lemmatized the texts.
import pandas as pd
!pip install TextBlob
from textblob import TextBlob
nltk.download('punkt')

def pos_tag(text):
    try:
        return TextBlob(text).tags
    except:
        return None

def get_adjectives(text):
    blob = TextBlob(text)
    return ' '.join([ word for (word,tag) in blob.tags if tag == "NN"])



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [84]:
!python -m textblob.download_corpora

In [85]:
text_clean = get_adjectives(text_clean)
text_clean=text_clean.replace('-PRON-', '')
text_clean=text_clean.replace('xxxx', '')



In [86]:
import io
data = io.StringIO(text_clean)
df = pd.read_csv(data)


In [87]:

# Load the count vector used during training 
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#GET VECTOR COUNT

transformer = TfidfTransformer()
loaded_vec = CountVectorizer(decode_error="replace",vocabulary=pickle.load(open("count_vector.pkl", "rb")))

tfidf = transformer.fit_transform(loaded_vec.fit_transform(df))



In [88]:
tfidf=pd.DataFrame(tfidf.todense())


In [89]:
payload=tfidf.to_csv(header=False, index=False)

In [90]:
payload = tfidf.iloc[:1].to_csv(header=False, index=False)

In [91]:

# Provide the end point where the model is hosted

endpoint_name="TicketClassificationNLP-staging"

from sagemaker.predictor import Predictor
predictor = Predictor(endpoint_name=endpoint_name1)

p = predictor.predict(payload, initial_args={"ContentType": "text/csv"})
print(p.decode("utf-8"))



4.0


In [92]:
Topic_names = {0:"Bank Account services", 1:"Credit card or prepaid card", 2:"Others", 3:"Theft/Dispute Reporting", 4:"Mortgage/Loan"}

In [93]:
# Convert class out put to mapped names
Topic_names[int(float(p.decode("utf-8")))]

'Mortgage/Loan'

In [53]:
endpoint_name1 = 'NLPComplaintClassification-staging'
