In [89]:
import pickle as pkl

from tqdm import tqdm
tqdm.pandas()

import yaml
import pandas as pd
import numpy as np
from numpy import triu
from nltk.tokenize import word_tokenize, TweetTokenizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [3]:
import nltk 
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [4]:
# Read in the processed data and the processed inbound dataset 
processed_df = pd.read_pickle("../data/processed/processed_v2.pkl")
processed_inbound_extra = processed_df["clean_inbound_text"]
processed_inbound_extra.head()

4     different people have given different answers ...
5     way to drop the ball on customer service so pi...
6     i want my amazon payments account closed dm me...
9     yeah this is crazy were less than a week away ...
10    how about you guys figure out my xbox one x pr...
Name: clean_inbound_text, dtype: object

In [5]:
# Reset the index. Set it properly in this iteration 
processed_df = processed_df.reset_index(drop=True)
processed_inbound_extra = processed_inbound_extra.reset_index(drop=True)

In [6]:
# Check for some null values 
processed_df.isnull().sum()

inbound_text               0
author_id                  0
created_at                 0
outbound_text              0
response_tweet_id      60327
inbound_lang               0
inbound_hashtags           0
outbound_hashtags          0
clean_inbound_text         0
clean_outbound_text        0
outbound_tokens_pos        0
inbound_tokens_pos         0
dtype: int64

In [7]:
# Read in the intents back 
with open(r"../objects/intents_amazon_support.yml") as file:
    intents = yaml.load(file, Loader=yaml.FullLoader)

# Previewing
print(f'\nintents:\n{intents}')
print(f'\nprocessed:\n{processed_df.head()}')


intents:
{'track': ['tracking', 'order', 'shipment', 'late', 'status', 'carrier', 'update', 'number', 'info', 'received', 'details'], 'support': ['service'], 'quality': ['quality', 'product', 'damaged', 'received', 'refund', 'return', 'issue', 'order', 'packaging', 'proper', 'working', 'expected', 'different'], 'discount': ['prime', 'product', 'offer', 'price', 'sale'], 'account': ['email', 'orders', 'details', 'bank', 'access']}

processed:
                                        inbound_text   author_id  \
0  @AmazonHelp 3 different people have given 3 di...  AmazonHelp   
1  Way to drop the ball on customer service @1158...  AmazonHelp   
2  @115823 I want my amazon payments account CLOS...  AmazonHelp   
3  @AmazonHelp @115826 Yeah this is crazy we’re l...  AmazonHelp   
4  @115828 How about you guys figure out my Xbox ...  AmazonHelp   

                  created_at  \
0  2017-10-31 23:28:00+00:00   
1  2017-10-31 22:29:00+00:00   
2  2017-10-31 22:28:34+00:00   
3  2017-11-01 12

## Tweet Collection with Doc2Vec 
I can use my Doc2Vec representation to find top 1000 Tweets most similar to a generalized intent version of a Tweet based on it's cosine similarity. 

Heuristic search refers to a search strategy that attempts to optimize a problem by iteratively improving the solution based on a given heuristic function or a cost measure. My cost measure is trying to get the closest cosine distances.

So I basically trained my doc2vec model with my training data, which is the `processed_inbound`. I can actually compute a vector based on my training data to vectorize that word.

## Training my Doc2Vec 

### Data Synthesis 
Basically, there are 2 ways I can get my current training data (1000 for each)
* **Doc2Vec:** Some intent examples I will synthetically generate from an idealized example using doc2vec
* **Manual:** Some intent examples I will synthetically generate by duplicating and manual (like greeting, because the current data does not represent this)
* **Hybrid:** Some intents I will do a hybrid approach, where 50 percent might be my generated data, and 50 percent might be 

In [82]:
#TODO: Add information about Doc2Vec or post a link to it over here 

In [84]:
# Creating a synthetic dataset - generating N Tweets resembling a mock tweet
# This will subsequently be merged with the existing inbound data for inclusion in the doc2vec training process

# Version 1 - will be improved in future iterations
ideal = {
            "order track": "@AmazonHelo Hi, could you provide an update on the order? Its been days since the product has moved from its last location ", # change intent to "order tracking"???
            "product inquiry": "@AmazonHelp Looking for more info on the product. Can you share details or direct me to a reliable source?", # product inquire??? 
            "return refund": "@AmazonHelp How can I start a return process? The item I received doesn't match the description.",
            "account management": "@AmazonHelp Hi, I am having trouble logging into my account. Can you help me reset my password?", 
            "promotion discount": "@AmazonHelp Are there any ongoing promotions or deals in the ongoing festive season? Looking to buy a few items.",
            "shipping": "@AmazonHelp Hi, My address has changed. Can you help me update the shipping address for my order?",
            "technical support": "@AmazonHelp Encountering errors during checkout. Can you help me troubleshoot the issue?",
            "payment issue": "@AmazonHelp My payment method isn't going through. Any suggestions on how to resolve this?",
            "general query": "@AmazonHelp Hi, I have a general question regarding the product. Can you help me with this?"
        }

# Version 2 - Here I will try writing some more intent items 
ideal_tag_dict = {
                    "track": "tracking order shipoment late status carrier update number info received details", 
                    "support": "support chat customer resolution feedback satisfaction",
                    "quality": "quality product damaged received refund return issue order packaging proper working expected different", 
                    "discount": "prime product offer price sale", 
                    "account": "email orders details bank access"
                }

In [86]:
processed_df

Unnamed: 0,inbound_text,author_id,created_at,outbound_text,response_tweet_id,inbound_lang,inbound_hashtags,outbound_hashtags,clean_inbound_text,clean_outbound_text,outbound_tokens_pos,inbound_tokens_pos
0,@AmazonHelp 3 different people have given 3 di...,AmazonHelp,2017-10-31 23:28:00+00:00,@115820 We'd like to take a further look into ...,619,en,[],[],different people have given different answers ...,wed like to take a further look into this with...,"[-PRON-: NOUN, d: VERB, like: VERB, to: NOUN, ...","[different: NOUN, people: NOUN, have: NOUN, gi..."
1,Way to drop the ball on customer service @1158...,AmazonHelp,2017-10-31 22:29:00+00:00,@115820 I'm sorry we've let you down! Without ...,616,en,[],[],way to drop the ball on customer service so pi...,i am sorry we have let you down without provid...,"[i: NOUN, be: NOUN, sorry: NOUN, -PRON-: NOUN,...","[way: NOUN, to: NOUN, drop: VERB, the: NOUN, b..."
2,@115823 I want my amazon payments account CLOS...,AmazonHelp,2017-10-31 22:28:34+00:00,@115822 I am unable to affect your account via...,,en,[],[],i want my amazon payments account closed dm me...,i am unable to affect your account via twitter...,"[i: NOUN, be: NOUN, unable: NOUN, to: NOUN, af...","[i: NOUN, want: VERB, -PRON-: NOUN, amazon: NO..."
3,@AmazonHelp @115826 Yeah this is crazy we’re l...,AmazonHelp,2017-11-01 12:53:34+00:00,@115827 Thanks for your patience. ^KM,,en,[],[],yeah this is crazy were less than a week away ...,thanks for your patience km,"[thank: NOUN, for: NOUN, -PRON-: NOUN, patienc...","[yeah: NOUN, this: NOUN, be: NOUN, crazy: NOUN..."
4,@115828 How about you guys figure out my Xbox ...,AmazonHelp,2017-10-31 22:28:00+00:00,@115826 I'm sorry for the wait. You'll receive...,627,en,[],[],how about you guys figure out my xbox one x pr...,i am sorry for the wait you will receive an em...,"[i: NOUN, be: NOUN, sorry: NOUN, for: NOUN, th...","[how: NOUN, about: NOUN, -PRON-: NOUN, guy: NO..."
...,...,...,...,...,...,...,...,...,...,...,...,...
122335,@AmazonHelp I sent you guys a DM regarding the...,AmazonHelp,2017-11-22 00:17:00+00:00,@328597 We're unable to access customer accoun...,,en,[],[],i sent you guys a dm regarding the status of m...,were unable to access customer accounts via so...,"[be: NOUN, unable: NOUN, to: NOUN, access: VER...","[i: NOUN, send: VERB, -PRON-: NOUN, guy: NOUN,..."
122336,This is happening in my area w/@115821 “Prime”...,AmazonHelp,2017-11-22 02:16:55+00:00,"@777901 I'm sorry for the delay, Brenda! We st...",2987557,en,[],[],this is happening in my area w prime deliverie...,i am sorry for the delay brenda we strive to s...,"[i: NOUN, be: NOUN, sorry: NOUN, for: NOUN, th...","[this: NOUN, be: NOUN, happen: VERB, in: NOUN,..."
122337,@132994 @132995 @115850 got my #OnePlus5T at 8...,AmazonHelp,2017-11-22 03:49:29+00:00,@823783 Woohoo! That's awesome! Hope you love ...,2987674,en,"[#AmazonPrime, #OnePlus5T]",[],got my at am thanks for fulfilling the order fast,woohoo that is awesome hope you love the phone js,"[woohoo: NOUN, that: NOUN, be: NOUN, awesome: ...","[get: VERB, -PRON-: NOUN, at: NOUN, be: NOUN, ..."
122338,@115850 @132994 No exchange available for #One...,AmazonHelp,2017-11-22 05:22:31+00:00,@823802 The Exchange Offer is currently availa...,,en,[#OnePlus5T],[],no exchange available for i need to exchange m...,the exchange offer is currently available only...,"[the: NOUN, exchange: NOUN, offer: NOUN, be: N...","[no: NOUN, exchange: NOUN, available: NOUN, fo..."


In [100]:
def add_extra(clean_inbound_text, extra_tweets):
    '''Adding extra tweets to the current tokenized data'''
    
    # Convert the extra tweets into a pandas Series
    extra_tweets = pd.Series(extra_tweets)

    ## The following 2 steps aren't required for now as the data is already in string format 
    ## Convert the extra tweets into a single string
    # print("Converting to string...")
    # string_processed_data = current_tokenized_data.progress_apply(" ".join)

    # Concatenate the extra tweets to the current data
    clean_inbound_text = pd.concat([clean_inbound_text, extra_tweets], axis = 0, ignore_index = True)

    # Tokenize the combined data
    tknzr = TweetTokenizer(strip_handles = True, reduce_len = True)
    print("Tokenizing...")
    tokenized_data = clean_inbound_text.progress_apply(tknzr.tokenize)
    string_processed_data = tokenized_data.progress_apply(" ".join)

    return string_processed_data

# Add the extra tweets to the current data
processed_inbound_extra = add_extra(processed_df["clean_inbound_text"], list(ideal_tag_dict.values()))

# Save the updated data to a pickle file
processed_inbound_extra.to_pickle("../objects/processed_inbound_extra.pkl")

processed_inbound_extra

Tokenizing...


100%|██████████| 122345/122345 [00:05<00:00, 22029.41it/s]
100%|██████████| 122345/122345 [00:00<00:00, 2024005.25it/s]


0         different people have given different answers ...
1         way to drop the ball on customer service so pi...
2         i want my amazon payments account closed dm me...
3         yeah this is crazy were less than a week away ...
4         how about you guys figure out my xbox one x pr...
                                ...                        
122340    tracking order shipoment late status carrier u...
122341    support chat customer resolution feedback sati...
122342    quality product damaged received refund return...
122343                       prime product offer price sale
122344                     email orders details bank access
Length: 122345, dtype: object

In [101]:
processed_inbound_extra[-7:]

122338    no exchange available for i need to exchange m...
122339    there should be bonus and gifts for regular cu...
122340    tracking order shipoment late status carrier u...
122341    support chat customer resolution feedback sati...
122342    quality product damaged received refund return...
122343                       prime product offer price sale
122344                     email orders details bank access
dtype: object

In [102]:
processed_inbound_extra.shape

(122345,)

In [103]:
# Train a Doc2Vec model on the entire corpus 
def train_doc2vec(string_data, max_epochs, vec_size, alpha):
     
    # Tagging each of the documents with a unique ID
    tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(string_data)]
    
    # Instantiating my model 
    model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm=1) # dm=1 means 'distributed memory' (PV-DM)
    
    # Building the vocabulary table
    model.build_vocab(tagged_data)
    
    for epoch in range(max_epochs): # Run for max_epochs
        print('iteration {0}'.format(epoch))    
        model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs) # This statement trains the model on the current epoch
        # Decreasing the learning rate
        model.alpha -= 0.0002
        # Fixing the learning rate, no decay
        model.min_alpha = model.alpha
        
    # Saving model
    model.save("../models/d2v.model")
    print("Model Saved")        
    
# Training
train_doc2vec(processed_inbound_extra, max_epochs=100, vec_size=20, alpha=0.025)

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

In [8]:
# Loading in the Doc2Vec model 
model = Doc2Vec.load("../models/d2v.model")

# Storing all inbound data into a list for clustering 
inbound_d2v = np.array([model.infer_vector(word_tokenize(_d.lower())) for _d in list(processed_inbound_extra)])

# Saving
with open("../objects/inbound_d2v.pkl", "wb") as f:
    pkl.dump(inbound_d2v, f)



In [9]:
inbound_d2v.shape

(122340, 20)

Before, we did not have a concept of distance in our vectorizers, they don't really have a specific meaning. This is a much better way because it captures the contextual representations between words! Now the clustering should be a lot better than tfidf or bag of words.

## Methodology

Initially, to get the top 1000 similar Tweets, I tried using the existing data. But I don't think that would yield the most accurate results because I am not capturing not the best representative Tweet for an intent. For that reason, I made all these base representative Tweets myself (as seen in the values derived from `ideal` dict above). The goal is to find trying to find an idealized, wholistic representation of an intent. Then from there I use my doc2vec representations to find the top 1000 tweets most similar based on cosine similarity.

### Package Exploration

In [10]:
intents

{'track': ['tracking',
  'order',
  'shipment',
  'late',
  'status',
  'carrier',
  'update',
  'number',
  'info',
  'received',
  'details'],
 'support': ['service'],
 'quality': ['quality',
  'product',
  'damaged',
  'received',
  'refund',
  'return',
  'issue',
  'order',
  'packaging',
  'proper',
  'working',
  'expected',
  'different'],
 'discount': ['prime', 'product', 'offer', 'price', 'sale'],
 'account': ['email', 'orders', 'details', 'bank', 'access']}

In [11]:
## Trying an example 
## Finding and making idealized versions of each tweet so that I can 
intents_eg = {"discount": ["prime", "product", "offer", "price", "sale"]}
inferred_vectors = []

for keywords in intents_eg.values():
    inferred_vectors.append(model.infer_vector(intents_eg))
    
inferred_vectors

[array([ 0.04887684,  0.05137751,  0.02112605, -0.04393798, -0.0508075 ,
        -0.01057845,  0.10617889, -0.04808602,  0.13243914, -0.0396963 ,
        -0.04897077, -0.13161062,  0.00144   ,  0.02005946, -0.0413356 ,
         0.07014607, -0.02099573,  0.09428328, -0.02107974,  0.0300785 ],
       dtype=float32)]

In [205]:
#TODO: My Doc2Vec model vector is inferring different vector values for a given sample. I think I need to treat the model in a better fashion later on  

In [49]:
intents.values()

dict_values([['tracking', 'order', 'shipment', 'late', 'status', 'carrier', 'update', 'number', 'info', 'received', 'details'], ['service'], ['quality', 'product', 'damaged', 'received', 'refund', 'return', 'issue', 'order', 'packaging', 'proper', 'working', 'expected', 'different'], ['prime', 'product', 'offer', 'price', 'sale'], ['email', 'orders', 'details', 'bank', 'access']])

In [12]:
# These are the current intents I wish to add to my training data 
print(intents) 

# Concatenate all the intent values in a sentence form 
ideal_values = list(" ".join(_words) for _words in intents.values())


{'track': ['tracking', 'order', 'shipment', 'late', 'status', 'carrier', 'update', 'number', 'info', 'received', 'details'], 'support': ['service'], 'quality': ['quality', 'product', 'damaged', 'received', 'refund', 'return', 'issue', 'order', 'packaging', 'proper', 'working', 'expected', 'different'], 'discount': ['prime', 'product', 'offer', 'price', 'sale'], 'account': ['email', 'orders', 'details', 'bank', 'access']}


I have chosen my intent value tokens based on the frequency count of words as described in the previous notebooks. If performance is not upto the mark, reiterate and improve upon them in the future version(s)

### Finding intent tags 
I want to get the tags of my representative Tweets because that's what doc2vec's `model.similarity` method takes in as paramater to generate top N Tweets similar to it.

In [206]:
#TODO: Improve this block of code later - The following code block isn't the most efficient one

In [171]:
intents.items()

dict_items([('track', ['tracking', 'order', 'shipment', 'late', 'status', 'carrier', 'update', 'number', 'info', 'received', 'details']), ('support', ['service']), ('quality', ['quality', 'product', 'damaged', 'received', 'refund', 'return', 'issue', 'order', 'packaging', 'proper', 'working', 'expected', 'different']), ('discount', ['prime', 'product', 'offer', 'price', 'sale']), ('account', ['email', 'orders', 'details', 'bank', 'access'])])

In [14]:

tokenizer = TweetTokenizer(strip_handles = True, reduce_len = True)

intents_repr = {k: tokenizer.tokenize(v) for k, v in ideal_tag_dict.items()}
print(intents_repr)

# Save them into objects 
with open("../objects/intents_repr.yml", "w") as f:
    yaml.dump(intents_repr, f, default_flow_style=False)

# Storing tags in order of the dictionary above
tags = []

# Tokenize and process inbound tweets 
tokenized_processed_inbound_extra = processed_inbound_extra.apply(tokenizer.tokenize)

{'track': ['tracking', 'order', 'shipoment', 'late', 'status', 'carrier', 'update', 'number', 'info', 'received', 'details'], 'support': ['support', 'chat', 'customer', 'resolution', 'feedback', 'satisfaction'], 'quality': ['quality', 'product', 'damaged', 'received', 'refund', 'return', 'issue', 'order', 'packaging', 'proper', 'working', 'expected', 'different'], 'discount': ['prime', 'product', 'offer', 'price', 'sale'], 'account': ['email', 'orders', 'details', 'bank', 'access']}


In [81]:
processed_df

Unnamed: 0,inbound_text,author_id,created_at,outbound_text,response_tweet_id,inbound_lang,inbound_hashtags,outbound_hashtags,clean_inbound_text,clean_outbound_text,outbound_tokens_pos,inbound_tokens_pos
0,@AmazonHelp 3 different people have given 3 di...,AmazonHelp,2017-10-31 23:28:00+00:00,@115820 We'd like to take a further look into ...,619,en,[],[],different people have given different answers ...,wed like to take a further look into this with...,"[-PRON-: NOUN, d: VERB, like: VERB, to: NOUN, ...","[different: NOUN, people: NOUN, have: NOUN, gi..."
1,Way to drop the ball on customer service @1158...,AmazonHelp,2017-10-31 22:29:00+00:00,@115820 I'm sorry we've let you down! Without ...,616,en,[],[],way to drop the ball on customer service so pi...,i am sorry we have let you down without provid...,"[i: NOUN, be: NOUN, sorry: NOUN, -PRON-: NOUN,...","[way: NOUN, to: NOUN, drop: VERB, the: NOUN, b..."
2,@115823 I want my amazon payments account CLOS...,AmazonHelp,2017-10-31 22:28:34+00:00,@115822 I am unable to affect your account via...,,en,[],[],i want my amazon payments account closed dm me...,i am unable to affect your account via twitter...,"[i: NOUN, be: NOUN, unable: NOUN, to: NOUN, af...","[i: NOUN, want: VERB, -PRON-: NOUN, amazon: NO..."
3,@AmazonHelp @115826 Yeah this is crazy we’re l...,AmazonHelp,2017-11-01 12:53:34+00:00,@115827 Thanks for your patience. ^KM,,en,[],[],yeah this is crazy were less than a week away ...,thanks for your patience km,"[thank: NOUN, for: NOUN, -PRON-: NOUN, patienc...","[yeah: NOUN, this: NOUN, be: NOUN, crazy: NOUN..."
4,@115828 How about you guys figure out my Xbox ...,AmazonHelp,2017-10-31 22:28:00+00:00,@115826 I'm sorry for the wait. You'll receive...,627,en,[],[],how about you guys figure out my xbox one x pr...,i am sorry for the wait you will receive an em...,"[i: NOUN, be: NOUN, sorry: NOUN, for: NOUN, th...","[how: NOUN, about: NOUN, -PRON-: NOUN, guy: NO..."
...,...,...,...,...,...,...,...,...,...,...,...,...
122335,@AmazonHelp I sent you guys a DM regarding the...,AmazonHelp,2017-11-22 00:17:00+00:00,@328597 We're unable to access customer accoun...,,en,[],[],i sent you guys a dm regarding the status of m...,were unable to access customer accounts via so...,"[be: NOUN, unable: NOUN, to: NOUN, access: VER...","[i: NOUN, send: VERB, -PRON-: NOUN, guy: NOUN,..."
122336,This is happening in my area w/@115821 “Prime”...,AmazonHelp,2017-11-22 02:16:55+00:00,"@777901 I'm sorry for the delay, Brenda! We st...",2987557,en,[],[],this is happening in my area w prime deliverie...,i am sorry for the delay brenda we strive to s...,"[i: NOUN, be: NOUN, sorry: NOUN, for: NOUN, th...","[this: NOUN, be: NOUN, happen: VERB, in: NOUN,..."
122337,@132994 @132995 @115850 got my #OnePlus5T at 8...,AmazonHelp,2017-11-22 03:49:29+00:00,@823783 Woohoo! That's awesome! Hope you love ...,2987674,en,"[#AmazonPrime, #OnePlus5T]",[],got my at am thanks for fulfilling the order fast,woohoo that is awesome hope you love the phone js,"[woohoo: NOUN, that: NOUN, be: NOUN, awesome: ...","[get: VERB, -PRON-: NOUN, at: NOUN, be: NOUN, ..."
122338,@115850 @132994 No exchange available for #One...,AmazonHelp,2017-11-22 05:22:31+00:00,@823802 The Exchange Offer is currently availa...,,en,[#OnePlus5T],[],no exchange available for i need to exchange m...,the exchange offer is currently available only...,"[the: NOUN, exchange: NOUN, offer: NOUN, be: N...","[no: NOUN, exchange: NOUN, available: NOUN, fo..."


In [18]:
processed_inbound_extra = pd.concat([processed_inbound_extra, pd.Series(ideal_tag_dict.values())], axis=0, ignore_index=True)

In [None]:
tokenized_processed_inbound_extra = pd.concat([tokenized_processed_inbound_extra, pd.Series(intents_repr.values())], axis=0, ignore_index=True)

In [21]:
processed_inbound_extra 

0         different people have given different answers ...
1         way to drop the ball on customer service so pi...
2         i want my amazon payments account closed dm me...
3         yeah this is crazy were less than a week away ...
4         how about you guys figure out my xbox one x pr...
                                ...                        
122340    tracking order shipoment late status carrier u...
122341    support chat customer resolution feedback sati...
122342    quality product damaged received refund return...
122343                       prime product offer price sale
122344                     email orders details bank access
Length: 122345, dtype: object

In [22]:
tokenized_processed_inbound_extra

0         [different, people, have, given, different, an...
1         [way, to, drop, the, ball, on, customer, servi...
2         [i, want, my, amazon, payments, account, close...
3         [yeah, this, is, crazy, were, less, than, a, w...
4         [how, about, you, guys, figure, out, my, xbox,...
                                ...                        
122340    [tracking, order, shipoment, late, status, car...
122341    [support, chat, customer, resolution, feedback...
122342    [quality, product, damaged, received, refund, ...
122343                 [prime, product, offer, price, sale]
122344               [email, orders, details, bank, access]
Length: 122345, dtype: object

In [79]:
# Find the index locations of specific Tweets
# get_indices = []
def report_index_loc(tweet, intent_name):
    ''' Takes in the tweet to find the index for and returns a report of that tweet index along with what the 
    representative Tweet looks like'''
    try:
        tweets = [] # List which stores tuples of indexes of representative tweets AND a boolean value to indicate if the tweet has the intent we are looking for
        for i, j in enumerate(tokenized_processed_inbound_extra):
            if j == tweet:
                tweets.append((i, True))
            else:
                tweets.append((i, False))

        indices = [i[0] for i in tweets if i[1] == True]
        # get_indices.append(indices.append(i[0]) if i[1] == True else False for i in tweets)

        preview = processed_inbound_extra.iloc[indices]

        # Appending to indexes for dictionary 
        tags.append(str(indices[0]))

    except IndexError as e:
        print("Index not in list, move on")
        return

    return intent_name, str(indices[0]), preview


# Reporting and storing indexes with the function
print("TAGGED INDEXES TO LOOK FOR")
for j, i in intents_repr.items():
    try:
        print('\n{} \nIndex: {}\nPreview: {}'.format(*report_index_loc(i, j)))
    except Exception as e:
        print("Index ended")

TAGGED INDEXES TO LOOK FOR

track 
Index: 122340
Preview: 122340    tracking order shipoment late status carrier u...
dtype: object

support 
Index: 122341
Preview: 122341    support chat customer resolution feedback sati...
dtype: object

quality 
Index: 122342
Preview: 122342    quality product damaged received refund return...
dtype: object

discount 
Index: 122343
Preview: 122343    prime product offer price sale
dtype: object

account 
Index: 122344
Preview: 122344    email orders details bank access
dtype: object


In [66]:
inferred_vector = model.infer_vector(tokenized_processed_inbound_extra[122342])
inferred_vector

array([-0.3733561 ,  0.02103804, -0.11288577,  0.10018096,  0.02359817,
        0.20761223,  0.13943799, -0.07680623,  0.8658018 , -0.05790734,
       -0.07379842, -0.91106176, -0.29489738, -0.16348544,  0.17043671,
        0.04266841, -0.37820813,  0.34814504,  0.0160022 , -0.33846483],
      dtype=float32)

In [71]:
tokenized_processed_inbound_extra[122342]

['quality',
 'product',
 'damaged',
 'received',
 'refund',
 'return',
 'issue',
 'order',
 'packaging',
 'proper',
 'working',
 'expected',
 'different']

In [70]:
# Great! Now I can get the training data for my battery intent (as an example)
similar_doc = model.wv.most_similar(inferred_vector, topn = 1000)
# Preview
similar_doc[:5]

[('tyrwhitt', 0.7233047485351562),
 ('charles', 0.7212222814559937),
 ('poundmax', 0.7111672759056091),
 ('biotique', 0.7026564478874207),
 ('honeywell', 0.6814860105514526)]

In [58]:
processed_inbound_extra[122340]

'tracking order shipoment late status carrier update number info received details'

In [54]:
tokenized_processed_inbound_extra[122340]

['tracking',
 'order',
 'shipoment',
 'late',
 'status',
 'carrier',
 'update',
 'number',
 'info',
 'received',
 'details']

In [62]:
tokenized_processed_inbound_extra

0         [different, people, have, given, different, an...
1         [way, to, drop, the, ball, on, customer, servi...
2         [i, want, my, amazon, payments, account, close...
3         [yeah, this, is, crazy, were, less, than, a, w...
4         [how, about, you, guys, figure, out, my, xbox,...
                                ...                        
122340    [tracking, order, shipoment, late, status, car...
122341    [support, chat, customer, resolution, feedback...
122342    [quality, product, damaged, received, refund, ...
122343                 [prime, product, offer, price, sale]
122344               [email, orders, details, bank, access]
Length: 122345, dtype: object

In [74]:
tags

['None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'False',
 'False',
 'False',
 'False',
 'False',
 'False',
 'False',
 'False',
 'False',
 'False',
 'None',
 'None',
 'None',
 'None',
 'None',
 '122340',
 '122341',
 '122342',
 '122343',
 '122344']

## Data Synthesis 

Here's is the catch. Have intent buckets inplace already in place. You would need to supply this to the below function to get your top N tweets corresponding to the current tweet


In [None]:
# Dictionary mapping the intent to the row index 
intent_itags = {
    
}

# Storing tags in order of the dictionary above 


def generate_intent(nsim, idx_tag): 
    '''Function that maps an index tag to an intent and returns nsim number of similar tweets'''
    sim_docs = model.docvecs.most_similar(idx_tag, topn = nsim)
    
    # Getting just the indexes 
    indexes = [int(i[0]) for i in sim_docs]
    
    # Actually seeing the top 1000 tweets similar to 0th tweet which seems to be about updates 
    # print(processed_inbound_extra[indexes])
    return indexes
    
# Create a dictionary mapping the intent to the row index of tweets
index_intents = {}
for intent, tag in intent_itags.items():
    print('Intent: ', intent)
    index_indents[intent] = generate_intent(1000, tag)
    print('\n')

In [None]:
# Now map the index to each row of the preprocessed inbound data
preprocessed_inbound["intent"] = processed_inbounnd.index.map(index_intents)



In [None]:
# Intent classification with Keras 
