In [10]:
import torch
import pandas as pd
import numpy as np
import json, re

# Torch, Sklearn imports
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
print(torch.__version__)

## NLP libs
from nltk import download
import gensim
from nltk.corpus import stopwords
download('stopwords')

0.4.1
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rsilvei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Importing SNIPS intent dataset

In [11]:
# Empty frame with the two columns used below: the utterance text ('phrase')
# and the label it belongs to ('intent').
dataset = pd.DataFrame(
    columns=['phrase', 'intent'],
)

In [12]:
# Load the training utterances of every intent and build `dataset` from them.
#
# Rows are collected in a plain list and converted to a DataFrame once at the
# end. The previous per-row `DataFrame.append` copied the whole frame on every
# call (quadratic) and no longer exists in pandas >= 2.0. Rebuilding `dataset`
# from the list also makes the cell idempotent: running it twice does not
# duplicate rows.
records = []
for intent in ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook', 'SearchCreativeWork',
               'SearchScreeningEvent']:
    with open("./2017-06-custom-intent-engines/" + intent + "/train_" + intent + ".json",
              encoding='cp1251') as data_file:
        data = json.load(data_file)
    print("Intent: {}, Length: {}".format(intent, len(data[intent])))
    for utterance in data[intent]:
        # Each utterance is stored as a list of chunks under 'data'; the full
        # phrase is the concatenation of the chunks' 'text' fields, in order.
        text = ''.join(chunk['text'] for chunk in utterance['data'])
        records.append({'phrase': text, 'intent': intent})

# Explicit columns keep the frame's layout stable even if no record was loaded.
dataset = pd.DataFrame(records, columns=['phrase', 'intent'])

Intent: AddToPlaylist, Length: 300
Intent: BookRestaurant, Length: 300
Intent: GetWeather, Length: 300
Intent: PlayMusic, Length: 300
Intent: RateBook, Length: 300
Intent: SearchCreativeWork, Length: 300
Intent: SearchScreeningEvent, Length: 300


In [13]:
# Distinct intent labels present in the loaded frame.
dataset['intent'].unique()

array(['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic',
       'RateBook', 'SearchCreativeWork', 'SearchScreeningEvent'],
      dtype=object)

In [14]:
def transformText(text, do_stop=False, do_stem=False):
    """Normalise a raw utterance into lower-case, punctuation-free text.

    Steps, in order: lower-case the input; delete "'s", "’s", "?" and "-";
    replace every non-ASCII character with a space; collapse whitespace;
    optionally drop English stop words; strip punctuation with gensim's
    ``strip_punctuation2``; collapse whitespace again; optionally stem.

    Args:
        text: The string to clean.
        do_stop: When truthy, remove NLTK English stop words.
        do_stem: When truthy, stem the result with gensim's ``stem_text``.

    Returns:
        The cleaned string.
    """
    text = text.lower()

    # Delete possessive suffixes, question marks and hyphens outright
    # (no space is inserted, so "k-pop" becomes "kpop").
    for fragment in ("'s", "’s", "?", "-"):
        text = text.replace(fragment, "")

    # Removing non ASCII chars
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)

    if do_stop:
        # Build the stop-word set only when it is actually used; previously it
        # was rebuilt on every call, even with do_stop=False.
        # NOTE(review): filtering runs before punctuation is stripped, so a
        # token such as "the," does not match the stop word "the".
        stops = set(stopwords.words("english"))
        text = " ".join(word for word in text.split() if word not in stops)
    else:
        text = " ".join(text.split())

    # Remove the punctuation, then tidy up the whitespace once more.
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)

    if do_stem:
        text = gensim.parsing.preprocessing.stem_text(text)
    return text

def strip_punctuation(s):
    """Return ``s`` with every character that appears in ``PUNCT`` removed.

    NOTE(review): ``PUNCT`` is not defined in the visible part of the notebook,
    and a second ``strip_punctuation`` later in this same cell rebinds the
    name, so this version is shadowed once the cell has run.
    """
    kept = [ch for ch in s if ch not in PUNCT]
    return ''.join(kept)

## Lemmatization function based on Spacy Library
def lemmatizer_spacy(text):
    """Replace each token of ``text`` by its lemma and join them with spaces.

    Tokens whose lemma is the placeholder "-PRON-" keep their original text.

    NOTE(review): ``spacy_en`` is not defined in the visible part of the
    notebook; presumably a loaded spaCy English pipeline -- confirm.
    """
    return " ".join(
        token.text if token.lemma_ == "-PRON-" else token.lemma_
        for token in spacy_en(text)
    )

def strip_punctuation(s):
    """Return ``s`` with all ASCII punctuation characters removed.

    Args:
        s: The string (or iterable of one-character strings) to filter.

    Returns:
        A string made of the characters of ``s`` that are not in
        ``string.punctuation``; an empty input yields ``''``.
    """
    # Imported here because the notebook's import cell never brings
    # `punctuation` into scope, so the bare name raised NameError on a fresh
    # kernel.
    from string import punctuation
    return ''.join(c for c in s if c not in punctuation)

In [15]:
# Store the cleaned phrase in a new column, using transformText's defaults
# (no stop-word removal, no stemming).
dataset['preproc_text'] = dataset['phrase'].apply(transformText)

In [16]:
# Display the frame to compare the new preproc_text column with the raw phrases.
dataset

Unnamed: 0,phrase,intent,preproc_text
0,"add Stani, stani Ibar vodo songs in my playlis...",AddToPlaylist,add stani stani ibar vodo songs in my playlist...
1,add this album to my Blues playlist,AddToPlaylist,add this album to my blues playlist
2,Add the tune to the Rage Radio playlist.,AddToPlaylist,add the tune to the rage radio playlist
3,Add WC Handy to my Sax and the City playlist,AddToPlaylist,add wc handy to my sax and the city playlist
4,Add BSlade to women of k-pop playlist,AddToPlaylist,add bslade to women of kpop playlist
5,add the current tune to my Rock Gaming playlist,AddToPlaylist,add the current tune to my rock gaming playlist
6,add villotta to The MetalSucks Playlist playlist,AddToPlaylist,add villotta to the metalsucks playlist playlist
7,Add Bronislau Kaper to the drive playlist.,AddToPlaylist,add bronislau kaper to the drive playlist
8,add the artist joseph meyer to my mad cool fes...,AddToPlaylist,add the artist joseph meyer to my mad cool fes...
9,Add the avispa track to my Bass Gaming playlist.,AddToPlaylist,add the avispa track to my bass gaming playlist
