In [1]:
# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Other
import re
import string
import numpy as np
import pandas as pd
import unicodedata
import spacy

from sklearn.manifold import TSNE
from contractions import CONTRACTION_MAP

#nlp = spacy.load('en_core', parse = True, tag=True, entity=True)
#nlp_vec = spacy.load('en_vecs', parse = True, tag=True, entity=True)
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
# English stop words, minus the negations "no" and "not": they are removed from
# the list so that remove_stopwords keeps them in the text.
stopword_list = stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')
# Word tokenizer shared by the text-cleaning helpers below.
tokenizer = ToktokTokenizer()

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the train and test splits; both files use '~' as the field separator.
train, test = (
    pd.read_csv(csv_path, sep='~') for csv_path in ("train.csv", "test.csv")
)

In [3]:
# Encode the target variable: keep only labelled rows, then map the textual
# classes to integers (Good -> 1, Bad -> 0).
stars_dict = {'Good': 1, 'Bad': 0}
train = train.loc[train['Is_Response'].notnull()]
labels = train['Is_Response'].replace(stars_dict, regex=True)
train["Is_Response"] = labels

In [4]:
#Remove accented characters
def remove_accented_chars(text):
    """Return `text` with accented characters reduced to plain ASCII.

    The string is NFKD-normalised so that accented letters are split into a
    base letter plus combining marks; encoding to ASCII with errors ignored
    then drops every non-ASCII code point.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [5]:
#Expand the contractions
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in `text` and strip any remaining apostrophes.

    Parameters
    ----------
    text : str
        Text to process.
    contraction_mapping : dict
        Maps a contraction (e.g. "can't") to its expansion (e.g. "cannot").

    Returns
    -------
    str
        `text` with every known contraction replaced by its expansion. The
        first character of the matched text is kept so the original
        capitalisation survives. All apostrophes left over are removed.
    """
    # Empty keys would make the pattern match the empty string everywhere.
    keys = [key for key in contraction_mapping.keys() if key]
    if not keys:
        # Nothing to expand; still apply the apostrophe stripping.
        return re.sub("'", "", text)

    # Longest keys first: regex alternation takes the first alternative that
    # matches, so "can't" listed before "can't've" would shadow the longer one.
    # Keys are escaped so they are always matched literally.
    keys.sort(key=len, reverse=True)
    contractions_pattern = re.compile('({})'.format('|'.join(re.escape(key) for key in keys)),
                                      flags=re.IGNORECASE|re.DOTALL)
    # Fallback lookup for matches whose casing differs from every key.
    lowercase_mapping = {key.lower(): value for key, value in contraction_mapping.items()}

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower())
                                or lowercase_mapping.get(match.lower()))
        if not expanded_contraction:
            # No usable expansion: leave the matched text unchanged.
            return match
        # Keep the casing of the first character as written in the text.
        return match[0] + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [6]:
#Remove special character
def remove_special_characters(text, remove_digits=False):
    """Strip every character that is not a letter, digit or whitespace.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_digits : bool, default False
        When True, digits are removed as well.

    Returns
    -------
    str
        The cleaned text.
    """
    # The range must be A-Z: the former A-z also spans the ASCII characters
    # between 'Z' and 'a' ( [ \ ] ^ _ ` ), which were therefore never removed.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [7]:
# Drop rows that have no review text, then re-derive the labels from the
# filtered frame: `labels` was captured before this filter, so any dropped row
# would leave features and targets with different lengths at fit time.
train = train[train['Description'].notnull()]
labels = train["Is_Response"]


In [8]:
def simple_stemmer(text):
    """Return `text` with each whitespace-separated word Porter-stemmed.

    Words are re-joined with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

In [9]:
def remove_stopwords(text, is_lower_case=False):
    """Drop stop words from `text` and return the remaining tokens joined by spaces.

    Parameters
    ----------
    text : str
        Text to filter; it is split with the module-level `tokenizer`.
    is_lower_case : bool, default False
        True when `text` is already lower-cased, in which case tokens are
        compared against `stopword_list` as-is; otherwise each token is
        lower-cased for the comparison only.
    """
    kept_tokens = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        lookup_form = token if is_lower_case else token.lower()
        if lookup_form not in stopword_list:
            # The token is kept in its original casing.
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

Build a Text Normalizer

In [10]:
def normalize_corpus(corpus, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    """Normalise every document in `corpus` and return them as a list.

    Parameters
    ----------
    corpus : iterable
        Documents to normalise. Missing values (None/NaN) are treated as an
        empty string and other non-string values are converted with `str`, so
        the result always has one entry per input document.
    contraction_expansion : bool
        Expand contractions via `expand_contractions`.
    accented_char_removal : bool
        Reduce accented characters to ASCII via `remove_accented_chars`.
    text_lower_case : bool
        Lower-case the text.
    special_char_removal : bool
        Remove non-alphanumeric characters via `remove_special_characters`.
    stopword_removal : bool
        Drop stop words via `remove_stopwords`.
    remove_digits : bool
        Forwarded to `remove_special_characters`; only has an effect when
        `special_char_removal` is True.

    Returns
    -------
    list of str
        The normalised documents, in input order.
    """
    # Compiled once rather than once per document.
    # A character class needs no '|': the former [\r|\n|\r\n] also matched a
    # literal pipe character.
    newline_pattern = re.compile(r'[\r\n]+')
    # The hyphen is escaped so it is matched literally; written as "(-)" inside
    # a class it is a range from '(' to ')' and the hyphen itself never matched.
    special_char_pattern = re.compile(r'([{.()\-!}])')
    whitespace_pattern = re.compile(' +')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # guard against missing / non-text entries
        if not isinstance(doc, str):
            doc = '' if pd.isnull(doc) else str(doc)

        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions    
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = newline_pattern.sub(' ', doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace (applies whether or not special characters
        # were stripped)
        doc = whitespace_pattern.sub(' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [11]:
# Normalise the review text of both splits.
train['Description'] = normalize_corpus(train['Description'])
# Only `train` had its null descriptions dropped, and test rows cannot be
# dropped because each one needs a prediction; substitute an empty string so a
# missing description does not raise inside the text-cleaning helpers.
test['Description'] = normalize_corpus(test['Description'].fillna(''))

In [27]:
#Tokenization process
vocabulary_size = 50000
maxlen = 160
# Use a dedicated name for the Keras tokenizer: `tokenizer` is the
# ToktokTokenizer that remove_stopwords calls, and rebinding it here makes any
# later re-run of the normalisation cell fail.
keras_tokenizer = Tokenizer(num_words=vocabulary_size)
# Fit the vocabulary on the train and test descriptions together.
# DataFrame.append is deprecated (removed in pandas 2.0); pd.concat is the
# supported replacement. sort=True is passed explicitly to keep columns sorted,
# which is presumably what the captured FutureWarning is about.
temp = pd.concat([train[['Description']], test], sort=True)
keras_tokenizer.fit_on_texts(temp['Description'])

# Convert each description to a sequence of word indices and pad/truncate to
# `maxlen` entries.
sequences = keras_tokenizer.texts_to_sequences(train['Description'])
data = pad_sequences(sequences, maxlen=maxlen)
sequences = keras_tokenizer.texts_to_sequences(test['Description'])
test_tokenize = pad_sequences(sequences, maxlen=maxlen)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


In [28]:
# Sanity check: (rows, columns) of the combined frame the tokenizer was fitted on.
print(temp.shape)

(38931, 4)


Build neural network with LSTM
Network Architecture
The network starts with an embedding layer, which maps each token to a dense vector so that the network can represent a word in a meaningful way. The layer takes 50000 as its first argument, which is the size of our vocabulary, and 100 as its second argument, which is the dimension of the embeddings. The third parameter, input_length, is 160 (`maxlen`), the length of each padded description sequence. The embedding is followed by dropout, a 1D convolution with max pooling, an LSTM layer, and a single sigmoid output unit.

In [29]:
def create_conv_model():
    """Build and compile the Conv1D + LSTM binary classifier.

    The stack is: embedding (`vocabulary_size` x 100, input length `maxlen`),
    dropout 0.2, Conv1D (64 filters, kernel size 5, ReLU), max pooling
    (pool size 4), LSTM (100 units) and one sigmoid output unit. The model is
    compiled with binary cross-entropy, the Adam optimizer and an accuracy
    metric.

    Returns
    -------
    The compiled Sequential model.
    """
    layer_stack = [
        Embedding(vocabulary_size, 100, input_length=maxlen),
        Dropout(0.2),
        Conv1D(64, 5, activation='relu'),
        MaxPooling1D(pool_size=4),
        LSTM(100),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [30]:
# Train for 3 epochs, holding out 20% of the training rows for validation.
model_conv = create_conv_model()
model_conv.fit(x=data, y=np.array(labels), epochs=3, validation_split=0.2)

Train on 24136 samples, validate on 6035 samples
Epoch 1/3








Epoch 2/3








Epoch 3/3










<keras.callbacks.History at 0x236aa0a86a0>

In [36]:
# Sequential.predict_classes was removed from newer Keras/TensorFlow releases.
# For a single sigmoid output unit, thresholding the predicted probability at
# 0.5 gives the same 0/1 class labels (same (n, 1) shape) and works on both
# old and new versions.
predictions = (model_conv.predict(test_tokenize) > 0.5).astype("int32")

In [39]:
# Build the submission file: one row per test user with the predicted class
# translated back to its textual label.
stars_dict = {1:'Good',0:'Bad'}
out_df = pd.DataFrame({
    "User_ID": test["User_ID"].values,
    # the model returns an (n, 1) array; flatten it to one value per row
    "Is_Response": np.ravel(predictions),
})
# A direct lookup is all that is needed for integer class ids; regex-based
# replace only makes sense for string patterns.
out_df["Is_Response"] = out_df["Is_Response"].map(stars_dict)
out_df.to_csv("submission_LSTM.csv", index=False)
out_df.head()

Unnamed: 0,User_ID,Is_Response
0,9602,Good
1,8749,Good
2,15500,Good
3,5495,Good
4,18570,Good
