In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd 
import numpy, textblob, string, random
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

from sklearn.externals import joblib

Using TensorFlow backend.


In [3]:
path = 'G:/My Drive/NCSU/Hackathons/HackNC2019/MyPueblo/backend/'
df = pd.read_csv(path + 'data/house_price/train.csv')

df.head()

Unnamed: 0,Description,Cost
0,Urban Solutions,1921
1,Sumitomo Mitsui Financial Group,546
2,Talquin Electric Cooperative,405
3,Big Crew Maintenance,686
4,Cavasa,1403


In [4]:
set(df['Category'].values.tolist())

KeyError: 'Category'

In [5]:
df.sample(frac=1)

Unnamed: 0,Description,Cost
4,Cavasa,1403
90,Harmony Improvements,1240
12,Black Ace Pest,1842
37,Municipal Electric Authority of Georgia (MEAG ...,1592
48,Ambassador Pest Management,1261
...,...,...
7,Palm Peach,934
57,Indiana Municipal Power Agency,1856
64,New Era Pest Control,1545
60,SunTrust Banks,1454


In [6]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['Description'], df['Category'])

In [7]:
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [8]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(df['Description'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
                vocabulary=None)

In [9]:
# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [10]:
train_x

277         Low Big Home Repairs
106           Safer Pest Control
269             Handyman Matters
371             Sterling Bancorp
157                     Suresafe
                 ...            
268            Handy Home Repair
122    Emergency Pest Patrol LLC
198                      Advanta
213             Call me Handyman
245       Urban Appliance Repair
Name: Description, Length: 306, dtype: object

In [31]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(df['Description'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(df['Description'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

In [11]:
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(df['Description'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

In [20]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open(path+'/workspace/wiki-news-300d-1M.vec/wiki-news-300d-1M.vec', encoding="utf-8")):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

In [12]:
# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(df['Description'])
word_index = token.word_index

In [13]:
# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=70)

In [21]:
# create token-embedding mapping
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [23]:
df['char_count'] = df['Description'].apply(len)
df['word_count'] = df['Description'].apply(lambda x: len(x.split()))
df['word_density'] = df['char_count'] / (df['word_count']+1)
df['punctuation_count'] = df['Description'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
df['title_word_count'] = df['Description'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
df['upper_case_word_count'] = df['Description'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [25]:
valid_y

array([3, 2, 3, 1, 2, 0, 2, 3, 1, 2, 1, 1, 2, 1, 1, 0, 0, 3, 2, 3, 3, 2,
       3, 0, 3, 2, 3, 0, 0, 1, 3, 0, 2, 2, 0, 3, 3, 2, 0, 2, 0, 0, 3, 0,
       0, 2, 2, 1, 3, 3, 2, 1, 1, 0, 3, 2, 3, 1, 3, 3, 0, 0, 1, 3, 3, 3,
       3, 2, 2, 2, 0, 2, 0, 2, 0, 0, 3, 0, 2, 2, 2, 3, 3, 0, 2, 2, 3, 1,
       2, 0, 3, 0, 1, 0, 0, 3, 0, 1, 1, 0, 0, 0])

In [26]:
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_ 
vocab = count_vect.get_feature_names()

# view the topic models
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

In [29]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

In [32]:
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print ("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.8823529411764706
NB, WordLevel TF-IDF:  0.8627450980392157
NB, N-Gram Vectors:  0.3627450980392157
NB, CharLevel Vectors:  0.8725490196078431


In [34]:
def create_rnn_gru():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the GRU Layer
    lstm_layer = layers.GRU(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_rnn_gru()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print ("RNN-GRU, Word Embeddings",  accuracy)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/1
RNN-GRU, Word Embeddings 0.29411764705882354


In [35]:
NB_model = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)

In [37]:
NB_model = naive_bayes.MultinomialNB().fit(xtrain_count, train_y) 

In [42]:
NB_model_predictions = NB_model.predict(xvalid_count)
print(len(NB_model_predictions),NB_model_predictions)
print(metrics.accuracy_score(NB_model_predictions, valid_y))

102 [3 1 3 1 2 0 1 3 1 2 1 1 2 1 1 0 0 3 1 3 3 2 1 0 3 2 3 0 0 1 3 0 2 2 0 3 3
 2 0 2 0 1 3 0 0 1 2 1 3 3 2 1 1 0 3 1 3 1 3 3 0 0 1 3 3 1 3 2 2 2 0 2 0 2
 0 0 3 0 2 3 1 3 3 0 2 2 3 1 2 0 3 3 1 0 0 3 0 1 2 0 0 0]
0.8823529411764706


In [54]:
final_count =  count_vect.transform(df['Description'])
final_y =  encoder.fit_transform(df['Category'])

final_NB_model = naive_bayes.MultinomialNB().fit(final_count, final_y) 

In [55]:
NB_final_model_predictions = final_NB_model.predict(final_count)
print(metrics.accuracy_score(NB_final_model_predictions, final_y))

0.9901960784313726


In [58]:
# save the model to disk
filename = 'finalized_model.sav'
joblib.dump(final_NB_model, path+'model/'+filename)

['G:/My Drive/NCSU/Hackathons/HackNC2019/HackNC2019/mypueblo/backend/model/finalized_model.sav']