In [1]:
import pandas as pd
import re
from sklearn.preprocessing import scale
from nltk.corpus import wordnet
import sys
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import gensim
from gensim.models import Word2Vec
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
import multiprocessing
from keras.models import Sequential, Model 
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

Using TensorFlow backend.


In [2]:
# !{sys.executable} -m pip install wordcloud

In [3]:
# !{sys.executable} -m pip install nltk

In [4]:
# nltk.download()

In [5]:
# !conda install --yes --prefix {sys.prefix} gensim

In [6]:
# read data from file
def read_file(encode):
    with open(r"ra_data_classifier.csv", encoding = encode, errors='ignore', newline='') as f:
        df = pd.read_csv(f)
        df = df.drop(columns=['hid'])
    return df

In [7]:
# data preprocessing
def clean_text(text):
    """
    pre-processing on the given text.
    
    Steps:
    - Removing punctuation
    - Lowering text
    """
    # remove leading and trailing whitespaces, newline and tab characters
    # convert text to lowercase
    text = text.strip().lower()
    
    # remove the chracters [\], ['], [+], ["]
    text = re.sub(r"\+", "", text)
    text = re.sub(r"\\", "", text)
    text = re.sub(r"\'", "", text)
    text = re.sub(r"\"", "", text)
    
    # replace email address with "emailadd"
    text = re.sub('\S+@\S+', "emailadd ", text)
    
    # replace "$numeric / month", "$numeric/month", "$numeric/m", "$numeric / m" to rentfee 
    p = re.compile(r"(?:\$\d+\/)month|(?:\$\d+\s+\/\s+)month|(?:\$\d+\s+\/\s+)m|(?:\$\d+\/)m|(?:\S\d+)|(?:\$\d+\/)mo|(?:\$\d+\s+\/\s+)mo")
    text = p.sub("rentfee ", text)
                 
    # replace phone number with "phonenumber"
    text = re.sub(r"\(?\d{3}\)?[-.\s]\d{3}[-.\s]\d{4}", "phonenumber", text)

    # remove non-alphabet characters
    text = re.sub('[^a-z]+', ' ', text)
    
    return text  

In [8]:
# Lemmatize with POS Tag
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [9]:
# tokenize the sentence and lemmatize the words
def tokenization(df):
    # grab all cleaned_chunk
    tokens = []
    df['token'] = df['clean_chunk']
    
    # tokenize the string into words
    for c in df['clean_chunk']:
        token = word_tokenize(c)
        tokens.append(token)
        
    # remove none-alphabetic tokens, such as punctuation
    for token in tokens:
        token = [word for word in token if word.isalpha()]
    
    # filter out stopwords
    stop_words = set(stopwords.words('english'))
    
    for token in tokens:
        token = [word for word in token if not word in stop_words]
        
    # lemmatization
    lemmatizer = WordNetLemmatizer()
    for i in range(len(tokens)):
        temp = []
        for j in range(len(tokens[i])):
            tokens[i][j] = lemmatizer.lemmatize(tokens[i][j], get_wordnet_pos(tokens[i][j]))
            
    return tokens

In [10]:
# WORD2VEC_CBOW()
def word2vec(tokened_data, cbow):
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
                                                                # 
    cores = multiprocessing.cpu_count() # Count the number of cores in a computer, important for a parameter of the model
    if cbow == 1: # use CBOW
        w2v_model = Word2Vec(min_count=20,
                             window=2,
                             size=300,
                             sample=6e-5, 
                             alpha=0.03, 
                             min_alpha=0.0007, 
                             negative=20,
                             workers=cores-1)
    else: # use skipgram
        w2v_model = Word2Vec(min_count=20,
                             window=2,
                             size=300,
                             sample=6e-5, 
                             alpha=0.03, 
                             min_alpha=0.0007, 
                             negative=20,
                             workers=cores-1,
                             sg=1)
    # BUILD_VOCAB()
    w2v_model.build_vocab(tokened_data, progress_per=1000)

    # TRAIN()
    w2v_model.train(tokened_data, total_examples=w2v_model.corpus_count, epochs=10000, report_delay=1)
    return w2v_model

In [11]:
def plot_word_cloud(terms):
    text = terms.index
    text = ' '.join(list(text))
    # lower max_font_size
    wordcloud = WordCloud(max_font_size=40).generate(text)
    plt.figure(figsize=(25, 25))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

In [12]:
def buildWordVector(tokens, size, w2v_model, tfidf):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += w2v_model[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [13]:
# Neural network
from keras.models import Sequential, Model 
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Input, Embedding
def get_model(trainX, trainY):
    model = Sequential()

    model.add(Dense(128, activation='relu', input_dim=300))
    model.add(Dropout(0.7)) # to prevent overfitting
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adadelta',loss='binary_crossentropy',metrics=['accuracy'])
    model.fit(trainX, trainY, epochs=30, batch_size=50, verbose=0)
    return model

In [14]:
# ENCODING = "latin-1"
# ENCODING = "ISO-8859-1"
ENCODING = "utf-8"

#read file
data = read_file(ENCODING)
data.head()

Unnamed: 0,chunk,has_space
0,"Landmark Center, 8th Fl",0
1,Contact: The C3 team at MakemeC3@cic.us -- Add...,0
2,"A powerful tool for developers, the MySQL Data...",0
3,"Easy access to T, Hubway, and parking",0
4,Check out our Private Offices,1


In [15]:
# data cleaning
cleaned_chunk = []
num_row = data.shape[0]
for i in range(num_row):
    cleaned_chunk.append(clean_text(data['chunk'][i]))

# add clean_chunk column to dataframe        
data['clean_chunk'] = pd.DataFrame(cleaned_chunk)

# tokenize the cleaned chunk
tokenized_chunk = tokenization(data)

# add tokenized chunk column to the data
for c in range(len(tokenized_chunk)):
    data['token'][c] = tokenized_chunk[c]
data[:10]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


Unnamed: 0,chunk,has_space,clean_chunk,token
0,"Landmark Center, 8th Fl",0,landmark center th fl,"[landmark, center, th, fl]"
1,Contact: The C3 team at MakemeC3@cic.us -- Add...,0,contact the rentfee team at emailadd additiona...,"[contact, the, rentfee, team, at, emailadd, ad..."
2,"A powerful tool for developers, the MySQL Data...",0,a powerful tool for developers the mysql datab...,"[a, powerful, tool, for, developer, the, mysql..."
3,"Easy access to T, Hubway, and parking",0,easy access to t hubway and parking,"[easy, access, to, t, hubway, and, parking]"
4,Check out our Private Offices,1,check out our private offices,"[check, out, our, private, office]"
5,\r\r\r\r \r By Michael Carne...,0,by michael carney written on june rentfee rent...,"[by, michael, carney, write, on, june, rentfee..."
6,\r\t\t\t\t\t\t\t\t\t\tWorkbar۪s coworking spac...,1,workbar s coworking spaces provide the right b...,"[workbar, s, coworking, space, provide, the, r..."
7,"۝We went from 3,000 sq ft to 13,000 sq ft. Tha...",0,we went from rentfee sq ft to rentfee rentfee...,"[we, go, from, rentfee, sq, ft, to, rentfee, r..."
8,"Common space / kitchen, available for use day ...",0,common space kitchen available for use day and...,"[common, space, kitchen, available, for, use, ..."
9,Workbar Union $350 / month full-time open wor...,1,workbar union rentfee full time open workspace...,"[workbar, union, rentfee, full, time, open, wo..."


In [16]:
# defining the x (input), and the y (output)
y = data['has_space'].values
x = np.array(data["token"])

# split train(80%), test set(20%)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [17]:
# vectorize the data
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x for x in x_train])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print ('vocab size :', len(tfidf))
# plot word cloud
tfidf2 = pd.DataFrame(columns=['tfidf']).from_dict(dict(tfidf), orient='index')
tfidf2.columns = ['tfidf']
# print("The most frequent words in the file are below.\n")
# plot_word_cloud(tfidf2.sort_values(by=['tfidf'], ascending=True).head(100))

vocab size : 29


## word2vec model with CBOW

In [18]:
# create word2vec model
w2v_model_cbow = word2vec(data['token'], 1)

In [19]:
# generate word2vec train and test data
train_vecs_w2v = np.concatenate([buildWordVector(z, 300, w2v_model_cbow, tfidf) for z in map(lambda x: x, x_train)])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, 300, w2v_model_cbow, tfidf) for z in map(lambda x: x, x_test)])
test_vecs_w2v = scale(test_vecs_w2v)

print ('shape for training set : ',train_vecs_w2v.shape,
      '\nshape for test set : ', test_vecs_w2v.shape)

shape for training set :  (80, 300) 
shape for test set :  (20, 300)


  
  


### Neural Network

In [20]:
# fit model
model = get_model(train_vecs_w2v, y_train)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               38528     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 38,657
Trainable params: 38,657
Non-trainable params: 0
_________________________________________________________________


In [21]:
# predict probabilities for test set
test_probs = model.predict(train_vecs_w2v, verbose=0)
test_classes = model.predict_classes(test_vecs_w2v, verbose=0)

In [22]:
# reduce to 1d array
test_probs = test_probs[:,0]
test_classes = test_classes[:, 0]

In [23]:
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, test_classes)
print('Accuracy: %f' % accuracy)
# precision: tp / (tp + fp)
precision = precision_score(y_test, test_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, test_classes)
print('Recall: %f' % recall)
# f1: 2 * (precision * recall) / (precision + recall)
f1 = f1_score(y_test, test_classes)
print('F1 score: %f' % f1)
# confuson matrix
matrix = confusion_matrix(y_test, test_classes)
tn, fp, fn, tp = matrix.ravel()
print('confusion matrix: \n', matrix)
print("true negative : ", tn, "\nfalse positive : ", fp ,"\nfalse negative : ",  fn, "\ntrue positive : ", tp)

Accuracy: 0.850000
Precision: 0.750000
Recall: 0.857143
F1 score: 0.800000
confusion matrix: 
 [[11  2]
 [ 1  6]]
true negative :  11 
false positive :  2 
false negative :  1 
true positive :  6


## word2vec model with skipgram

In [30]:
# create word2vec model
w2v_model_sg = word2vec(data['token'], 0)

In [31]:
# generate word2vec train and test data
train_vecs_w2v_sg = np.concatenate([buildWordVector(z, 300, w2v_model_sg, tfidf) for z in map(lambda x: x, x_train)])
train_vecs_w2v_sg = scale(train_vecs_w2v)

test_vecs_w2v_sg = np.concatenate([buildWordVector(z, 300, w2v_model_sg, tfidf) for z in map(lambda x: x, x_test)])
test_vecs_w2v_sg = scale(test_vecs_w2v)

print ('shape for training set : ',train_vecs_w2v_sg.shape,
      '\nshape for test set : ', test_vecs_w2v_sg.shape)

# fit model
model_sg = get_model(train_vecs_w2v_sg, y_train)

# predict probabilities for test set
test_probs_sg = model_sg.predict(train_vecs_w2v_sg, verbose=0)
test_classes_sg = model_sg.predict_classes(test_vecs_w2v_sg, verbose=0)

# reduce to 1d array
test_probs_sg = test_probs_sg[:,0]
test_classes_sg = test_classes_sg[:, 0]

  
  


shape for training set :  (80, 300) 
shape for test set :  (20, 300)


In [32]:
# accuracy: (tp + tn) / (p + n)
accuracy_sg = accuracy_score(y_test, test_classes_sg)
print('Accuracy: %f' % accuracy_sg)
# precision: tp / (tp + fp)
precision_sg = precision_score(y_test, test_classes_sg)
print('Precision: %f' % precision_sg)
# recall: tp / (tp + fn)
recall_sg = recall_score(y_test, test_classes_sg)
print('Recall: %f' % recall_sg)
# f1: 2 * (precision * recall) / (precision + recall)
f1_sg = f1_score(y_test, test_classes_sg)
print('F1 score: %f' % f1_sg)
# confuson matrix
matrix_sg = confusion_matrix(y_test, test_classes_sg)
tn_sg, fp_sg, fn_sg, tp_sg = matrix_sg.ravel()
print('confusion matrix: \n', matrix_sg)
print("true negative : ", tn_sg, "\nfalse positive : ", fp_sg ,"\nfalse negative : ",  fn_sg, "\ntrue positive : ", tp_sg)

Accuracy: 0.850000
Precision: 0.750000
Recall: 0.857143
F1 score: 0.800000
confusion matrix: 
 [[11  2]
 [ 1  6]]
true negative :  11 
false positive :  2 
false negative :  1 
true positive :  6


## compare the CBOW model and Skipgram model

In [33]:
compare_result = pd.DataFrame({'CBOW': [accuracy, f1, precision, recall], 'Skipgram': [accuracy_sg, f1_sg, precision_sg, recall_sg]})

In [34]:
compare_result.rename(index={0:'accurcy',1:'F1 score', 2:'precision', 3:'recall'}, inplace=True)
compare_result

Unnamed: 0,CBOW,Skipgram
accurcy,0.85,0.85
F1 score,0.8,0.8
precision,0.75,0.75
recall,0.857143,0.857143


In [35]:
validation_dict = {'tokenized chunk': x_test, 'has_space': y_test, 'predicted has_space_CBOW': test_classes, 'predicted has_space_skipgram' : test_classes_sg}
validation_data = pd.DataFrame(validation_dict)
validation_data

Unnamed: 0,tokenized chunk,has_space,predicted has_space_CBOW,predicted has_space_skipgram
0,"[prestigious, kendall, square, business, addre...",0,0,0
1,"[let, s, grow, our, business, together, a, a, ...",0,0,0
2,"[a, powerful, tool, for, developer, the, mysql...",0,0,0
3,"[on, site, gym, come, soon]",0,0,0
4,"[workshop, brookline, rentfee, full, time, mem...",1,1,1
5,"[if, you, would, like, to, host, an, event, in...",0,1,1
6,"[learnlaunch, campus, unlisted, pricing, for, ...",1,1,1
7,"[the, majority, of, our, host, location, be, i...",1,1,1
8,"[locate, in, a, historic, brick, and, beam, bu...",1,1,1
9,"[cambridge, coworking, community, rentfee]",0,0,0
