In [1]:
from collections import Counter
from datetime import datetime
import json

from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer,text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from IntegratedGradients import * 
from keras.models import load_model
import pickle
import numpy as np
from matplotlib import pyplot as plt
# caleb_path = '/Users/CalebKaijiLu/Documents/yelp_dataset/'
caleb_path = '/home/caleb/schoolwork/yelp_dataset/'
selva_path = ''

Using TensorFlow backend.


### Only Run this on Server. If test trained models, use Gender_Prediction_Test.ipynb

In [2]:
def train_model(balanced_texts,balanced_labels,limit):
    tokenizer = Tokenizer(num_words=20000)
    tokenizer.fit_on_texts(balanced_texts)
    sequences = tokenizer.texts_to_sequences(balanced_texts)
    data = pad_sequences(sequences, maxlen=limit)

    model = Sequential()
    model.add(Embedding(20000, 128, input_length=limit))
    model.add(Dropout(0.2))
    model.add(Conv1D(64, 5, activation='relu'))
    model.add(MaxPooling1D(pool_size=4))
    model.add(LSTM(128,dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.summary()
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(data, np.array(balanced_labels), validation_split=0.5, epochs=3,batch_size = 64)
    return tokenizer,model

In [3]:
t1 = datetime.now()
with open(caleb_path + "review.json") as f:
    reviews = f.read().strip().split("\n")
# reviews = reviews[:10000]
reviews = [json.loads(review) for review in reviews]
print(datetime.now() - t1)

0:01:08.541012


In [5]:
with open(caleb_path + "user_gender.json") as f:
    user_genders = f.read().strip().split("\n")
user_gender = [json.loads(user_gender) for user_gender in user_genders]
user_id_gender = {user['user_id']:user['gender'] for user in user_gender}
print(user_gender[0]['user_id'])


om5ZiponkpRqUNa3pVPiRg


In [9]:
reviews_gender = [review for review in reviews if review['user_id'] in user_id_gender]
genders = [user_id_gender[review_gender['user_id']] for review_gender in reviews_gender]
bin_gender = [0 if gender =='male' else 1 for gender in genders]
text_gender = [review_gender['text'] for review_gender in reviews_gender]

In [16]:
from collections import defaultdict
user_dict = defaultdict(str);
for review_gender in reviews_gender:
    user_dict[review_gender['user_id']] += review_gender['text']
    

In [17]:
text_gender_user = []
bin_gender_user = []
for user in user_dict:
    if len(user_dict[user])>3000:
        text_gender_user.append(user_dict[user])
        bin_gender_user.append(0 if user_id_gender[user]=='male' else 1)
    

In [18]:
text_lengths = [len(text) for text in text_gender_user]
print(sum(text_lengths)/float(len(text_lengths)))
print(len(text_gender_user),len(bin_gender_user))
print(text_gender_user[0],bin_gender_user[0])


12509.800645213982
122440 122440
If you need an inexpensive place to stay for a night or two then you may consider this place but for a longer stay I'd recommend somewhere with better amenities. 

Pros:
Great location- you're right by the train station, central location to get to old town and new town, and right by sight seeing his tours. Food, bars, and shopping all within walking distance. Location, location, location.
Very clean and very good maid service

Cons:
Tiny rooms 
Uncomfortable bed 
Absolutely no amenities 
No phone in room 
No wardrobe 

Was given a lot of attitude about me and my husband sharing a room which was quite strange and we were charged 15 pounds more for double occupancy not sure why that matters I felt like it was a money grab. It was just handled in a kind of odd manner to me... 

If you book this hotel all you get is a bed, desk, and a bathroom. It isn't awful but know what you're getting into.Had an excellent lunch here. I shared the meat board and a large 

In [60]:
balanced_texts_gender = []
balanced_gender = []
limit = 10000  #Change this to grow/shrink the dataset
neg_pos_counts = [0, 0]
for i in range(len(text_gender)):
    polarity = bin_gender[i]
    if neg_pos_counts[polarity]<limit:
        balanced_texts_gender.append(text_gender[i])
        balanced_gender.append(bin_gender[i])
        neg_pos_counts[polarity]+=1
Counter(balanced_gender)

Counter({0: 10000, 1: 10000})

In [28]:
with open("tokenizer_gender_single_server.pickle", "rb") as f:
       tokenizer = pickle.load(f)

model = load_model("yelp_gender_single_model_server.hdf5")

In [23]:
balanced_texts_gender_user = []
balanced_gender_user = []
limit = 40000  #Change this to grow/shrink the dataset
neg_pos_counts = [0, 0]
for i in range(len(text_gender_user)):
    polarity = bin_gender[i]
    if neg_pos_counts[polarity]<limit:
        balanced_texts_gender_user.append(text_gender_user[i])
        balanced_gender_user.append(bin_gender_user[i])
        neg_pos_counts[polarity]+=1
Counter(balanced_gender_user)

Counter({0: 50000, 1: 50000})

## Find the most probable sentence

In [None]:
sequences = tokenizer.texts_to_sequences(balanced_texts_gender)
data = pad_sequences(sequences, maxlen=300)

# get predictions for each of your new texts
predictions = np.array(model.predict(data)).T[0]
predictions_bi = np.array([1 if p<0.5 else 0 for p in predictions])
truth = np.array(balanced_gender)

print(np.sum(np.abs(predictions_bi-truth))/20000.0)


idx_scores = np.argsort(predictions)[:100]
for idx_score in idx_scores:
    if 'wife' not in balanced_texts_gender[idx_score]: 
        print(balanced_texts_gender[idx_score],truth[idx_score],predictions[idx_score])

# Train the models

In [None]:
tokenizer_single, model_single = train_model(balanced_texts_gender,balanced_gender,300)

In [24]:
tokenizer_user, model_user = train_model(balanced_texts_gender_user,balanced_gender_user,3000)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 3000, 128)         2560000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 3000, 128)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2996, 64)          41024     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 749, 64)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               98816     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 2,699,969
Trainable params: 2,699,969
Non-trainable params: 0
_________________________________________________________________


In [None]:
with open("tokenizer_gender_single_server.pickle", "wb") as f:
        pickle.dump(tokenizer_single, f)
model_single.save("yelp_gender_single_model_server.hdf5")

In [25]:
with open("tokenizer_gender_user_server.pickle", "wb") as f:
        pickle.dump(tokenizer_user, f)
model_user.save("yelp_gender_user_model_server.hdf5")