#“In God we trust. All others must bring data.” – W. Edwards Deming, statistician

# Power of Word2Vec

In [None]:
from gensim.models.word2vec import Word2Vec

In [None]:
# Load pre-trained word vectors from the binary word2vec-format file
# 'GoogleNews-vectors-negative300.bin' (path is relative to the working directory).
model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
# Analogy query: king - man + woman; ask for the 5 most similar words.
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

In [None]:
# Analogy query: biggest - big + small; ask for the 5 most similar words.
model.most_similar(positive=['biggest','small'], negative=['big'], topn=5)

# Reading blog posts from the data directory

In [None]:
import os
import pickle


In [None]:
# Relative directory that holds the pickled blog-post lists.
DATA_DIRECTORY = 'data'
print(DATA_DIRECTORY)

In [None]:
# Start with two independent empty lists; both are replaced once the pickles are loaded.
male_post_list, female_post_list = [], []

In [None]:
# Unpickle the male and female post lists from DATA_DIRECTORY.
# NOTE(review): pickle.load can execute arbitrary code; only use files from a trusted source.
with open(os.path.join(DATA_DIRECTORY,"male_blog_list.txt"),"rb") as male_file:
    male_post_list= pickle.load(male_file)
    # number of male posts loaded
    print len(male_post_list)
with open(os.path.join(DATA_DIRECTORY,"female_blog_list.txt"),"rb") as female_file:
    female_post_list = pickle.load(female_file)
    
    


    

            

In [None]:
# Show each list's length together with its entry at index 1 as a sample.
print len(female_post_list),female_post_list[1]
print len(male_post_list),male_post_list[1]



In [None]:
# Class sizes before cleaning: female count, then male count.
print len(female_post_list),len(male_post_list)

In [None]:
from nltk.corpus import stopwords
# English stop-word list from NLTK. It is not referenced again in the visible
# part of this file.
cachedStopWords = stopwords.words("english")

In [None]:
# Keep only the posts that have non-zero length, preserving their order.
clean_male_post_list = [entry for entry in male_post_list if len(entry) != 0]
clean_female_post_list = [entry for entry in female_post_list if len(entry) != 0]

In [None]:
# Class sizes after dropping empty posts: male count, then female count.
print len(clean_male_post_list),len(clean_female_post_list)

In [None]:
# Sanity check: report any zero-length post that survived cleaning
# (male posts are checked first, then female posts).
for cleaned_posts in (clean_male_post_list, clean_female_post_list):
    for post in cleaned_posts:
        if not len(post):
            print("empty")
        
        

# We have the input data, building data model

In [None]:
import numpy as np

In [None]:
# Target vector: 0 for male, 1 for female.
# All male labels come first, then all female labels, so any feature array paired
# with it must be concatenated in the same male-then-female order.
concatenate_array = np.concatenate((np.zeros(len(clean_male_post_list)),np.ones(len(clean_female_post_list))))

In [None]:
len(concatenate_array)

In [None]:
from sklearn.cross_validation import train_test_split
# Hold out 20% of the posts for testing. Posts are concatenated male-then-female to
# line up with concatenate_array. No random_state is passed, so the split is not
# pinned to a seed here.
x_train,x_test,male_female_train,male_female_test = train_test_split(np.concatenate((clean_male_post_list,clean_female_post_list)),concatenate_array,test_size=0.2)

In [None]:
x_train.shape[0],male_female_train.shape[0],x_train

In [None]:
import gensim
# Short alias for gensim's labelled-document container, used by labelizeReviews.
LabeledSentence = gensim.models.doc2vec.LabeledSentence

In [None]:
def labelizeReviews(reviews,label_type):
    """Wrap every non-empty review in a LabeledSentence tagged '<label_type>_<index>'.

    reviews    -- iterable of reviews; each is either raw text or a sequence of tokens.
    label_type -- prefix for the generated labels (e.g. 'TRAIN' or 'TEST').

    Returns a list of LabeledSentence objects. Zero-length reviews are skipped, but
    the index used in the label is the review's position in `reviews`, so labels of
    the remaining reviews are unaffected by skipped ones.
    """
    # str / bytes / unicode, spelled so that it works on both Python 2 and 3
    text_types = (str, bytes, type(u''))
    labelized = []
    for i,v in enumerate(reviews):
        if len(v) == 0:
            continue
        # A raw text review must be split into tokens first: iterating a plain
        # string would hand the model single characters instead of words.
        # Reviews that are already token sequences are passed through unchanged.
        words = v.split() if isinstance(v, text_types) else v
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(words,[label]))
    return labelized

In [None]:
# Label the training and test posts with 'TRAIN_<i>' / 'TEST_<i>' tags.
x_train_label = labelizeReviews(x_train,'TRAIN')
x_test_label = labelizeReviews(x_test,'TEST')

# The two counts differ only if labelizeReviews skipped zero-length posts.
print len(x_train_label),len(x_train)

#We have labelized reviews, now building DBOW and DM models

In [None]:
import random

In [None]:
# Vector dimensionality passed to both Doc2Vec models and to getVecs.
size = 300

In [None]:
# Doc2Vec parameter notes (defaults are as quoted by the original author;
# verify them against the installed gensim version):
#
# dm defines the training algorithm. By default (dm=1), distributed memory is used. Otherwise, dbow is employed.

# size is the dimensionality of the feature vectors.

# window is the maximum distance between the current and predicted word within a sentence.

# alpha is the initial learning rate (will linearly drop to zero as training progresses).

# seed = seed for the random number generator.

# min_count = ignore all words with total frequency lower than this.

# sample = threshold for configuring which higher-frequency words are randomly downsampled;
# default is 0 (off), useful value is 1e-5.
# workers = use this many worker threads to train the model (=faster training with multicore machines).

# hs = if 1 (default), hierarchical softmax will be used for model training (else set to 0).

# negative = if > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn (usually between 5-20).

# dm_mean = if 0 (default), use the sum of the context word vectors. If 1, use the mean. Only applies when dm is used.

# Two models with identical settings except for the algorithm: model_dm keeps the
# default dm, model_dbow passes dm=0.
model_dm = gensim.models.Doc2Vec(min_count=1,window=10,size=size,sample=1e-3,negative=5,workers=20)
model_dbow = gensim.models.Doc2Vec(min_count=1,window=10,size=size,sample=1e-3,negative=5,workers=20,dm=0)

In [None]:
# Build each model's vocabulary from the training and test documents together.
model_dm.build_vocab(np.concatenate((x_train_label,x_test_label)))
model_dbow.build_vocab(np.concatenate((x_train_label,x_test_label)))

In [None]:
# Array form of the labelled training documents, so they can be indexed with a permutation.
x_train_label_np = np.array(x_train_label)

In [None]:
x_train_label_np.shape

In [None]:
# Ten passes over the training documents. Each pass draws one fresh random order
# and feeds that same order to both models.
for epoch in range(10):
    perm = np.random.permutation(x_train_label_np.shape[0])
    model_dm.train(x_train_label_np[perm])
    model_dbow.train(x_train_label_np[perm])

In [None]:
def getVecs(model,corpus,size):
    """Collect one vector per document into a single 2-D array.

    For every item in `corpus` the vector stored in `model` under the item's first
    label (`item.labels[0]`) is reshaped to a (1, size) row; the rows are then
    concatenated in corpus order.
    """
    rows = []
    for document in corpus:
        first_label = document.labels[0]
        row = np.array(model[first_label]).reshape((1, size))
        rows.append(row)
    return np.concatenate(rows)

In [None]:
# Look up the training-document vectors in each of the two models.
train_vecs_dm = getVecs(model_dm,x_train_label_np,size)
train_vecs_dbow = getVecs(model_dbow,x_train_label_np,size)

In [None]:
# Place the DM and DBOW vectors side by side: each row holds 2 * size features.
train_vecs = np.hstack((train_vecs_dm,train_vecs_dbow))

In [None]:
train_vecs.shape

In [None]:
# Array form of the labelled test documents, so they can be indexed with a permutation.
x_test_label_np = np.array(x_test_label)

In [None]:
# Ten further passes, this time over the test documents, so that the 'TEST_<i>'
# labels also have trained vectors. Only the documents and their tags are passed
# to train(); male_female_test is not used here.
for epoch in range(10):
    perm = np.random.permutation(x_test_label_np.shape[0])
    model_dm.train(x_test_label_np[perm])
    model_dbow.train(x_test_label_np[perm])
    

In [None]:
# Look up the test-document vectors in each of the two models.
test_vecs_dm = getVecs(model_dm,x_test_label_np,size)
test_vecs_dbow = getVecs(model_dbow,x_test_label_np,size)

In [None]:
# Same side-by-side layout as train_vecs, then print the shapes of the pieces and targets.
test_vecs = np.hstack((test_vecs_dm,test_vecs_dbow))
print test_vecs_dm.shape,test_vecs_dbow.shape,male_female_train.shape,male_female_test.shape

# We have all the vectors now; next we train the classifier

In [None]:
from sklearn.linear_model import SGDClassifier


In [None]:
# Two SGD-trained classifiers with log loss, differing only in the penalty (L1 vs L2).
lrl1 = SGDClassifier(loss='log',penalty='l1')
lrl2 = SGDClassifier(loss='log',penalty='l2')
# Row counts of features and targets; fit() needs them to match.
print train_vecs.shape[0],male_female_train.shape[0]


In [None]:
# Fit the L1-penalised classifier on the combined DM+DBOW training vectors.
lrl1.fit(train_vecs,male_female_train)


In [None]:
# Accuracy of the L1 model on the held-out vectors, shown with two decimals.
print('Test Accuracy : %.2f' % lrl1.score(test_vecs, male_female_test))

In [None]:
# Fit the L2-penalised classifier on the same training vectors.
lrl2.fit(train_vecs,male_female_train)

In [None]:
# Accuracy of the L2 model on the held-out vectors, shown with two decimals.
print('Test Accuracy : %.2f' % lrl2.score(test_vecs, male_female_test))

# 5 fold cross validation

In [None]:
from sklearn.cross_validation import KFold
from sklearn import metrics
import pandas as pd

In [None]:
# 5-fold splitter over the training rows, shuffled (no random_state is passed).
sgd_l1_kf = KFold(n=train_vecs.shape[0],n_folds=5,shuffle=True)

In [None]:
sgd_l1_kf

In [None]:
# DataFrame view of the training vectors, used for row selection in the CV loop.
trained_vecs_df = pd.DataFrame(train_vecs)
# NOTE(review): target_np is not read anywhere in the visible part of this file;
# the CV loop indexes male_female_train directly.
target_np = np.array(male_female_train)



In [None]:
trained_vecs_df.head()

In [None]:
trained_vecs_df.shape

In [None]:
sgd_l1_kf

In [None]:
# Cross-validate the L1-penalised SGD classifier: one ROC AUC per fold is
# appended to sgd_l1_metrics.
sgd_l1_metrics = []
for train_index, validate_index in sgd_l1_kf:
    # KFold yields positional row numbers, so select rows by position.
    sample_train = trained_vecs_df.iloc[train_index]
    sample_validate = trained_vecs_df.iloc[validate_index]

    sample_train_target = male_female_train[train_index]
    sample_validate_target = male_female_train[validate_index]

    sgd_l1 = SGDClassifier(loss='log',penalty='l1')
    sgd_l1.fit(sample_train,sample_train_target)

    # predict() returns hard class labels (0.0 / 1.0); no thresholding is needed.
    sgd_l1_predicted = sgd_l1.predict(sample_validate)

    # Continuous decision scores. A ROC curve / AUC needs scores rather than hard
    # labels, otherwise the curve has a single operating point. The name is kept
    # because the ROC plotting cell reads sgd_l1_predicted_copy.
    sgd_l1_predicted_copy = sgd_l1.decision_function(sample_validate)

    # Actual vs. predicted label for this fold (both Series get a fresh 0..k-1 index).
    sgd_l1_analysis = pd.concat([pd.Series(sample_validate_target),pd.Series(sgd_l1_predicted)],axis=1)
    sgd_l1_analysis.columns = ['actual','prediction']

    sgd_l1_auc = metrics.roc_auc_score(sgd_l1_analysis.actual,sgd_l1_predicted_copy)

    sgd_l1_metrics.append(sgd_l1_auc)
    

In [None]:
# Mean of the per-fold AUC values. Despite the name, .mean() on the one-column
# DataFrame yields a pandas Series with a single entry.
sgd_l1_metrics_df = pd.DataFrame(sgd_l1_metrics).mean()

In [None]:
import matplotlib
from matplotlib import pyplot

In [None]:
%matplotlib inline

In [None]:
# ROC curve for the LAST cross-validation fold only: sgd_l1_analysis and
# sgd_l1_predicted_copy hold whatever the final loop iteration left behind.
fpr, tpr, thresholds = metrics.roc_curve(sgd_l1_analysis.actual, sgd_l1_predicted_copy)
pyplot.plot(fpr, tpr)
# diagonal reference line from (0,0) to (1,1)
pyplot.plot([0,1],[0,1])


# RNN using LSTM 
       




In [None]:
from keras.optimizers import SGD
from keras.preprocessing.text import one_hot,text_to_word_sequence,base_filter
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.preprocessing import sequence

In [None]:
# text processing - one_hot turns each post into a list of integer word indexes
# drawn from a vocabulary of size n
n = 30000


def _encode_posts(posts, vocab_size):
    """Return the one_hot encoding of every post that can be encoded.

    Posts for which one_hot raises are skipped (best effort), so the result may
    be shorter than `posts`. Only Exception is caught, which keeps
    KeyboardInterrupt / SystemExit working while a long encoding run is in progress.
    """
    word_filter = base_filter()  # same filter for every post; build it once
    encoded = []
    for post in posts:
        try:
            encoded.append(one_hot(post,vocab_size,split=" ",filters=word_filter,lower=True))
        except Exception:
            continue
    return encoded


male_one_hot = _encode_posts(clean_male_post_list, n)
female_one_hot = _encode_posts(clean_female_post_list, n)

In [None]:
# Target vector: 0 for male, 1 for female.
# Male labels come first, then female labels; the feature array that is split
# together with it must use the same male-then-female order.
concatenate_array_rnn = np.concatenate((np.zeros(len(male_one_hot)),np.ones(len(female_one_hot))))

In [None]:
# Hold out 20% for testing. The features are concatenated male-then-female so that
# they line up with concatenate_array_rnn (zeros for male first, then ones for
# female); the reverse order would pair every post with the wrong label.
x_train_rnn,x_test_rnn,y_train_rnn,y_test_rnn = train_test_split(np.concatenate((male_one_hot,female_one_hot)),concatenate_array_rnn,test_size=0.2)

In [None]:
# Bring every encoded post to a common length of maxlen entries.
maxlen = 100
x_train_rnn = sequence.pad_sequences(x_train_rnn,maxlen=maxlen)
x_test_rnn = sequence.pad_sequences(x_test_rnn,maxlen=maxlen)
# Shapes of the padded inputs next to their targets.
print('x_train_rnn shape:', x_train_rnn.shape,y_train_rnn.shape)
print('x_test_rnn shape:', x_test_rnn.shape,y_test_rnn.shape)

In [None]:
# Embedding -> LSTM -> Dropout -> Dense(1) -> sigmoid classifier.
# max_features has the same value as n, the vocabulary size given to one_hot.
max_features = 30000
dimension = 128
input_dimension = 128
output_dimension = 128
# NOTE: this rebinds `model`, which earlier in the file held the Word2Vec vectors.
model = Sequential()
model.add(Embedding(max_features, dimension))
model.add(LSTM(input_dimension, output_dimension))
model.add(Dropout(0.5))
# NOTE(review): the literal 128 presumably has to equal output_dimension; confirm
# before changing either value.
model.add(Dense(128, 1))
model.add(Activation('sigmoid'))

In [None]:
# NOTE(review): mean squared error on a single sigmoid output; a binary
# cross-entropy loss is the more usual pairing for a 0/1 target; confirm the choice.
model.compile(loss='mean_squared_error',optimizer='sgd')

In [None]:
# Train for 4 epochs in batches of 32. The held-out split is passed as
# validation data, so it is not an untouched test set afterwards.
model.fit(x_train_rnn,y_train_rnn,batch_size=32,nb_epoch=4,validation_data=(x_test_rnn,y_test_rnn),show_accuracy=True)

In [None]:
# Loss and accuracy on the held-out split (the results are stored, not printed).
score,acc = model.evaluate(x_test_rnn,y_test_rnn,batch_size=32,show_accuracy=True)

# Using TFIDF Vectorizer as an input instead of one hot

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Learn ONE vocabulary / IDF weighting from all posts, then transform each class
# with it. Calling fit_transform separately per class would refit the vectorizer,
# so the two matrices would have different columns and could not be stacked into
# one feature matrix.
vectorizer = TfidfVectorizer(decode_error='ignore', norm='l2')
vectorizer.fit(clean_male_post_list + clean_female_post_list)
tfidf_male = vectorizer.transform(clean_male_post_list)
tfidf_female = vectorizer.transform(clean_female_post_list)

In [None]:
# Dense copies of the sparse TF-IDF matrices, one per class
# (the female array must come from tfidf_female, not tfidf_male).
flattened_array_tfidf_male = tfidf_male.toarray()
flattened_array_tfidf_female = tfidf_female.toarray()

In [None]:
# Target vector for the TF-IDF features: zeros for the male rows, then ones for the female rows.
concatenate_array_rnn = np.concatenate((np.zeros(len(flattened_array_tfidf_male)),np.ones(len(flattened_array_tfidf_female))))

In [None]:
# Hold out 20% for testing; rows are stacked male-then-female, matching the label order.
# Stacking requires both arrays to have the same number of columns.
x_train_rnn,x_test_rnn,y_train_rnn,y_test_rnn = train_test_split(np.concatenate((flattened_array_tfidf_male,flattened_array_tfidf_female)),concatenate_array_rnn,test_size=0.2)

In [None]:
# Padding is disabled for the TF-IDF input (the calls below are commented out),
# so maxlen is not used in this section.
maxlen = 100
# x_train_rnn = sequence.pad_sequences(x_train_rnn,maxlen=maxlen)
# x_test_rnn = sequence.pad_sequences(x_test_rnn,maxlen=maxlen)
# print('x_train_rnn shape:', x_train_rnn.shape,y_train_rnn.shape)
# print('x_test_rnn shape:', x_test_rnn.shape,y_test_rnn.shape)

In [None]:
# Same Embedding -> LSTM -> Dropout -> Dense(1) -> sigmoid stack as in the one-hot section.
# NOTE(review): an Embedding layer looks up integer word indexes, but x_train_rnn
# is now a dense TF-IDF float matrix whose width is the vectorizer's vocabulary
# size, not max_features; confirm that this input is what the model should receive.
max_features = 30000
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, 128))
model.add(Dropout(0.5))
model.add(Dense(128, 1))
model.add(Activation('sigmoid'))

In [None]:
# Same loss / optimizer pairing as the one-hot model.
model.compile(loss='mean_squared_error',optimizer='sgd')

In [None]:
# Train for 4 epochs in batches of 32, validating on the held-out TF-IDF split.
model.fit(x_train_rnn,y_train_rnn,batch_size=32,nb_epoch=4,validation_data=(x_test_rnn,y_test_rnn),show_accuracy=True)

In [None]:
# Loss and accuracy on the held-out TF-IDF split (the results are stored, not printed).
score,acc = model.evaluate(x_test_rnn,y_test_rnn,batch_size=32,show_accuracy=True)

# Sentence Generation using RNN(LSTM)

In [None]:
# Training text: the first two cleaned male posts, joined by a single space.
male_post = ' '.join(clean_male_post_list[:2])

# Every distinct character that occurs in that text.
character_set_male = set(male_post)

# Forward (character -> index) and reverse (index -> character) lookups, both
# filled from one enumeration of the character set.
char_indices = {}
indices_char = {}
for position, symbol in enumerate(character_set_male):
    char_indices[symbol] = position
    indices_char[position] = symbol

# Slide a window of maxlen characters over the text, `step` characters at a time;
# each window is paired with the character that immediately follows it.
maxlen = 20
step = 1
window_starts = range(0, len(male_post) - maxlen, step)
sentences = [male_post[start : start + maxlen] for start in window_starts]
next_chars = [male_post[start + maxlen] for start in window_starts]


In [None]:
#Vectorisation of input
x_male = np.zeros((len(male_post),maxlen,len(character_set_male)),dtype=np.bool)
y_male = np.zeros((len(male_post),len(character_set_male)),dtype=np.bool)

print x_male.shape,y_male.shape

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x_male[i, t, char_indices[char]] = 1
    y_male[i, char_indices[next_chars[i]]] = 1

print x_male.shape,y_male.shape

In [None]:

#Building the model to generate text with 2 stacked LSTM layers
# Input width and output width both equal the number of distinct characters.
auto_text_generating_male_model = Sequential()
# first LSTM returns the full sequence so that the second LSTM can consume it
auto_text_generating_male_model.add(LSTM(len(character_set_male),512,return_sequences=True))
auto_text_generating_male_model.add(Dropout(0.2))
auto_text_generating_male_model.add(LSTM(512,512,return_sequences=False))
auto_text_generating_male_model.add(Dropout(0.2))
auto_text_generating_male_model.add(Dense(512,len(character_set_male)))
# NOTE(review): sigmoid gives independent per-character scores that need not sum
# to 1; a softmax output is the usual choice for next-character prediction; confirm.
auto_text_generating_male_model.add(Activation('sigmoid'))

In [None]:
# NOTE(review): mean squared error against one-hot next-character targets; a
# categorical cross-entropy loss is the usual pairing; confirm the choice.
auto_text_generating_male_model.compile(loss='mean_squared_error',optimizer='sgd')

In [None]:
import random,sys

In [None]:
# helper function to sample an index from a probability array
def sample(a, diversity=0.75):
    """Pick an index into the score array `a`.

    a         -- sequence of scores, one per candidate index; values are compared
                 against uniform draws from [0, 1).
    diversity -- with probability (1 - diversity) the highest-scoring index is
                 returned; otherwise an index is drawn by rejection sampling, where
                 a uniformly chosen index i is accepted if a[i] exceeds a fresh
                 uniform draw.

    Returns the chosen index. An empty `a` raises ValueError.
    """
    if random.random() > diversity:
        return np.argmax(a)
    # The rejection loop can only ever accept an index whose score is positive.
    # If there is none it would spin forever, so fall back to the arg-max.
    if not np.any(np.asarray(a) > 0):
        return np.argmax(a)
    while True:
        i = random.randint(0, len(a)-1)
        if a[i] > random.random():
            return i

In [None]:
# train the model, output generated text after each iteration
# Every print call takes exactly one argument, so the output is the same under
# Python 2 (where print(a, b) would show a tuple and print() would show "()")
# and Python 3.
for iteration in range(1,10):
    print('')
    print('-' * 50)
    print('Iteration %d' % iteration)
    auto_text_generating_male_model.fit(x_male, y_male, batch_size=128, nb_epoch=1)

    # One random seed window per training iteration, shared by all diversity values.
    start_index = random.randint(0, len(male_post) - maxlen - 1)

    for diversity in [0.2, 0.4, 0.6, 0.8]:
        print('')
        print('----- diversity: %s' % diversity)

        sentence = male_post[start_index : start_index + maxlen]
        generated = sentence
        print('----- Generating with seed: "' + sentence + '"')

        # Generate up to 400 characters. The counter is deliberately not called
        # `iteration`, so the training-iteration variable above is not overwritten.
        for _ in range(400):
            try:
                # One-hot encode the current window of maxlen characters.
                x = np.zeros((1, maxlen, len(character_set_male)))
                for t, char in enumerate(sentence):
                    x[0, t, char_indices[char]] = 1.

                preds = auto_text_generating_male_model.predict(x, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_char = indices_char[next_index]
            except Exception as exc:
                # Stay best-effort, but say what went wrong and stop this run:
                # the window is unchanged after a failure, so retrying would
                # just repeat the same step.
                sys.stderr.write('text generation stopped: %r\n' % (exc,))
                break

            generated += next_char
            # Slide the window: drop the oldest character, append the new one.
            sentence = sentence[1:] + next_char

        # Show everything that was generated (seed included), not just the last window.
        print(generated)
        print('')