In [1]:
from gensim.models import Word2Vec, KeyedVectors
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import os
import re
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')
import statsmodels.api as sm
%matplotlib inline

In [5]:
pwd

'/Users/skylark'

In [7]:
cd /Users/skylark/Desktop/blogs

/Users/skylark/Desktop/blogs


In [13]:
file_names = []
for file in os.listdir('/Users/skylark/Desktop/blogs'):
    file_names.append(file)

In [15]:
len(file_names)

19320

In [17]:
os.getcwd()

'/Users/skylark/Desktop/blogs'

In [18]:
os.path.join(os.getcwd(), file_names[0])

'/Users/skylark/Desktop/blogs/4162441.male.16.Student.Sagittarius.xml'

In [19]:
contents = []
file_details =[]
for file in file_names:
    try:
        with open(os.path.join(os.getcwd(), file)) as f:
            content = f.readlines()
        contents.append(content)
        file_details.append(file)
    except Exception as e:
        pass    

In [21]:
len(contents)

18713

In [22]:
len(file_details)

18713

In [27]:
ID = []
gender = []
age = []
industry = []
astrological_sign = []

In [28]:
for detail in file_details:
    ID.append(detail.split('.')[0])
    gender.append(detail.split('.')[1])
    age.append(detail.split('.')[2])
    industry.append(detail.split('.')[3])
    astrological_sign.append(detail.split('.')[4])

### Preparing dataset

In [30]:
data = pd.DataFrame({'ID':ID, 'text':contents, 'Gender':gender, 'Age':age, 'Industry':industry, 'Sun_Sign':astrological_sign})

In [41]:
data['text'] = data.text.map(lambda x: x[6:])

In [48]:
data['text'] = data.text.map(lambda x: ' '.join(x))

In [58]:
pat1 = re.compile(r'\w+')
pat2 = re.compile(r'\D')

In [56]:
data['new_text'] = data.text.map(lambda x: ' '.join(pat1.findall(x.lower())))

In [66]:
data['text'] = data.new_text.map(lambda x: ''.join(pat2.findall(x.lower())))

In [69]:
data.drop('new_text', axis=1, inplace=True)

In [70]:
data

Unnamed: 0,ID,text,Gender,Age,Industry,Sun_Sign
0,4162441,destiny you might not say anything but i can h...,male,16,Student,Sagittarius
1,3489929,it s been a long time coming but i have made s...,female,25,Student,Cancer
2,3954575,so here i sit at work only three more hours le...,female,23,BusinessServices,Gemini
3,3364931,today was normal nothing much to talk about ex...,male,16,Student,Virgo
4,3162067,i feel it in the water the crystal vibrations ...,female,24,Education,Cancer
...,...,...,...,...,...,...
18708,3591972,howdy yay my first journal entry ever woo but ...,female,16,Student,Capricorn
18709,228004,to prevent boredom setting in from seeing the ...,male,25,indUnk,Virgo
18710,3902987,wahaha three days nv cum in liao gt many prese...,male,13,indUnk,Pisces
18711,3724623,haiz today heard sth stupid den haiz stupid si...,male,15,Student,Leo


In [88]:
data.Industry.unique()

array(['Student', 'BusinessServices', 'Education', 'indUnk', 'Technology',
       'Arts', 'Non-Profit', 'InvestmentBanking', 'Engineering',
       'Science', 'Publishing', 'Construction', 'HumanResources',
       'Communications-Media', 'Internet', 'Banking', 'Biotech',
       'Architecture', 'Advertising', 'Military', 'Chemicals', 'Fashion',
       'Law', 'Tourism', 'Museums-Libraries', 'Accounting',
       'Transportation', 'Agriculture', 'Government', 'Marketing',
       'Manufacturing', 'Religion', 'Sports-Recreation',
       'Telecommunications', 'Consulting', 'RealEstate', 'Automotive',
       'Maritime', 'LawEnforcement-Security', 'Environment'], dtype=object)

In [75]:
data.Sun_Sign.unique()

array(['Sagittarius', 'Cancer', 'Gemini', 'Virgo', 'Leo', 'Taurus',
       'Scorpio', 'Aries', 'Pisces', 'Capricorn', 'Libra', 'Aquarius'],
      dtype=object)

In [78]:
data.Age.describe()

count    18713.000000
mean        22.761823
std          7.992861
min         13.000000
25%         16.000000
50%         23.000000
75%         26.000000
max         48.000000
Name: Age, dtype: float64

In [82]:
np.percentile(data.Age, [25,50,75])

array([16., 23., 26.])

In [140]:
data['age_grp'] = data.Age.map(lambda x: 0 if x<16 else 1 if x>=16 and x<23 else 2 if x>=23 and x<26 else 3)

In [94]:
industry_dict = {}
rev_industry_dict = {}
for i,j in enumerate(set(data.Industry.unique())):
    industry_dict[j] = i
    rev_industry_dict[i] = j

In [91]:
sun_sign = {}
rev_sun_sign = {}
for i,j in enumerate(set(data.Sun_Sign.unique())):
    sun_sign[j] = i
    rev_sun_sign[i] = j

In [141]:
data

Unnamed: 0,ID,text,Age,Industry,Sun_Sign,age_grp,Gender_male
0,4162441,destiny you might not say anything but i can h...,16,12,9,1,1
1,3489929,it s been a long time coming but i have made s...,25,12,4,2,0
2,3954575,so here i sit at work only three more hours le...,23,21,7,2,0
3,3364931,today was normal nothing much to talk about ex...,16,12,11,1,1
4,3162067,i feel it in the water the crystal vibrations ...,24,0,4,2,0
...,...,...,...,...,...,...,...
18708,3591972,howdy yay my first journal entry ever woo but ...,16,12,8,1,0
18709,228004,to prevent boredom setting in from seeing the ...,25,23,11,2,1
18710,3902987,wahaha three days nv cum in liao gt many prese...,13,23,0,0,1
18711,3724623,haiz today heard sth stupid den haiz stupid si...,15,12,2,0,1


### Preprocessing

In [111]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors

In [151]:
toknzr = Tokenizer(num_words=10000)

In [152]:
toknzr.fit_on_texts(data.text)

In [153]:
seq = toknzr.texts_to_sequences(data.text)

In [154]:
seq = pad_sequences(seq, maxlen=500)

In [155]:
word_idx = toknzr.word_index

In [None]:
data['Industry'] = data.Industry.map(lambda x: industry_dict[x])
data['Sun_Sign'] = data.Sun_Sign.map(lambda x: sun_sign[x])

In [186]:
data

Unnamed: 0,ID,text,Age,Industry,Sun_Sign,age_grp,Gender_male
0,4162441,destiny you might not say anything but i can h...,16,12,9,1,1
1,3489929,it s been a long time coming but i have made s...,25,12,4,2,0
2,3954575,so here i sit at work only three more hours le...,23,21,7,2,0
3,3364931,today was normal nothing much to talk about ex...,16,12,11,1,1
4,3162067,i feel it in the water the crystal vibrations ...,24,0,4,2,0
...,...,...,...,...,...,...,...
18708,3591972,howdy yay my first journal entry ever woo but ...,16,12,8,1,0
18709,228004,to prevent boredom setting in from seeing the ...,25,23,11,2,1
18710,3902987,wahaha three days nv cum in liao gt many prese...,13,23,0,0,1
18711,3724623,haiz today heard sth stupid den haiz stupid si...,15,12,2,0,1


#### Word2Vec

In [158]:
word2vec = KeyedVectors.load_word2vec_format('/Users/skylark/Desktop/Misc/pretrained embeddings/GoogleNews-vectors-negative300.bin', binary=True, limit=10000)

In [159]:
embedding_matrix_word2vec = np.zeros((10000, 300))
for word, i in word_idx.items():
    if i<10000:
        try:
            vec = word2vec[word]
            embedding_matrix_word2vec[i] = vec
        except Exception as e:
            pass

#### GloVe

In [160]:
with open('/Users/skylark/Desktop/Misc/pretrained embeddings/glove.6B/glove.6B.300d.txt') as f:
    glove = f.readlines()

In [161]:
glove_weight = {}
for wt in glove:
    glove_weight[wt.split()[0]] = np.array(wt.split()[1:])   

In [162]:
embedding_matrix_glove = np.zeros((10000, 300))
for word, i in word_idx.items():
    if i<10000:
        try:
            vec = glove_weight[word]
            embedding_matrix_glove[i] = vec
        except Exception as e:
            pass

In [163]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(seq, data[['Industry','Sun_Sign','age_grp','Gender_male']], test_size=0.3, random_state=42)

In [171]:
len(data.Industry.unique())

40

### Modelling

In [164]:
from keras import models, layers, regularizers, optimizers

#### Word2vec

##### Gender

In [165]:
model = models.Sequential()

In [166]:
model.add(layers.Embedding(10000, 300, input_length=500))
model.add(layers.Bidirectional(layers.LSTM(64, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(128, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(256)))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 300)          3000000   
_________________________________________________________________
bidirectional_7 (Bidirection (None, 500, 128)          186880    
_________________________________________________________________
bidirectional_8 (Bidirection (None, 500, 256)          263168    
_________________________________________________________________
bidirectional_9 (Bidirection (None, 512)               1050624   
_________________________________________________________________
dense_7 (Dense)              (None, 128)               65664     
_________________________________________________________________
dense_8 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                

In [167]:
model.compile(optimizer=optimizers.RMSprop(2e-4), loss='binary_crossentropy', metrics=['acc'])
history = model.fit(train_x, train_y.Gender_male, epochs=1, batch_size=32, validation_split=0.2)

Train on 10479 samples, validate on 2620 samples
Epoch 1/1


In [168]:
model.evaluate(test_x, test_y.Gender_male)



[0.6415674543100445, 0.6635197997093201]

##### Industry

In [174]:
model = models.Sequential()

In [175]:
model.add(layers.Embedding(10000, 300, input_length=500))
model.add(layers.Bidirectional(layers.LSTM(64, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(128, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(256)))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(40, activation='softmax'))

model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 300)          3000000   
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 500, 128)          186880    
_________________________________________________________________
bidirectional_14 (Bidirectio (None, 500, 256)          263168    
_________________________________________________________________
bidirectional_15 (Bidirectio (None, 512)               1050624   
_________________________________________________________________
dense_13 (Dense)             (None, 128)               65664     
_________________________________________________________________
dense_14 (Dense)             (None, 256)               33024     
_________________________________________________________________
dense_15 (Dense)             (None, 40)               

In [176]:
model.compile(optimizer=optimizers.RMSprop(2e-4), loss='sparse_categorical_crossentropy', metrics=['acc'])
history = model.fit(train_x, train_y.Industry, epochs=1, batch_size=32, validation_split=0.2)

Train on 10479 samples, validate on 2620 samples
Epoch 1/1


In [178]:
model.evaluate(test_x, test_y.Industry)



[2.297132601931633, 0.34075525403022766]

##### Sun Sign

In [181]:
model = models.Sequential()

In [182]:
model.add(layers.Embedding(10000, 300, input_length=500))
model.add(layers.Bidirectional(layers.LSTM(64, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(128, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(256)))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(12, activation='softmax'))

model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 500, 300)          3000000   
_________________________________________________________________
bidirectional_16 (Bidirectio (None, 500, 128)          186880    
_________________________________________________________________
bidirectional_17 (Bidirectio (None, 500, 256)          263168    
_________________________________________________________________
bidirectional_18 (Bidirectio (None, 512)               1050624   
_________________________________________________________________
dense_16 (Dense)             (None, 128)               65664     
_________________________________________________________________
dense_17 (Dense)             (None, 256)               33024     
_________________________________________________________________
dense_18 (Dense)             (None, 12)               

In [183]:
model.compile(optimizer=optimizers.RMSprop(2e-4), loss='sparse_categorical_crossentropy', metrics=['acc'])
history = model.fit(train_x, train_y.Sun_Sign, epochs=1, batch_size=32, validation_split=0.2)

Train on 10479 samples, validate on 2620 samples
Epoch 1/1


##### Age Group

In [184]:
model = models.Sequential()

In [187]:
model.add(layers.Embedding(10000, 300, input_length=500))
model.add(layers.Bidirectional(layers.LSTM(64, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(128, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(256)))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(4, activation='softmax'))

model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 500, 300)          3000000   
_________________________________________________________________
bidirectional_19 (Bidirectio (None, 500, 128)          186880    
_________________________________________________________________
bidirectional_20 (Bidirectio (None, 500, 256)          263168    
_________________________________________________________________
bidirectional_21 (Bidirectio (None, 512)               1050624   
_________________________________________________________________
dense_19 (Dense)             (None, 128)               65664     
_________________________________________________________________
dense_20 (Dense)             (None, 256)               33024     
_________________________________________________________________
dense_21 (Dense)             (None, 4)                

In [188]:
model.compile(optimizer=optimizers.RMSprop(2e-4), loss='sparse_categorical_crossentropy', metrics=['acc'])
history = model.fit(train_x, train_y.age_grp, epochs=1, batch_size=32, validation_split=0.2)

Train on 10479 samples, validate on 2620 samples
Epoch 1/1


In [189]:
model.evaluate(test_x, test_y.age_grp)



[1.109371331479569, 0.44246527552604675]

### Functional API

In [221]:
from keras.models import Model
from keras import layers, Input

In [222]:
ip = Input(shape=(None,), dtype='int32',name='Input')

embed = layers.Embedding(10000, 300, input_length=500, name='Word_embeddings')(ip)
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True, name = 'Bi_LSTM_1'))(embed)
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, name = 'Bi_LSTM_2'))(x)
x = layers.Bidirectional(layers.LSTM(256, name = 'Bi_LSTM_3'))(x)

x = layers.Dense(128, activation='relu', name='Dense_1')(x)
x = layers.Dense(256, activation='relu', name='Dense_2')(x)

gender = layers.Dense(1, activation='sigmoid', name='Gender')(x)
industry = layers.Dense(40, activation='softmax', name='Industry')(x)
sun_sign = layers.Dense(12, activation='softmax', name='Sun_Sign')(x)
age_grp = layers.Dense(4, activation='softmax', name='Age_Group')(x)

model = Model(ip, [gender, industry, sun_sign, age_grp])
model.summary()

Model: "model_10"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
Word_embeddings (Embedding)     (None, 500, 300)     3000000     Input[0][0]                      
__________________________________________________________________________________________________
bidirectional_53 (Bidirectional (None, 500, 128)     186880      Word_embeddings[0][0]            
__________________________________________________________________________________________________
bidirectional_54 (Bidirectional (None, 500, 256)     263168      bidirectional_53[0][0]           
___________________________________________________________________________________________

In [223]:
model.layers[1].set_weights([embedding_matrix_glove])
model.layers[1].trainable = False

In [224]:
model.compile(optimizer=optimizers.RMSprop(2e-4), loss=['binary_crossentropy', 'sparse_categorical_crossentropy', 'sparse_categorical_crossentropy', 'sparse_categorical_crossentropy'], metrics=['acc'])

In [225]:
model.fit(train_x, [train_y.Gender_male, train_y.Industry, train_y.Sun_Sign, train_y.age_grp], batch_size=32, epochs=1, validation_split=0.2)

Train on 10479 samples, validate on 2620 samples
Epoch 1/1


<keras.callbacks.callbacks.History at 0x26303c350>

In [226]:
model.evaluate(test_x, [test_y.Gender_male, test_y.Industry, test_y.Sun_Sign, test_y.age_grp])



[6.714425373722915,
 0.6676380634307861,
 2.292409658432007,
 2.490910768508911,
 1.2615333795547485,
 0.5944068431854248,
 0.35411471128463745,
 0.08478803187608719,
 0.3596366345882416]