In [11]:
import pandas as pd

from sklearn.cross_validation import train_test_split

import numpy as np

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras import metrics

Using TensorFlow backend.


In [2]:
label_data = pd.read_csv("./data/label_data.csv")

In [3]:
x = label_data['tweet_text']
y = label_data['label_n']

In [4]:
#Pre-processing
import re
x = x.apply(lambda z: re.sub(u'http\S+', u'', z)) 
x = x.apply(lambda z: re.sub(u'(\s)@\w+', u'', z))
x = x.apply(lambda z: re.sub(u'#', u'', z))
x = x.apply(lambda z: re.sub(u'RT', u'', z))

In [5]:
x[150:155]

150    Nepal Quake: India officials confirmed 11 deat...
151    NepalQuakeRelief Volunteer request IndiaWithNe...
152      Nepal earthquake: more than a hundred people...
153    happy : NepalQuake | Deep Kumar Upadhyay, Nepa...
154                                             Awful.  
Name: tweet_text, dtype: object

In [6]:
y.head()

0    3
1    2
2    3
3    3
4    3
Name: label_n, dtype: int64

In [8]:
#Random Test-Train Split

SEED = None
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=SEED)

In [9]:
#TFIDF Vector

from sklearn.feature_extraction.text import TfidfVectorizer

tvec1 = TfidfVectorizer(max_features=1000,ngram_range=(1, 3),analyzer='word',norm='l2',stop_words='english')
tvec1.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [10]:
x_train_tfidf = tvec1.transform(x_train)
x_test_tfidf =  tvec1.transform(x_test).toarray()

In [12]:
y_train_one_hot = keras.utils.to_categorical(y_train, num_classes=4)
y_test_one_hot =  keras.utils.to_categorical(y_test, num_classes=4)

In [13]:
model = Sequential()
model.add(Dense(256, activation='relu', input_dim=1000))
model.add(Dropout(0.05))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.10))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(4, activation='softmax'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [14]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               256256    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 260       
Total para

In [15]:
model.fit(x_train_tfidf, y_train_one_hot, epochs=5, batch_size=256, validation_data = (x_test_tfidf, y_test_one_hot))

Train on 16675 samples, validate on 1853 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x15d6a387668>

In [16]:
x_tfidf = tvec1.transform(x)
y_one_hot =  keras.utils.to_categorical(y, num_classes=4)

In [17]:
model.fit(x_tfidf, y_one_hot, epochs=5, batch_size=256, validation_data = (x_test_tfidf, y_test_one_hot))

Train on 18528 samples, validate on 1853 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x15d6a928dd8>

In [18]:
model.save('multi_classification.h5')  # creates a HDF5 file 'my_model.h5'