# Second & Third Part

### 1. Data preprocessing
In this part, the main goal is to separate the training and testing data, filter out useless information, and combine the tweets with their emotions as labels.

In [1]:
import json
import os

import numpy as np
import pandas as pd

In [2]:
# data_identification.csv: one row per tweet id with its split ("train" / "test").
# header=None means the file's own title line is loaded as data (row 0).
train_test_set = pd.read_csv(
    "data_identification.csv",
    names=["id", "identification"],
    header=None,
)

In [3]:
train_test_set.head()

Unnamed: 0,id,identification
0,tweet_id,identification
1,0x28cc61,test
2,0x29e452,train
3,0x2b3819,train
4,0x2db41f,test


In [4]:
# Row 0 holds the CSV's title line, not a real tweet: discard it by label.
train_test_set = train_test_set.drop(index=0)

In [5]:
train_test_set.head()

Unnamed: 0,id,identification
1,0x28cc61,test
2,0x29e452,train
3,0x2b3819,train
4,0x2db41f,test
5,0x2a2acc,train


In [6]:
# Single-column frame with the ids of all rows flagged as training data.
is_train = train_test_set['identification'] == 'train'
train_ids = train_test_set.loc[is_train, ['id']]

In [7]:
# Single-column frame with the ids of all rows flagged as test data.
is_test = train_test_set['identification'] == 'test'
test_ids = train_test_set.loc[is_test, ['id']]

In [8]:
train_ids[:10]

Unnamed: 0,id
2,0x29e452
3,0x2b3819
5,0x2a2acc
6,0x2a8830
7,0x20b21d
8,0x2452cf
9,0x2d729d
10,0x2ab56d
11,0x1f3657
12,0x1fcc53


In [9]:
len(train_ids)

1455563

In [10]:
len(test_ids)

411972

In [11]:
# emotion.csv: the emotion label per tweet id.
# header=None means the file's own title line is loaded as data (row 0).
emotion_set = pd.read_csv(
    "emotion.csv",
    names=["id", "emotion"],
    header=None,
)

In [12]:
# Row 0 holds the CSV's title line, not a real label: discard it by label.
emotion_set = emotion_set.drop(index=0)

In [13]:
emotion_set[:10]

Unnamed: 0,id,emotion
1,0x3140b1,sadness
2,0x368b73,disgust
3,0x296183,anticipation
4,0x2bd6e1,joy
5,0x2ee1dd,anticipation
6,0x34cd80,joy
7,0x33f099,sadness
8,0x2ae7b7,sadness
9,0x2408d4,trust
10,0x2b193b,sadness


In [14]:
len(emotion_set)

1455563

In [15]:
## save to pickle file
# NOTE(review): despite its name, "train_df.pkl" contains only the id/emotion
# label table; no later cell visible in this notebook reads it back.
emotion_set.to_pickle("train_df.pkl") 

In [16]:
# tweets_DM.json: one JSON record per line (lines=True); the tweet payload is
# nested inside the `_source` column.
tweets_df = pd.read_json(path_or_buf="tweets_DM.json", lines=True)

In [17]:
tweets_df.head()

Unnamed: 0,_score,_index,_source,_crawldate,_type
0,391,hashtag_tweets,"{'tweet': {'hashtags': ['Snapchat'], 'tweet_id...",2015-05-23 11:42:47,tweets
1,433,hashtag_tweets,"{'tweet': {'hashtags': ['freepress', 'TrumpLeg...",2016-01-28 04:52:09,tweets
2,232,hashtag_tweets,"{'tweet': {'hashtags': ['bibleverse'], 'tweet_...",2017-12-25 04:39:20,tweets
3,376,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x1cd5...",2016-01-24 23:53:05,tweets
4,989,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2de2...",2016-01-08 17:18:59,tweets


In [18]:
# Series of nested dicts, one per tweet (each has a 'tweet' entry).
tweets_source_df = tweets_df.loc[:, '_source']

In [19]:
# Pull the raw tweet text out of each nested record into its own column.
tweets_df['text'] = tweets_source_df.map(lambda source: source['tweet']['text'])

In [20]:
# Pull the tweet id out of each nested record into its own column.
tweets_df['id'] = tweets_df['_source'].map(lambda source: source['tweet']['tweet_id'])

In [21]:
# Keep only the two columns used from here on: the text and its tweet id.
filter_tweets_df = tweets_df.filter(items=['text', 'id'], axis='columns')

In [22]:
filter_tweets_df.head()

Unnamed: 0,text,id
0,"People who post ""add me on #Snapchat"" must be ...",0x376b20
1,"@brianklaas As we see, Trump is dangerous to #...",0x2d5350
2,"Confident of your obedience, I write to you, k...",0x28b412
3,Now ISSA is stalking Tasha 😂😂😂 <LH>,0x1cd5b0
4,"""Trust is not the same as faith. A friend is s...",0x2de201


In [23]:
# Tweets whose id is listed as "train" in data_identification.csv.
in_train_split = filter_tweets_df['id'].isin(train_ids['id'])
train_tweets_df = filter_tweets_df[in_train_split]

In [24]:
# Select the test tweets by their own id list (test_ids) instead of taking
# "everything that is not train": a tweet whose id is missing from
# data_identification.csv can then no longer slip into the submission set.
test_tweets_df = filter_tweets_df[filter_tweets_df['id'].isin(test_ids['id'])]

In [25]:
len(train_tweets_df)

1455563

In [26]:
len(test_tweets_df)

411972

In [27]:
train_tweets_df.head()

Unnamed: 0,text,id
0,"People who post ""add me on #Snapchat"" must be ...",0x376b20
1,"@brianklaas As we see, Trump is dangerous to #...",0x2d5350
3,Now ISSA is stalking Tasha 😂😂😂 <LH>,0x1cd5b0
5,@RISKshow @TheKevinAllison Thx for the BEST TI...,0x1d755c
6,Still waiting on those supplies Liscus. <LH>,0x2c91a8


In [28]:
test_tweets_df.head()

Unnamed: 0,text,id
2,"Confident of your obedience, I write to you, k...",0x28b412
4,"""Trust is not the same as faith. A friend is s...",0x2de201
9,When do you have enough ? When are you satisfi...,0x218443
30,"God woke you up, now chase the day #GodsPlan #...",0x2939d5
33,"In these tough times, who do YOU turn to as yo...",0x26289a


In [29]:
#add emotion labels to train dataframe
# Join explicitly on the tweet id rather than on "whatever columns happen to
# be shared". Restricting the left side to text/id keeps the cell re-runnable:
# a second run does not produce emotion_x / emotion_y columns.
train_tweets_df = train_tweets_df[['text', 'id']].merge(emotion_set, on='id', how='inner')

In [30]:
## save to pickle file
# train_tweets_df.pkl: training tweets with their emotion label (text, id, emotion).
# test_tweets_df.pkl: the tweets without a label (text, id).
# Both files are loaded again at the start of the deep-learning part.
train_tweets_df.to_pickle("train_tweets_df.pkl") 
test_tweets_df.to_pickle("test_tweets_df.pkl")

### 2. Deep Learning

The network and training procedure below follow the deep learning example in lab2, but I adjusted the number of features, the amount of training data, and the number of epochs in order to get better results.

#### For 300 features:
#### (1) Transform sentences into 300 features bag-of-words.

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
# Labeled training tweets written by the preprocessing part (text, id, emotion).
train_df = pd.read_pickle("train_tweets_df.pkl")

In [33]:
# Unlabeled tweets written by the preprocessing part (text, id); used only for the submission.
test_df = pd.read_pickle("test_tweets_df.pkl")

In [34]:
import nltk
# Bag-of-words vectorizer capped at 300 features (max_features=300) that splits
# text with NLTK's word_tokenize instead of scikit-learn's default tokenization.
# NOTE(review): word_tokenize presumably needs the NLTK tokenizer data to be
# downloaded beforehand — confirm for a fresh environment.
BOW_vectorizer = CountVectorizer(max_features=300, tokenizer=nltk.word_tokenize)

In [35]:
num_of_train = 500000
# Rows from this position onward are held out for evaluation. The rows between
# num_of_train and holdout_start are not used in this experiment.
holdout_start = 1400000

train_text = train_df['text'][:num_of_train]
holdout_text = train_df['text'][holdout_start:]

# 1. Learn the vocabulary and 2. build the document-term matrix in one pass,
#    so the NLTK tokenizer runs over the training text once instead of twice.
X_train = BOW_vectorizer.fit_transform(train_text)
y_train = train_df['emotion'][:num_of_train]

# The hold-out rows are encoded with the vocabulary learned above. They come
# from the labeled training data, so "test" here means a validation hold-out,
# not the unlabeled competition set.
X_test = BOW_vectorizer.transform(holdout_text)
y_test = train_df['emotion'][holdout_start:]



#### (2) Build neural network
I translate labels into one-hot encoding.

In [37]:
import keras

## checking dimensions is a good habit
for split_name, split_data in (('X_train', X_train), ('y_train', y_train),
                               ('X_test', X_test), ('y_test', y_test)):
    print(split_name + '.shape: ', split_data.shape)

X_train.shape:  (500000, 300)
y_train.shape:  (500000,)
X_test.shape:  (55563, 300)
y_test.shape:  (55563,)


In [38]:
## deal with label (string -> one-hot)

from sklearn.preprocessing import LabelEncoder

# Learn the emotion-name -> integer mapping from the training labels
# (fit() returns the encoder itself, so it can be chained).
label_encoder = LabelEncoder().fit(y_train)
print('check label: ', label_encoder.classes_)

# Show the labels while they are still plain strings.
print('\n## Before convert')
print('y_train[0:7]:\n', y_train[0:7])
print('\ny_train.shape: ', y_train.shape)
print('y_test.shape: ', y_test.shape)

def label_encode(le, labels):
    """Convert string labels into one-hot rows.

    Args:
        le: fitted encoder exposing ``transform`` and ``classes_``
            (a scikit-learn LabelEncoder in this notebook).
        labels: sequence of label strings known to ``le``.

    Returns:
        2-D array with one row per label and one column per class in
        ``le.classes_``.
    """
    enc = le.transform(labels)
    # Pin the width to the encoder's class count. Otherwise a split that does
    # not contain the last class would get fewer columns than the network outputs.
    return keras.utils.to_categorical(enc, num_classes=len(le.classes_))


def label_decode(le, one_hot_label):
    """Convert one-hot rows (or per-class scores) back into label strings.

    Args:
        le: fitted encoder exposing ``inverse_transform``.
        one_hot_label: 2-D array, one row per sample and one column per class.
            The column with the largest value in each row wins.

    Returns:
        Whatever ``le.inverse_transform`` yields for the winning column indices
        (an array of label strings for a LabelEncoder).
    """
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

# Replace both label vectors by their one-hot representation.
y_train, y_test = (label_encode(label_encoder, labels)
                   for labels in (y_train, y_test))

# Show the same slice again, now as one-hot rows.
print('\n\n## After convert')
print('y_train[0:7]:\n', y_train[0:7])
print('\ny_train.shape: ', y_train.shape)
print('y_test.shape: ', y_test.shape)

check label:  ['anger' 'anticipation' 'disgust' 'fear' 'joy' 'sadness' 'surprise'
 'trust']

## Before convert
y_train[0:7]:
 0    anticipation
1         sadness
2            fear
3             joy
4    anticipation
5             joy
6         sadness
Name: emotion, dtype: object

y_train.shape:  (500000,)
y_test.shape:  (55563,)


## After convert
y_train[0:7]:
 [[0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]]

y_train.shape:  (500000, 8)
y_test.shape:  (55563, 8)


In [39]:
# I/O check: the input width is the number of bag-of-words columns, the output
# width is the number of emotion classes known to the label encoder.
input_shape = X_train.shape[1]
output_shape = len(label_encoder.classes_)
print('input_shape: ', input_shape)
print('output_shape: ', output_shape)

input_shape:  300
output_shape:  8


This network includes 2 ReLU hidden layers and a softmax function that transforms the output of the output layer. The loss function and optimizer are cross entropy and Adam, respectively.

In [40]:
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers import ReLU, Softmax

# input layer: one unit per bag-of-words feature (input_shape columns)
model_input = Input(shape=(input_shape, ))
X = model_input

# 1st hidden layer: 64 units followed by ReLU
H1 = ReLU()(Dense(units=64)(X))

# 2nd hidden layer: 64 units followed by ReLU
H2 = ReLU()(Dense(units=64)(H1))

# output layer: one unit per emotion class (output_shape), softmax-normalized
H3 = Softmax()(Dense(units=output_shape)(H2))
model_output = H3

# create model
model = Model(inputs=[model_input], outputs=[model_output])

# loss function & optimizer
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# show model construction
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 300)]             0         
_________________________________________________________________
dense (Dense)                (None, 64)                19264     
_________________________________________________________________
re_lu (ReLU)                 (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
re_lu_1 (ReLU)               (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 520       
_________________________________________________________________
softmax (Softmax)            (None, 8)                

#### (3) Train the network

In [41]:
from keras.callbacks import CSVLogger

# The training metrics are logged to logs/training_log.csv. Create the folder
# first so that a fresh checkout (no logs/ directory yet) does not fail when
# the logger opens its file.
os.makedirs('logs', exist_ok=True)
csv_logger = CSVLogger('logs/training_log.csv')

# training setting
epochs = 25
batch_size = 32

# training!
# X_test / y_test are the labeled hold-out rows and serve as validation data.
history = model.fit(X_train, y_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    callbacks=[csv_logger],
                    validation_data=(X_test, y_test))
print('training finish')
model.save('BOW300_500k.h5')

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
training finish


In [48]:
# Softmax scores of the trained model for the labeled hold-out rows (one row per sample).
y_test_pred = model.predict(X_test)

In [49]:
from sklearn.metrics import accuracy_score

# Map both the one-hot ground truth and the predicted scores back to emotion
# names, then report the share of exact matches rounded to two decimals.
true_labels = label_decode(label_encoder, y_test)
pred_labels = label_decode(label_encoder, y_test_pred)
print('testing accuracy: {}'.format(round(accuracy_score(true_labels, pred_labels), 2)))

testing accuracy: 0.47


  if diff:
  if diff:


#### (4)  Predict and write submission

In [None]:
# Encode the unlabeled tweets with the vocabulary already fitted on the training text.
X_unknown = BOW_vectorizer.transform(test_df['text'])

In [44]:
# Softmax scores for every unlabeled tweet (one row per tweet, one column per emotion).
y_unknown_pred = model.predict(X_unknown)

In [45]:
# Turn the score rows into emotion names.
# NOTE(review): the name is reused for a different kind of value. Re-running
# only this cell would pass the decoded labels back into label_decode, so the
# predict cell must be re-run first.
y_unknown_pred = label_decode(label_encoder, y_unknown_pred)

  if diff:


In [46]:
# Column-wise submission data: tweet ids next to their predicted emotion.
predict_test = dict(
    id=list(test_df['id']),
    emotion=list(y_unknown_pred),
)

In [47]:
# Build the id/emotion table and write it without the index column.
predict_test = pd.DataFrame(data=predict_test)
predict_test.to_csv(path_or_buf='BOW300_500k.csv', index=False)

#### Using 500 features and more training data

In [46]:
import nltk
# Fresh bag-of-words vectorizer for the second experiment, now capped at 500
# features (max_features=500) and again tokenized with NLTK's word_tokenize.
# Rebinding BOW_vectorizer discards the vocabulary fitted for the 300-feature run.
BOW_vectorizer = CountVectorizer(max_features=500, tokenizer=nltk.word_tokenize)

In [47]:
# Use the first three quarters of the labeled tweets for training and the
# remaining quarter as the labeled hold-out (integer division, no float round trip).
num_of_train = len(train_df) * 3 // 4

train_text = train_df['text'][:num_of_train]
holdout_text = train_df['text'][num_of_train:]

# 1. Learn the vocabulary and 2. build the document-term matrix in one pass,
#    so the NLTK tokenizer runs over the training text once instead of twice.
X_train = BOW_vectorizer.fit_transform(train_text)
y_train = train_df['emotion'][:num_of_train]

# The hold-out rows are encoded with the vocabulary learned above.
X_test = BOW_vectorizer.transform(holdout_text)
y_test = train_df['emotion'][num_of_train:]



In [48]:
import keras

## checking dimensions is a good habit
for split_name, split_data in (('X_train', X_train), ('y_train', y_train),
                               ('X_test', X_test), ('y_test', y_test)):
    print(split_name + '.shape: ', split_data.shape)

X_train.shape:  (1091672, 500)
y_train.shape:  (1091672,)
X_test.shape:  (363891, 500)
y_test.shape:  (363891,)


In [49]:
## deal with label (string -> one-hot)

from sklearn.preprocessing import LabelEncoder

# Learn the emotion-name -> integer mapping from the new training labels
# (fit() returns the encoder itself, so it can be chained).
label_encoder = LabelEncoder().fit(y_train)
def label_encode(le, labels):
    """Convert string labels into one-hot rows.

    Args:
        le: fitted encoder exposing ``transform`` and ``classes_``
            (a scikit-learn LabelEncoder in this notebook).
        labels: sequence of label strings known to ``le``.

    Returns:
        2-D array with one row per label and one column per class in
        ``le.classes_``.
    """
    enc = le.transform(labels)
    # Pin the width to the encoder's class count. Otherwise a split that does
    # not contain the last class would get fewer columns than the network outputs.
    return keras.utils.to_categorical(enc, num_classes=len(le.classes_))


def label_decode(le, one_hot_label):
    """Convert one-hot rows (or per-class scores) back into label strings.

    Args:
        le: fitted encoder exposing ``inverse_transform``.
        one_hot_label: 2-D array, one row per sample and one column per class.
            The column with the largest value in each row wins.

    Returns:
        Whatever ``le.inverse_transform`` yields for the winning column indices
        (an array of label strings for a LabelEncoder).
    """
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

# Replace both label vectors by their one-hot representation.
y_train, y_test = (label_encode(label_encoder, labels)
                   for labels in (y_train, y_test))

In [50]:
# I/O check: the input width is the number of bag-of-words columns, the output
# width is the number of emotion classes known to the label encoder.
input_shape = X_train.shape[1]
output_shape = len(label_encoder.classes_)
print('input_shape: ', input_shape)
print('output_shape: ', output_shape)

input_shape:  500
output_shape:  8


In [51]:
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers import ReLU, Softmax

# input layer: one unit per bag-of-words feature (input_shape columns)
model_input = Input(shape=(input_shape, ))
X = model_input

# 1st hidden layer: 64 units followed by ReLU
H1 = ReLU()(Dense(units=64)(X))

# 2nd hidden layer: 64 units followed by ReLU
H2 = ReLU()(Dense(units=64)(H1))

# output layer: one unit per emotion class (output_shape), softmax-normalized
H3 = Softmax()(Dense(units=output_shape)(H2))
model_output = H3

# create model
model = Model(inputs=[model_input], outputs=[model_output])

# loss function & optimizer
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [52]:
from keras.callbacks import CSVLogger

# The training metrics are logged to logs/training_log.csv. Create the folder
# first so that a fresh checkout (no logs/ directory yet) does not fail when
# the logger opens its file.
# NOTE(review): this is the same path the 300-feature run logs to, so that
# earlier log is presumably replaced — use a different name if both are needed.
os.makedirs('logs', exist_ok=True)
csv_logger = CSVLogger('logs/training_log.csv')

# training setting
epochs = 9
batch_size = 32

# training!
# X_test / y_test are the labeled hold-out rows and serve as validation data.
history = model.fit(X_train, y_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    callbacks=[csv_logger],
                    validation_data=(X_test, y_test))
print('training finish')
# NOTE(review): the file name says "1350k", but the shape check above reports
# 1,091,672 training rows — confirm which is intended.
model.save('BOW500_1350k_e9.h5')

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
training finish


In [53]:
# Softmax scores of the 500-feature model for the labeled hold-out rows (one row per sample).
y_test_pred = model.predict(X_test)

In [54]:
from sklearn.metrics import accuracy_score
import numpy as np
# Map both the one-hot ground truth and the predicted scores back to emotion
# names, then report the share of exact matches rounded to two decimals.
true_labels = label_decode(label_encoder, y_test)
pred_labels = label_decode(label_encoder, y_test_pred)
print('testing accuracy: {}'.format(round(accuracy_score(true_labels, pred_labels), 2)))

testing accuracy: 0.5


The result is better. Upload this one to the competition.

In [55]:
# Encode the unlabeled tweets with the 500-feature vocabulary fitted on the training text.
X_unknown = BOW_vectorizer.transform(test_df['text'])

In [56]:
# Softmax scores for every unlabeled tweet (one row per tweet, one column per emotion).
y_unknown_pred = model.predict(X_unknown)

In [57]:
# Turn the score rows into emotion names.
# NOTE(review): the name is reused for a different kind of value. Re-running
# only this cell would pass the decoded labels back into label_decode, so the
# predict cell must be re-run first.
y_unknown_pred = label_decode(label_encoder, y_unknown_pred)

In [58]:
# Column-wise submission data: tweet ids next to their predicted emotion.
predict_test = dict(
    id=list(test_df['id']),
    emotion=list(y_unknown_pred),
)

In [59]:
# Build the id/emotion table and write the competition file without the index column.
predict_test = pd.DataFrame(data=predict_test)
predict_test.to_csv(path_or_buf='submission.csv', index=False)