In [24]:
!pip install textblob

Collecting textblob
[?25l  Downloading https://files.pythonhosted.org/packages/60/f0/1d9bfcc8ee6b83472ec571406bd0dd51c0e6330ff1a51b2d29861d389e85/textblob-0.15.3-py2.py3-none-any.whl (636kB)
[K    100% |████████████████████████████████| 645kB 29.8MB/s ta 0:00:01
Installing collected packages: textblob
Successfully installed textblob-0.15.3
[33mYou are using pip version 19.0.2, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [36]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import *
from keras.utils import np_utils
from nltk.tokenize import RegexpTokenizer
from keras.models import Model,load_model
import textblob
from keras.callbacks import ModelCheckpoint
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from keras.callbacks import Callback
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
from keras.models import Model
from sklearn.preprocessing import OneHotEncoder
import re
%matplotlib inline

In [2]:
df_train = pd.read_csv('./dataset/hm_train.csv')
df_test = pd.read_csv('./dataset/hm_test.csv')

In [3]:
df_train.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
0,27673,24h,I went on a successful date with someone I fel...,1,affection
1,27674,24h,I was happy when my son got 90% marks in his e...,1,affection
2,27675,24h,I went to the gym this morning and did yoga.,1,exercise
3,27676,24h,We had a serious talk with some friends of our...,2,bonding
4,27677,24h,I went with grandchildren to butterfly display...,1,affection


In [4]:
df_test.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence
0,88305,3m,I spent the weekend in Chicago with my friends.,1
1,88306,3m,We moved back into our house after a remodel. ...,2
2,88307,3m,My fiance proposed to me in front of my family...,1
3,88308,3m,I ate lobster at a fancy restaurant with some ...,1
4,88309,3m,I went out to a nice restaurant on a date with...,5


In [5]:
print(df_train.shape)
print(df_test.shape)

(60321, 5)
(40213, 4)


In [6]:
labels = df_train['predicted_category']
print(np.unique(labels,return_counts=True))

(array(['achievement', 'affection', 'bonding', 'enjoy_the_moment',
       'exercise', 'leisure', 'nature'], dtype=object), array([20274, 20880,  6561,  6508,   729,  4242,  1127]))


In [7]:
label_dict = { 0 : 'achievement', 1 : 'affection', 2 : 'bonding', 3 : 'enjoy_the_moment', 4 : 'exercise', 5 : 'leisure', 6 : 'nature'}

In [8]:
rev_mapping = { 'achievement' : 0, 'affection' : 1, 'bonding' : 2, 'enjoy_the_moment' : 3, 'exercise' : 4, 'leisure' : 5, 'nature' : 6}

In [9]:
df_train['predicted_category'].replace(rev_mapping, inplace=True)

In [10]:
df_train.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
0,27673,24h,I went on a successful date with someone I fel...,1,1
1,27674,24h,I was happy when my son got 90% marks in his e...,1,1
2,27675,24h,I went to the gym this morning and did yoga.,1,4
3,27676,24h,We had a serious talk with some friends of our...,2,2
4,27677,24h,I went with grandchildren to butterfly display...,1,1


In [11]:
data = df_train.values

In [12]:
train = pd.DataFrame()
test = pd.DataFrame()

In [13]:
train['Phrase'] = df_train['cleaned_hm']
train['Sentiment'] = df_train['predicted_category']
train['SentenceId'] = df_train['hmid']

test['Phrase'] = df_test['cleaned_hm']
test['SentenceId'] = df_test['hmid']
print (train.shape, test.shape)

(60321, 3) (40213, 2)


In [14]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

train['Phrase'] = train['Phrase'].map(lambda com : clean_text(com))
test['Phrase'] = test['Phrase'].map(lambda com : clean_text(com))

In [15]:
cv1 = CountVectorizer()
cv1.fit(train["Phrase"])

cv2 = CountVectorizer()
cv2.fit(test["Phrase"])

print("Train Set Vocabulary Size:", len(cv1.vocabulary_))
print("Test Set Vocabulary Size:", len(cv2.vocabulary_))
print("Number of Words that occur in both:", len(set(cv1.vocabulary_.keys()).intersection(set(cv2.vocabulary_.keys()))))

Train Set Vocabulary Size: 20509
Test Set Vocabulary Size: 16946
Number of Words that occur in both: 12141


In [26]:
def transform(df):
    df["word_count"] = df["Phrase"].apply(lambda x: len(x.split()))
    df["has_upper"] = df["Phrase"].apply(lambda x: x.lower() != x)
    df["sentence_end"] = df["Phrase"].apply(lambda x: x.endswith("."))
    df["after_comma"] = df["Phrase"].apply(lambda x: x.startswith(","))
    df["Phrase"] = df["Phrase"].apply(lambda x: x.lower())
    return df

train = transform(train)
test = transform(test)

def getSentFeat(s , polarity):
    sent = textblob.TextBlob(s).sentiment
    if polarity:
        return sent.polarity
    else :
        return sent.subjectivity
    
train['polarity'] = train['Phrase'].apply(lambda x: getSentFeat(x , polarity=True))
train['subjectivity'] = train['Phrase'].apply(lambda x: getSentFeat(x , polarity=False))

test['polarity'] = test['Phrase'].apply(lambda x: getSentFeat(x , polarity=True))
test['subjectivity'] = test['Phrase'].apply(lambda x: getSentFeat(x , polarity=False))

dense_features = ["word_count", "has_upper", "after_comma", "sentence_end" ,"polarity","subjectivity"]

train.groupby("Sentiment")[dense_features].mean()

Unnamed: 0_level_0,word_count,has_upper,after_comma,sentence_end,polarity,subjectivity
Sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,16.812913,False,False,False,0.190756,0.414953
1,21.344157,False,False,False,0.239366,0.429889
2,19.489864,False,False,False,0.257471,0.394704
3,21.589121,False,False,False,0.334586,0.544569
4,13.489712,False,False,False,0.24167,0.380579
5,12.437294,False,False,False,0.156239,0.382447
6,18.655723,False,False,False,0.295409,0.53962


In [16]:
NUM_FOLDS = 5

train["fold_id"] = train["SentenceId"].apply(lambda x: x%NUM_FOLDS)

In [17]:
EMBEDDING_FILE = "./glove.6B.100d.txt"
EMBEDDING_DIM = 100

all_words = set(cv1.vocabulary_.keys()).union(set(cv2.vocabulary_.keys()))

def get_embedding():
    embeddings_index = {}
    f = open(EMBEDDING_FILE)
    for line in f:
        values = line.split()
        word = values[0]
        if len(values) == EMBEDDING_DIM + 1 and word in all_words:
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
    f.close()
    return embeddings_index

embeddings_index = get_embedding()
print("Number of words that don't exist in GLOVE:", len(all_words - set(embeddings_index)))

Number of words that don't exist in GLOVE: 2219


In [18]:
MAX_SEQUENCE_LENGTH = 60

tokenizer = Tokenizer(filters="")
tokenizer.fit_on_texts(np.append(train["Phrase"].values, test["Phrase"].values))
word_index = tokenizer.word_index

nb_words = len(word_index) + 1
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
seq = pad_sequences(tokenizer.texts_to_sequences(train["Phrase"]), maxlen=MAX_SEQUENCE_LENGTH)
test_seq = pad_sequences(tokenizer.texts_to_sequences(test["Phrase"]), maxlen=MAX_SEQUENCE_LENGTH)
seq.shape

(60321, 60)

In [19]:
def build_model():
    embedding_layer = Embedding(nb_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    dropout = SpatialDropout1D(0.2)
    mask_layer = Masking()
    lstm_layer = LSTM(50)
    
    seq_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")
    dense_input = Input(shape=(len(dense_features),))
    
    dense_vector = BatchNormalization()(dense_input)
    
    phrase_vector = lstm_layer(mask_layer(dropout(embedding_layer(seq_input))))
    
    feature_vector = concatenate([phrase_vector, dense_vector])
    feature_vector = Dense(50, activation="relu")(feature_vector)
    feature_vector = Dense(20, activation="relu")(feature_vector)
    
    output = Dense(7, activation="softmax")(feature_vector)
    
    model = Model(inputs=[seq_input, dense_input], outputs=output)
    return model

In [38]:
enc = OneHotEncoder(sparse=False)
enc.fit(train["Sentiment"].values.reshape(-1, 1))

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=False)

In [39]:
test_preds = np.zeros((test.shape[0], 7))

for i in range(NUM_FOLDS):
    print("FOLD", i+1)
    
    print("Splitting the data into train and validation...")
    train_seq, val_seq = seq[train["fold_id"] != i], seq[train["fold_id"] == i]
    train_dense, val_dense = train[train["fold_id"] != i][dense_features], train[train["fold_id"] == i][dense_features]
    y_train = enc.transform(train[train["fold_id"] != i]["Sentiment"].values.reshape(-1, 1))
    y_val = enc.transform(train[train["fold_id"] == i]["Sentiment"].values.reshape(-1, 1))
    
    print("Building the model...")
    model = build_model()
    model.compile(loss="categorical_crossentropy", optimizer="nadam", metrics=["acc"])
    
    early_stopping = EarlyStopping(monitor="val_acc", patience=2, verbose=1)
    file_path = "best_model.hdf5"
    check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1, save_best_only = True, mode = "min")
    
    print("Training the model...")
    model.fit([train_seq, train_dense], y_train, validation_data=([val_seq, val_dense], y_val),
              epochs=15, batch_size=1024, shuffle=True, callbacks=[check_point,early_stopping], verbose=1)
    
    print("Predicting...")
    test_preds += model.predict([test_seq, test[dense_features]], batch_size=1024, verbose=1)
    print()
    
test_preds /= NUM_FOLDS

FOLD 1
Splitting the data into train and validation...
Building the model...
Training the model...
Train on 48253 samples, validate on 12068 samples
Epoch 1/15

Epoch 00001: val_loss improved from inf to 0.87628, saving model to best_model.hdf5
Epoch 2/15

Epoch 00002: val_loss improved from 0.87628 to 0.45242, saving model to best_model.hdf5
Epoch 3/15

Epoch 00003: val_loss improved from 0.45242 to 0.37450, saving model to best_model.hdf5
Epoch 4/15

Epoch 00004: val_loss improved from 0.37450 to 0.33338, saving model to best_model.hdf5
Epoch 5/15

Epoch 00005: val_loss improved from 0.33338 to 0.30094, saving model to best_model.hdf5
Epoch 6/15

Epoch 00006: val_loss did not improve from 0.30094
Epoch 7/15

Epoch 00007: val_loss did not improve from 0.30094
Epoch 8/15

Epoch 00008: val_loss did not improve from 0.30094
Epoch 9/15

Epoch 00009: val_loss improved from 0.30094 to 0.27127, saving model to best_model.hdf5
Epoch 10/15

Epoch 00010: val_loss did not improve from 0.27127
Ep


Epoch 00008: val_loss improved from 0.29712 to 0.29501, saving model to best_model.hdf5
Epoch 9/15

Epoch 00009: val_loss did not improve from 0.29501
Epoch 10/15

Epoch 00010: val_loss improved from 0.29501 to 0.28824, saving model to best_model.hdf5
Epoch 11/15

Epoch 00011: val_loss did not improve from 0.28824
Epoch 12/15

Epoch 00012: val_loss did not improve from 0.28824
Epoch 13/15

Epoch 00013: val_loss did not improve from 0.28824
Epoch 14/15

Epoch 00014: val_loss did not improve from 0.28824
Epoch 00014: early stopping
Predicting...

FOLD 5
Splitting the data into train and validation...
Building the model...
Training the model...
Train on 48269 samples, validate on 12052 samples
Epoch 1/15

Epoch 00001: val_loss improved from inf to 0.82368, saving model to best_model.hdf5
Epoch 2/15

Epoch 00002: val_loss improved from 0.82368 to 0.46802, saving model to best_model.hdf5
Epoch 3/15

Epoch 00003: val_loss improved from 0.46802 to 0.38480, saving model to best_model.hdf5
Epo

In [43]:
print("Select the class with the highest probability as prediction...")
test["pred"] = test_preds.argmax(axis=1)


print("Make the submission ready...")
test["Sentiment"] = test["pred"].astype(int)

Select the class with the highest probability as prediction...
Make the submission ready...


In [44]:
ans_final = pd.DataFrame()

In [45]:
ans_final['hmid'] = test['SentenceId']
ans_final['predicted_category'] = test['Sentiment']

In [47]:
ans_final['predicted_category'].replace(label_dict, inplace=True)

In [48]:
ans_final.head()

Unnamed: 0,hmid,predicted_category
0,88305,bonding
1,88306,affection
2,88307,affection
3,88308,bonding
4,88309,affection


In [49]:
ans_final.to_csv('ans_final.csv', sep=',',index=False)