In [1]:
import boto3
import pandas as pd

In [2]:
s3 = boto3.resource(service_name='s3')
obj = s3.Bucket('midstestbox').Object('ISEAR_clean.csv').get()
df = pd.read_csv(obj['Body'], sep=",")

df.dropna(inplace=True)

print(df.shape)

(7666, 2)


In [3]:
df.head()

Unnamed: 0,emotion,text
0,joy,"During the period of falling in love, each tim..."
1,fear,When I was involved in a traffic accident.
2,anger,When I was driving home after several days of...
3,sadness,When I lost the person who meant the most to me.
4,disgust,The time I knocked a deer down - the sight of ...


In [4]:
df["text"].head()

0    During the period of falling in love, each tim...
1           When I was involved in a traffic accident.
2    When I was driving home after  several days of...
3    When I lost the person who meant the most to me. 
4    The time I knocked a deer down - the sight of ...
Name: text, dtype: object

In [5]:
pip install gensim

Collecting gensim
  Downloading gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 15.7 MB/s eta 0:00:01
Collecting smart-open>=1.8.1
  Downloading smart_open-3.0.0.tar.gz (113 kB)
[K     |████████████████████████████████| 113 kB 68.7 MB/s eta 0:00:01
Building wheels for collected packages: smart-open
  Building wheel for smart-open (setup.py) ... [?25ldone
[?25h  Created wheel for smart-open: filename=smart_open-3.0.0-py3-none-any.whl size=107097 sha256=45e8f5d4b09202e23b558b65a89d66ea2890c667f7e68e5ea79b8c41274254cf
  Stored in directory: /home/ec2-user/.cache/pip/wheels/88/2a/d4/f2e9023989d4d4b3574f268657cb6cd23994665a038803f547
Successfully built smart-open
Installing collected packages: smart-open, gensim
Successfully installed gensim-3.8.3 smart-open-3.0.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the ker

In [6]:
import gensim
from gensim.models import Word2Vec, word2vec

In [7]:
from gensim.parsing.preprocessing import remove_stopwords

sentences = [ remove_stopwords(x).split(" ") for x in df["text"]]

sentences[1]

['When', 'I', 'involved', 'traffic', 'accident.']

In [8]:
#sentences = word2vec.Text8Corpus(obj['Body']) 
#sentences=word2vec.Text8Corpus()

LATENT_FEATURE_SIZE = 50


# load corpus 
model = Word2Vec(sentences, sg=1, size=LATENT_FEATURE_SIZE,  window=5,  min_count=5,  negative=3, sample=0.001, hs=1, workers=4) 

In [9]:
model.wv.similar_by_word('happy', topn =5)

[('successful', 0.8997143507003784),
 ('learnt', 0.8923747539520264),
 ('exam,', 0.8919605016708374),
 ('exams,', 0.8894598484039307),
 ('failed.', 0.8768682479858398)]

In [10]:
model.save("./ISEAR_w2v.model")
s3.Bucket('midstestbox').upload_file('./ISEAR_w2v.model', 'ISEAR_w2v.mode')

In [11]:
import numpy as np


def sentence_2_vec(sentence):
    ''' 
    @para: sentence : vector of token list
    '''
    vec = np.zeros(LATENT_FEATURE_SIZE).reshape((1, LATENT_FEATURE_SIZE))
    count = 0
    for word in sentence:
        try:
            count += 1
            vec += model[word].reshape((1, LATENT_FEATURE_SIZE))
            # print(vec)
        except KeyError:
            continue
    vec /= count

    return vec[0]


In [12]:
#print(sentence_2_vec(sentences[1]))
 

In [13]:
from sklearn.model_selection import train_test_split


labels = df.emotion.factorize() 
labels_index = labels[1]
df['emotion'] = labels[0]


X_train, X_test, y_train, y_test = train_test_split(df.text, df.emotion, test_size=0.3, random_state=123, shuffle=True)

X_train = [sentence_2_vec(x) for x in X_train]
X_test  = [sentence_2_vec(x) for x in X_test]


print("training size {}".format(len(X_train)))
print("testing size {}".format(len(X_test)))



training size 5366
testing size 2300


In [14]:
#X_train[1]

In [15]:
#y_train[:5]

In [16]:
#X_train

In [18]:
'''
Method 1
@kernel: RBF
'''
from sklearn import svm
from sklearn.metrics import classification_report

clf = svm.SVC() 

clf_res = clf.fit(X_train, y_train)
test_pred = clf_res.predict(X_test)
print(classification_report(y_test, test_pred))

#clf.save("ISEAR_w2v_svm.clf")

              precision    recall  f1-score   support

           0       0.27      0.15      0.19       322
           1       0.16      0.12      0.14       315
           2       0.22      0.37      0.28       337
           3       0.25      0.20      0.22       310
           4       0.16      0.06      0.09       339
           5       0.16      0.02      0.03       338
           6       0.19      0.51      0.28       339

    accuracy                           0.21      2300
   macro avg       0.20      0.20      0.17      2300
weighted avg       0.20      0.21      0.17      2300



In [67]:
'''
Method 2
@kernel: LinearSVC
'''
from sklearn.svm import LinearSVC


clf = svm.LinearSVC() 

clf_res = clf.fit(X_train, y_train)
test_pred = clf_res.predict(X_test)
print(classification_report(y_test, test_pred))

              precision    recall  f1-score   support

           0       0.18      0.28      0.22       322
           1       0.23      0.09      0.13       315
           2       0.19      0.51      0.27       337
           3       0.24      0.17      0.20       310
           4       0.17      0.01      0.01       339
           5       0.17      0.04      0.07       338
           6       0.18      0.24      0.20       339

    accuracy                           0.19      2300
   macro avg       0.19      0.19      0.16      2300
weighted avg       0.19      0.19      0.16      2300



In [68]:
'''
Method 3
@kernel: DecisionTreeClassifier
'''
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier() 

clf_res = clf.fit(X_train, y_train)
test_pred = clf_res.predict(X_test)
print(classification_report(y_test, test_pred))

              precision    recall  f1-score   support

           0       0.15      0.21      0.18       322
           1       0.12      0.13      0.12       315
           2       0.17      0.15      0.16       337
           3       0.17      0.22      0.19       310
           4       0.15      0.12      0.13       339
           5       0.15      0.13      0.14       338
           6       0.25      0.17      0.20       339

    accuracy                           0.16      2300
   macro avg       0.17      0.16      0.16      2300
weighted avg       0.17      0.16      0.16      2300

