In [1]:
import boto3
import pandas as pd

In [2]:
s3 = boto3.resource(service_name='s3')
obj = s3.Bucket('midstestbox').Object('ISEAR_clean.csv').get()
df = pd.read_csv(obj['Body'], sep=",")

df.dropna(inplace=True)

print(df.shape)

(7666, 2)


In [3]:
df.head()

Unnamed: 0,emotion,text
0,joy,"During the period of falling in love, each tim..."
1,fear,When I was involved in a traffic accident.
2,anger,When I was driving home after several days of...
3,sadness,When I lost the person who meant the most to me.
4,disgust,The time I knocked a deer down - the sight of ...


In [4]:
from sklearn.model_selection import train_test_split

training_data, testing_data, y_train, y_test = train_test_split(df.text, df.emotion, test_size=0.3, random_state=123, shuffle=True)

print("training size {}".format(len(training_data)))
print("testing size {}".format(len(testing_data)))

training size 5366
testing size 2300


In [5]:
# Vectorilize  
from sklearn.feature_extraction.text import TfidfVectorizer

# extract features
vectorizer1 = TfidfVectorizer(stop_words = "english")
x_train = vectorizer1.fit_transform(training_data)

# Use training data's vocabulary to create test tf-idf matrix
vectorizer2 = TfidfVectorizer(stop_words = "english",vocabulary=vectorizer1.vocabulary_)
x_test = vectorizer2.fit_transform(testing_data)


In [6]:
# save the tf_idf vectorizer1
import pickle

with open("tf_idf_vec.pkl", 'wb') as handle:
     pickle.dump(vectorizer1, handle)

In [7]:
# Save to S3
s3_client = boto3.client('s3')

response = s3_client.upload_file("tf_idf_vec.pkl", 'sentiment-model2020', "tf_idf_vec.pkl")

In [8]:
print("Size of training")
print(x_train.shape, x_test.shape)

Size of training
(5366, 7307) (2300, 7307)


In [9]:
# Take LogisticRegression as a baseline

from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression(random_state=0,max_iter=200 , solver='saga', penalty='elasticnet', l1_ratio=0.005).fit(x_train, y_train)
predict_emotion = model_lr.predict(x_test)



In [10]:
with open("model_lr_v1.pkl", 'wb') as handle:
     pickle.dump(model_lr, handle)
        
response = s3_client.upload_file("model_lr_v1.pkl", 'sentiment-model2020', "model_lr_v1.pkl")

In [11]:
res = pd.DataFrame( data = {"answer" : y_test, "predict" : predict_emotion})
res.head()

Unnamed: 0,answer,predict
5163,anger,anger
1397,guilt,guilt
6359,joy,joy
1141,anger,disgust
2300,sadness,joy


In [12]:
# Model assessment

from sklearn.metrics import *

precision = precision_score(y_test, predict_emotion, average='micro' )
recall = recall_score(y_test, predict_emotion, average='micro' )
f1 = f1_score(y_test, predict_emotion, average='micro' )

print("TF-IDF  +  LogisticRegression Result: ")

print("Precision: {0:0.4f}".format(precision))
print("Recall: {0:0.4f}".format(recall) )
print("LogisticRegression  f1: {0:0.4f}".format(f1) )


TF-IDF  +  LogisticRegression Result: 
Precision: 0.5465
Recall: 0.5465
LogisticRegression  f1: 0.5465


In [14]:
'''
Method 3
@kernel: DecisionTreeClassifier
'''
from sklearn.tree import DecisionTreeClassifier
 
 
test_pred = model_lr.predict(x_test)
print(classification_report(y_test, test_pred))

NameError: name 'clf_res' is not defined