In [1]:
import tensorflow as tf
import pandas as pd
import sagemaker




In [2]:
# Load the raw airline tweets; pandas' default delimiter is already a comma.
df = pd.read_csv('Tweets.csv')

In [3]:
df.head(5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


### Select relevant columns

In [4]:
# Keep only the tweet text and its sentiment label.
relevant_columns = ['text', 'airline_sentiment']
tweet_and_sentiment = df[relevant_columns]
tweet_and_sentiment.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


### Select only positive and negative tweets

In [5]:
# Drop neutral tweets so that only the positive/negative classes remain.
is_not_neutral = tweet_and_sentiment['airline_sentiment'].ne('neutral')
tweet_and_sentiment = tweet_and_sentiment[is_not_neutral]
tweet_and_sentiment.head()

Unnamed: 0,text,airline_sentiment
1,@VirginAmerica plus you've added commercials t...,positive
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative
5,@VirginAmerica seriously would pay $30 a fligh...,negative
6,"@VirginAmerica yes, nearly every time I fly VX...",positive


In [6]:
# Hold out the last rows for testing; everything before them is training data.
n_test = 10
train = tweet_and_sentiment.iloc[:-n_test]
test = tweet_and_sentiment.iloc[-n_test:]

### Convert sentiment label to numeric category

In [7]:
# Encode the sentiment strings as integer codes.
# `labels` is a (codes, uniques) pair, as shown in the output below.
labels = pd.factorize(train.airline_sentiment)
labels

(array([0, 1, 1, ..., 1, 0, 1]),
 Index(['positive', 'negative'], dtype='object'))

The `factorize` method encodes each distinct string as an integer code and returns the distinct strings as an `Index`; a string's position in that `Index` is its code.  
So for `Index(['positive', 'negative'], dtype='object')`, positive = 0 and negative = 1.

In [8]:
# `labels` is the (codes, uniques) pair produced by factorize: keep the
# uniques so numeric predictions can be mapped back to sentiment names later.
labels_index = labels[1]
# `train` is a slice of `tweet_and_sentiment`, so assigning a column in place
# raises pandas' SettingWithCopyWarning. `assign` returns a new DataFrame with
# the column replaced and leaves the parent frame untouched.
train = train.assign(airline_sentiment=labels[0])
train.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,text,airline_sentiment
1,@VirginAmerica plus you've added commercials t...,0
3,@VirginAmerica it's really aggressive to blast...,1
4,@VirginAmerica and it's a really big bad thing...,1
5,@VirginAmerica seriously would pay $30 a fligh...,1
6,"@VirginAmerica yes, nearly every time I fly VX...",0


In [9]:
# Persist both splits as CSV files without the DataFrame index column.
for split, csv_name in ((train, 'train.csv'), (test, 'test.csv')):
    split.to_csv(csv_name, index=False)

In [10]:
# Session object used below to upload the training CSV.
sage_maker_session = sagemaker.Session()

In [11]:
# Upload the training CSV under "<prefix>/training" and keep the location
# that upload_data returns.
prefix = 'tensorflow_sentiment_analysis'
training_key_prefix = prefix + '/training'
training_input_path = sage_maker_session.upload_data(
    'train.csv',
    key_prefix=training_key_prefix,
)

In [12]:
training_input_path

's3://sagemaker-us-east-1-575814765949/tensorflow_sentiment_analysis/training/train.csv'

In [13]:
# Read the uploaded training set back from the path returned by the upload.
training_data = pd.read_csv(training_input_path)

In [14]:
training_data.head(5)

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica plus you've added commercials t...,0
1,@VirginAmerica it's really aggressive to blast...,1
2,@VirginAmerica and it's a really big bad thing...,1
3,@VirginAmerica seriously would pay $30 a fligh...,1
4,"@VirginAmerica yes, nearly every time I fly VX...",0


In [17]:
%%writefile train.py
import argparse
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense
from tensorflow.keras.layers import Embedding, Dropout
import pandas as pd

if __name__ == '__main__':
    
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=100)
    parser.add_argument('--learning-rate', type=float, default=0.1)
    

    parser.add_argument('--gpu-count', type=int, default=os.environ['SM_NUM_GPUS'])

    # input data and model directories
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    #parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])

    args, _ = parser.parse_known_args()
    
    epochs     = args.epochs
    lr         = args.learning_rate
    batch_size = args.batch_size
    gpu_count  = args.gpu_count
    model_dir  = args.model_dir
    training_dir   = args.train
    
    training_data = pd.read_csv(training_dir+'/train.csv',sep=',')
    tweet = training_data.text.values
    labels = training_data.airline_sentiment.values
    
    num_of_words = 5000
    token = Tokenizer(num_words=num_of_words)
    token.fit_on_texts(tweet)
    
    vocab_size = len(token.word_index) + 1 # 1 is added due to 0 index
    
    tweet_sequence = token.texts_to_sequences(tweet)
    
    max_len = 200
    padded_tweet_sequence = pad_sequences(tweet_sequence, maxlen=max_len)
    
    # Build the model
    embedding_vector_length = 32
    model = Sequential() 
    model.add(Embedding(vocab_size, embedding_vector_length, input_length=max_len) )
    model.add(Dropout(0.2))
    model.add(LSTM(100)) 
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid')) 
    model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy']) 
    
    model.fit(padded_tweet_sequence,labels,validation_split=0.3, epochs=epochs, batch_size=batch_size, verbose=2)
    
    tf.saved_model.simple_save(
        tf.keras.backend.get_session(),
        os.path.join(model_dir, '1'),
        inputs={'inputs': model.input},
        outputs={t.name: t for t in model.outputs})



Overwriting train.py


In [18]:
# TensorFlow version of this kernel; passed as framework_version to the estimator below.
tf_version = tf.__version__
tf_version

'1.15.2'

In [19]:
# Execution role handed to the TensorFlow estimator below.
role = sagemaker.get_execution_role()

In [20]:
from sagemaker.tensorflow import TensorFlow

# Hyperparameters forwarded to train.py as command-line arguments.
training_hyperparameters = {
    'epochs': 10,
    'batch-size': 64
}

# Script-mode estimator that runs train.py on a single training instance.
tf_estimator = TensorFlow(
    entry_point='train.py',
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c5.18xlarge',
    framework_version=tf_version,
    py_version='py3',
    script_mode=True,
    hyperparameters=training_hyperparameters,
)

In [21]:
training_input_path

's3://sagemaker-us-east-1-575814765949/tensorflow_sentiment_analysis/training/train.csv'

In [22]:
# Start the training job; the uploaded CSV is supplied as the 'train' channel.
tf_estimator.fit({'train': training_input_path})

2020-04-15 00:45:27 Starting - Starting the training job...
2020-04-15 00:45:29 Starting - Launching requested ML instances......
2020-04-15 00:46:35 Starting - Preparing the instances for training...
2020-04-15 00:47:25 Downloading - Downloading input data...
[0m
[34m2020-04-15 00:47:48,312 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2020-04-15 00:47:48,321 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-04-15 00:47:48,718 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-04-15 00:47:48,734 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-04-15 00:47:48,749 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-04-15 00:47:48,760 sagemaker-containers INFO     Invoking user script
[0m
[34mTraining Env:
[0m
[34m{
    "additional_framework_parameters": {},
    "channel_

In [23]:
import time

# Give the endpoint a unique name by appending the current UTC timestamp.
deploy_timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
endpoint_name = 'tensorflow-sentiment-analysis' + deploy_timestamp

# Deploy the trained model behind a single-instance endpoint.
end_point = tf_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.4xlarge',
    endpoint_name=endpoint_name,
)

-------------!

In [27]:
print(end_point.endpoint)

tensorflow-sentiment-analysis2020-04-15-01-23-15


In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_texts(text, num_of_words=5000, max_len=200):
    """Turn raw tweets into padded integer sequences for the model.

    The tokenizer is fitted on the notebook-level `training_data.text`
    column on every call. The defaults mirror the values used in train.py
    and must stay in sync with it.

    Args:
        text: an iterable of tweet strings, or a single tweet string.
        num_of_words: `num_words` passed to the Keras Tokenizer.
        max_len: `maxlen` passed to `pad_sequences`.

    Returns:
        The array returned by `pad_sequences`, one row per input tweet.
    """
    # A bare string would be iterated character by character by the
    # tokenizer, so wrap it to treat it as one tweet.
    if isinstance(text, str):
        text = [text]

    token = Tokenizer(num_words=num_of_words)
    token.fit_on_texts(training_data.text.values)

    tweet_sequence = token.texts_to_sequences(text)
    return pad_sequences(tweet_sequence, maxlen=max_len)

In [29]:
# Convert the held-out tweets into padded integer sequences for the endpoint.
test_texts = preprocess_texts(test.text.values)

In [32]:
import numpy as np

# Send each held-out tweet to the endpoint and compare with its true label.
# Predictions are decoded through `labels_index` (the factorize categories
# kept when the training labels were encoded) instead of a hard-coded list,
# and the notebook-level `labels` tuple is no longer overwritten, so the
# encoding cell can be re-run safely afterwards.
for i, tweet in enumerate(test_texts):
    print(test.text.values[i])
    prediction = end_point.predict(tweet)['predictions']
    # Round the model output to the nearest class code (0 or 1).
    predicted_code = int(np.array(prediction).round().item())
    print('Actual sentiment: {} ----- Predicted sentiment  {} \n'.format(test.airline_sentiment.values[i],labels_index[predicted_code]))

@AmericanAir Flight Cancelled Flightled, can't go home until tomorrow. I could use dinner and a play, @AmericanAir! It's my first time in NYC.
Actual sentiment: negative ----- Predicted sentiment  negative 

Thank you. “@AmericanAir: @jlhalldc Customer Relations will review your concerns and contact you back directly, John.”
Actual sentiment: positive ----- Predicted sentiment  positive 

@AmericanAir How do I change my flight if the phone system keeps telling me that the representatives are busy?
Actual sentiment: negative ----- Predicted sentiment  negative 

@AmericanAir Thanks! He is.
Actual sentiment: positive ----- Predicted sentiment  positive 

@AmericanAir thx for nothing on getting us out of the country and back to US. Broken plane? Come on. Get another one.
Actual sentiment: negative ----- Predicted sentiment  negative 

@AmericanAir my flight was Cancelled Flightled, leaving tomorrow morning. Auto rebooked for a Tuesday night flight but need to arrive Monday.
Actual sentime

In [225]:
# Delete the endpoint deployed above.
end_point.delete_endpoint()