In [None]:
import os
import re
import string

import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the NLTK resources used below: the stop-word list and the WordNet
# data that backs the lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()

stopwords = nltk.corpus.stopwords.words('english')

# Dataset location. Defaults to the original local path; set the
# USECASE3_DATASET environment variable to point at the CSV on another
# machine instead of editing the notebook.
DATA_PATH = os.environ.get(
    'USECASE3_DATASET',
    'C:/Users/shubham/Downloads/Virgin Use Case/Usecase3_Dataset.csv',
)
messages = pd.read_csv(DATA_PATH)
messages.head(7)

In [None]:
# Cleaning the Data
def clean_text(text):
    """Normalise a raw message into a list of lemmatised, stop-word-free tokens.

    Steps, in order: lowercase; remove @mentions and http(s) URLs; strip ASCII
    punctuation; split on non-word characters; drop empty tokens and
    stop words; lemmatise what remains.

    Mentions and URLs are removed *before* punctuation is stripped because the
    patterns rely on '@' and '://' still being present in the text.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        Lemmatised tokens in their original order.
    """
    text = text.lower()
    # Each substitution feeds the next one, so the removals actually take effect.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                  r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = "".join(char for char in text if char not in string.punctuation)

    tokens = re.split(r'\W+', text)

    # re.split yields '' at the edges when the text starts/ends with a
    # separator; skip those so they never reach the vocabulary.
    return [wn.lemmatize(word) for word in tokens if word and word not in stopwords]

# Tokenise every message; each row of the new column holds that row's token list
messages['clean_text'] = messages['text'].apply(clean_text)
messages.head()

In [None]:
from sklearn import preprocessing
# Label encoder; it is fitted on the airline_sentiment column in the next cell
# to produce integer class codes.
le = preprocessing.LabelEncoder()

In [None]:
# Fit the encoder on the sentiment labels and store the resulting integer
# codes next to the original column
sentiment_column = messages['airline_sentiment']
messages['airline_sentiment_encoded'] = le.fit_transform(sentiment_column)
messages.head()

In [None]:
from sklearn.model_selection import train_test_split
# Allow up to 100 characters per column when frames are displayed
pd.set_option('display.max_colwidth', 100)

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    messages['clean_text'],
    messages['airline_sentiment_encoded'],
    test_size=0.3,
    random_state=99,
)

y_train.head(5)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the vocabulary from the training split only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Encode both splits as integer sequences using that vocabulary
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Bring every sequence to a fixed length of 100
X_train_seq_padded = pad_sequences(X_train_seq, maxlen=100)
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=100)

In [None]:
# Import the tools needed from keras and define functions to calculate recall and precision
import keras.backend as K
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential

def recall_m(y_true, y_pred):
    """Recall over the batch: rounded true positives / rounded actual positives.

    Both tensors are clipped to [0, 1] and rounded before counting, and
    K.epsilon() is added to the denominator.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the batch: rounded true positives / rounded predicted positives.

    Both tensors are clipped to [0, 1] and rounded before counting, and
    K.epsilon() is added to the denominator.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())

In [None]:
# Build the network: embedding -> LSTM -> dense hidden layer -> 3-way softmax
# NOTE(review): the +1 presumably reserves index 0 for padding - confirm.
vocab_size = len(tokenizer.index_word) + 1

model = Sequential([
    Embedding(vocab_size, 32),
    LSTM(32, dropout=0, recurrent_dropout=0),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax'),
])
model.summary()

In [None]:
# Compile the model
from keras.optimizers import SGD
# NOTE(review): `opt` is built here but is not passed to compile() below, which
# uses the string 'adam' instead - confirm which optimizer was intended.
# NOTE(review): newer Keras releases name this argument `learning_rate` rather
# than `lr` - confirm against the installed version.
opt = SGD(lr=0.05, momentum=0.9)
# Compile with the 'adam' optimizer, the KL-divergence loss, and accuracy plus
# the precision_m / recall_m metric functions defined earlier in the notebook.
model.compile(optimizer='adam',
              loss='kullback_leibler_divergence',
              metrics=['accuracy', precision_m, recall_m])

In [None]:
# Fit the RNN model
from keras.utils import to_categorical
# One-hot encode the integer labels to match the softmax output layer
y_train_binary = to_categorical(y_train)
y_test_binary = to_categorical(y_test)

# Train for 50 epochs in batches of 50, passing the held-out split as validation data
history = model.fit(
    x=X_train_seq_padded,
    y=y_train_binary,
    batch_size=50,
    epochs=50,
    validation_data=(X_test_seq_padded, y_test_binary),
)

In [None]:
# Run the trained model on the padded test sequences; the final layer is a
# 3-unit softmax, so each prediction row holds one score per class.
predictions = model.predict(X_test_seq_padded)