In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM , Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

In [2]:
#loading the dataset
dataset = pd.read_csv("/content/drive/MyDrive/Tweets.csv")

In [3]:
dataset.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,name,retweet_count,text,tweet_created,latitude,longitude
0,5.7e+17,neutral,1.0,,,Virgin America,cairdin,0,@VirginAmerica What @dhepburn said.,24-02-2015 11:35,35.888455,-119.273781
1,5.7e+17,positive,0.3486,,0.0,Virgin America,jnardino,0,@VirginAmerica plus you've added commercials t...,24-02-2015 11:15,37.770971,-119.941025
2,5.7e+17,neutral,0.6837,,,Virgin America,yvonnalynn,0,@VirginAmerica I didn't today... Must mean I n...,24-02-2015 11:15,35.684863,-119.709299
3,5.7e+17,negative,1.0,Bad Flight,0.7033,Virgin America,jnardino,0,@VirginAmerica it's really aggressive to blast...,24-02-2015 11:15,37.061159,-119.279135
4,5.7e+17,negative,1.0,Can't Tell,1.0,Virgin America,jnardino,0,@VirginAmerica and it's a really big bad thing...,24-02-2015 11:14,36.790587,-120.867752


In [4]:
#to check which airlines reviews we have recived
dataset['airline'].unique()

array(['Virgin America', 'United', 'Southwest', 'Delta', 'US Airways',
       'American'], dtype=object)

In [5]:
#what are the sentiment or labels to predict
dataset['airline_sentiment'].unique()

array(['neutral', 'positive', 'negative'], dtype=object)

In [6]:
#checking for the null values
dataset.isnull().sum()

tweet_id                           0
airline_sentiment                  0
airline_sentiment_confidence       0
negativereason                  5462
negativereason_confidence       4118
airline                            0
name                               0
retweet_count                      0
text                               0
tweet_created                      0
latitude                           0
longitude                          0
dtype: int64

In [7]:
#creating a new dataframe with only 2 columns the review and the respected sentiment  
data = dataset[["text","airline_sentiment"]]

In [8]:
#new dataframe
data.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [9]:
#removing all the rows where ever the sentiment is neutral we want to work with positive and negative reviews
dataset = data[data.airline_sentiment != 'neutral']

In [10]:
dataset.head()

Unnamed: 0,text,airline_sentiment
1,@VirginAmerica plus you've added commercials t...,positive
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative
5,@VirginAmerica seriously would pay $30 a fligh...,negative
6,"@VirginAmerica yes, nearly every time I fly VX...",positive


In [11]:
dataset.airline_sentiment.unique()

array(['positive', 'negative'], dtype=object)

In [12]:
dataset['text'][3]

'@VirginAmerica it\'s really aggressive to blast obnoxious "entertainment" in your guests\' faces &amp; they have little recourse'

In [13]:
import nltk
from nltk.corpus import stopwords

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
#checking what all words are considered as stopwords
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [16]:
dataset.info

<bound method DataFrame.info of                                                     text airline_sentiment
1      @VirginAmerica plus you've added commercials t...          positive
3      @VirginAmerica it's really aggressive to blast...          negative
4      @VirginAmerica and it's a really big bad thing...          negative
5      @VirginAmerica seriously would pay $30 a fligh...          negative
6      @VirginAmerica yes, nearly every time I fly VX...          positive
...                                                  ...               ...
14633  @AmericanAir my flight was Cancelled Flightled...          negative
14634         @AmericanAir right on cue with the delays👌          negative
14635  @AmericanAir thank you we got on a different f...          positive
14636  @AmericanAir leaving over 20 minutes Late Flig...          negative
14638  @AmericanAir you have my money, you change my ...          negative

[11541 rows x 2 columns]>

In [17]:
#converting all the text to lower case
dataset['text'] = dataset['text'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
print(dataset['text'].iloc[3])

@virginamerica seriously would pay $30 a flight for seats that didn't have this playing.
it's really the only bad thing about flying va


In [20]:
#removing all the punctuations and symbols
dataset['text'] = dataset['text'].apply(lambda x: re.sub('[^a-zA-Z0-9\s]','',x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [21]:
print(dataset['text'].iloc[3])

virginamerica seriously would pay 30 a flight for seats that didnt have this playing
its really the only bad thing about flying va


In [22]:
dataset.shape

(11541, 2)

In [23]:
#loading the stopwords
stop_words = set(stopwords.words("english"))

In [24]:
#removing the stop words from the corpus
dataset['text'] = dataset['text'].apply(lambda x:' '.join([word for word in x.split() if word not in (stop_words)]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [25]:
print(dataset['text'].iloc[3])

virginamerica seriously would pay 30 flight seats didnt playing really bad thing flying va


In [26]:
#tokenizing our document 
max_features = 2000
tokenizer = Tokenizer(num_words=max_features,split = ' ')
tokenizer.fit_on_texts(dataset['text'].values)

In [27]:
print(dataset['text'].iloc[3])

virginamerica seriously would pay 30 flight seats didnt playing really bad thing flying va


In [28]:
#converting our tokens or text to sequences
seq = tokenizer.texts_to_sequences(dataset['text'].values)

In [29]:
len(seq[3])

14

In [30]:
print(seq[3])

[43, 283, 34, 185, 186, 2, 121, 104, 1980, 56, 115, 376, 78, 1807]


In [31]:
len(seq[12])

15

In [32]:
print(seq[12])

[43, 769, 88, 1190, 710, 2, 242, 683, 47, 434, 146, 46, 383, 927, 1265]


In [33]:
#Doing the padding operation so that all the documents are of same length 
seq = pad_sequences(seq)

In [34]:
print(seq[12])

[   0    0    0    0    0    0    0   43  769   88 1190  710    2  242
  683   47  434  146   46  383  927 1265]


In [35]:
print(seq[20])

[   0    0    0    0    0    0    0    0    0    0    0    0   43  296
 1073 1074 1652  321  315  165 1137  101]


In [36]:
seq.shape

(11541, 22)

In [37]:
#converting the label to numeric form
y = pd.get_dummies(dataset['airline_sentiment'],drop_first=True).values

In [38]:
y[3]

array([0], dtype=uint8)

In [39]:
#splitting the data into train and test
x_train,x_test,y_train,y_test = train_test_split(seq, y, test_size = 0.2,random_state = 0)

In [40]:
x_train.shape

(9232, 22)

In [41]:
x_train[3]


array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          1,   21,  698, 1485,   88,  239,    2, 1827,   46,   10, 1938],
      dtype=int32)

In [42]:
y_train[3]

array([1], dtype=uint8)

In [43]:
#giving the embedding dimension, the size of the dimension which we want to convert our word into
embed_dimension = 128

In [44]:
#Building the model 
model = Sequential()
model.add(Embedding(max_features,embed_dimension,input_length=seq.shape[1])) #Embedding layer - input_Dimension, embed_Dim we need, and the input length of the document
model.add(LSTM(128, dropout=0.2, recurrent_dropout= 0.2)) #buildinf a lstm layer 
model.add(Dense(1,activation = 'sigmoid')) #output layer

In [45]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 22, 128)           256000    
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 387,713
Trainable params: 387,713
Non-trainable params: 0
_________________________________________________________________


In [46]:
#compiling the model
model.compile(loss = 'binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [47]:
#training our model and checking the validation accuracy
model.fit(x_train,y_train,batch_size=100,validation_data=(x_test,y_test),epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7eff33715910>

In [48]:
#predicting a random tweet 
tweets = ["virginamerica has the most incredible customer service i've ever experienced! so refreshing!"]

In [49]:
from nltk import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [50]:
#tokenized_twt = word_tokenize(tweet)

In [51]:
#tokenizing the tweet
tokenizer.fit_on_texts(tweets)

In [52]:
#removing stop words from the tweet
filtered_twts  =[]
for word in tweets:
  if word not in stop_words:
    filtered_twts.append(word)

In [53]:
filtered_twts

["virginamerica has the most incredible customer service i've ever experienced! so refreshing!"]

In [54]:
#converting text to sequence
seq1 = tokenizer.texts_to_sequences(filtered_twts)

In [55]:
seq1

[[43, 1286, 11, 10, 77, 898]]

In [57]:
print(seq1)

[[43, 1286, 11, 10, 77, 898]]


In [58]:
#padding our document according to shape of our data
seq1 = pad_sequences(seq1, maxlen=22)

In [59]:
seq1

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   43, 1286,   11,   10,   77,  898]],
      dtype=int32)

In [60]:
#predicting the sentiment of the tweet
y_pred = model.predict_classes(seq1)



In [61]:
y_pred

array([[0]], dtype=int32)