In [1]:
#import pandas to read data
import pandas as pd

In [2]:
#read data
data = pd.read_csv('twitter_training.csv', header=None)
data.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [3]:
#checking for null values
data.isnull().sum()

0      0
1      0
2      0
3    686
dtype: int64

In [4]:
#deleting null values
data.dropna(inplace=True)
data.isnull().sum()
data.shape

(73996, 4)

In [5]:
#value counts of all types
data[2].value_counts()

Negative      22358
Positive      20655
Neutral       18108
Irrelevant    12875
Name: 2, dtype: int64

In [6]:
#deleting reviews which are irrelevant as it won't help in sentiment analysis
data = data[data[2]!='Irrelevant']
data[2].value_counts()

Negative    22358
Positive    20655
Neutral     18108
Name: 2, dtype: int64

In [7]:
#dropping columns 0 and 1, as they won't help in sentiment analysis
data.drop(columns=[0,1], inplace=True)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,2,3
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [8]:
#cleaning data
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
#removing not, no, aren't, ... type of words.
stopwords = stopwords.words('english')
stopwords.remove('not')
stopwords.remove('no')
for i in stopwords:
  x = re.search("n't$", i)
  if x is not None:
    stopwords.remove(i)
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [10]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
reviews = []
max_words_len = 0
Lemmatizer = WordNetLemmatizer()
def clean_sentence(sentence):
  global max_words_len
  review = re.sub('[^a-zA-Z]', ' ',sentence)
  review = review.lower()
  review = review.split()
  review = [Lemmatizer.lemmatize(word) for word in review if not word in stopwords]
  max_words_len = max(len(review), max_words_len)
  review = ' '.join(review)
  return review

for sentence in data[3]:
  reviews.append(clean_sentence(sentence))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [11]:
print(data[3].index)

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            74672, 74673, 74674, 74675, 74676, 74677, 74678, 74679, 74680,
            74681],
           dtype='int64', length=61121)


In [12]:
print(len(reviews))

61121


In [13]:
X = pd.DataFrame(reviews)
X.head()
# X.shape

Unnamed: 0,0
0,im getting borderland murder
1,coming border kill
2,im getting borderland kill
3,im coming borderland murder
4,im getting borderland murder


In [14]:
#one hot encoding for sentiments and creating new dataset
y = pd.get_dummies(data[2])
y.head()
y.shape

(61121, 3)

In [15]:
#tokenizing and padding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X[0].values)
sequences = tokenizer.texts_to_sequences(X[0].values)
word_index = tokenizer.word_index
X = pad_sequences(sequences, maxlen=max_words_len, padding='pre')

In [16]:
# splitting the data for training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
X_train.shape,X_test.shape

((48896, 163), (12225, 163))

In [17]:
# Deep Learning Model Creation
from keras.models import Sequential
from keras.layers import Dense, Embedding, SpatialDropout1D,LSTM
model = Sequential()

In [18]:
n_most_words = len(word_index)+1
n_dim = max_words_len + 10
model.add(Embedding(n_most_words,n_dim,input_length = X_train.shape[1]))
model.add(SpatialDropout1D(0.25))
model.add(LSTM(75,dropout=0.25,recurrent_dropout=0.25))
model.add(Dense(25,activation='relu'))
model.add(Dense(3,activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 163, 173)          3994743   
                                                                 
 spatial_dropout1d (SpatialD  (None, 163, 173)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 75)                74700     
                                                                 
 dense (Dense)               (None, 25)                1900      
                                                                 
 dense_1 (Dense)             (None, 3)                 78        
                                                                 
Total params: 4,071,421
Trainable params: 4,071,421
Non-trainable params: 0
______________________________________________

In [19]:
from keras.callbacks import ModelCheckpoint, EarlyStopping
checkpoint = ModelCheckpoint('weights.hdf5', monitor='val_loss', save_best_only=True)
early_stop = EarlyStopping(patience=5)
model_history = model.fit(X_train,y_train,epochs=30,validation_split = 0.1, batch_size = 128 ,callbacks=[checkpoint,early_stop])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30


In [20]:
# checking the model loss and accuracy
model.load_weights('weights.hdf5')
accr = model.evaluate(X_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.293
  Accuracy: 0.893


In [21]:
import numpy as np
def find_sentiment(review):
    seq = tokenizer.texts_to_sequences(review)
    padded = pad_sequences(seq, maxlen=max_words_len)
    pred = model.predict(padded)
    label = ['Negative','Neutral','Positive']
    return label[np.argmax(pred)]
s = input()
print(find_sentiment([s]))

I am confused
Negative


In [22]:
import pickle

In [23]:
pickle.dump(model,open('sentiment_model.pkl','wb'))



In [24]:
pickle.dump(tokenizer,open('tokenizer.pkl','wb'))