In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences, to_categorical
from tensorflow.keras.utils import set_random_seed

# Tunable settings for the whole notebook.
SEED = 57            # random seed applied below
dim_lstm = 256       # number of LSTM units
dim_embed = 256      # embedding vector size
traning_batch = 256  # mini-batch size for fit(); (sic) the fit cell refers to this spelling

# Actually apply the seed: one call seeds Python's `random`, NumPy and
# TensorFlow, so weight initialisation and batch shuffling repeat on a
# Restart & Run All instead of differing on every run.
set_random_seed(SEED)

In [2]:
# Load the training and validation splits from the working directory.
df_tr = pd.read_csv("train.csv")
df_vl = pd.read_csv("valid.csv")

# `DataFrame.info()` prints its report and returns None, so call it as a
# statement per frame instead of building a tuple that echoes `(None, None)`.
df_tr.info()
df_vl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130375 entries, 0 to 130374
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  130375 non-null  object 
 1   category    130375 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.0+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32594 entries, 0 to 32593
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   clean_text  32594 non-null  object 
 1   category    32594 non-null  float64
dtypes: float64(1), object(1)
memory usage: 509.4+ KB


(None, None)

In [3]:
# Fit the word index on the training texts only; `oov_token` reserves a
# dedicated "<UNK>" entry for out-of-vocabulary words.
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(df_tr["clean_text"].values)

In [4]:
# One slot per entry in the tokenizer's word index, plus one extra so that
# id 0 (the value the padded rows are filled with) is a valid lookup too.
vocab_size = 1 + len(tokenizer.word_index)
vocab_size

98419

In [5]:
# Turn both splits into lists of word-id sequences with the fitted tokenizer.
x_tr, x_vl = map(
    tokenizer.texts_to_sequences,
    (df_tr.clean_text.values, df_vl.clean_text.values),
)

In [6]:
# Length of the longest training sequence; used as the common padded width.
max_len = max(map(len, x_tr))
max_len

52

In [7]:
# Bring every sequence to exactly `max_len` ids: shorter ones are filled at
# the end, longer ones are cut at the end. Same settings for both splits.
x_tr, x_vl = (
    pad_sequences(seqs, maxlen=max_len, padding="post", truncating="post")
    for seqs in (x_tr, x_vl)
)
x_tr.shape, x_vl.shape

((130375, 52), (32594, 52))

In [8]:
# Peek at one padded training row: word ids first, then fill values out to max_len.
x_tr[1]

array([  99, 2342,    3,   46, 2293,  255,   84,    3, 2743,    3,   59,
        106,   32,   84,  153,  703,  564,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [9]:
# `category` is stored as float (presumably -1.0 / 0.0 / 1.0 — confirm against
# the data). Cast to int and shift by one so the class ids become 0, 1, 2.
# The vectorised cast replaces a per-row Python lambda.
y_tr = df_tr.category.astype(int).values + 1
y_vl = df_vl.category.astype(int).values + 1

# Guard the one-hot step: an id outside 0..2 does not belong to any of the
# three classes, and a negative id could be mis-assigned rather than rejected.
assert all(np.isin(ids, (0, 1, 2)).all() for ids in (y_tr, y_vl)), \
    "category must contain only -1, 0 or 1"

# One-hot encode to match the 3-unit softmax output.
y_tr = to_categorical(y_tr, num_classes=3)
y_vl = to_categorical(y_vl, num_classes=3)
y_tr.shape, y_vl.shape

((130375, 3), (32594, 3))

In [10]:
# Embedding -> LSTM -> softmax classifier over the three sentiment classes.
#
# The inputs are post-padded with 0, so `mask_zero=True` lets the Embedding
# emit a mask and the LSTM skip the pad steps. Without it the LSTM's final
# state is computed after running over the trailing zeros as if they were
# real words. The parameter count is unchanged.
model = models.Sequential([
    layers.Input(shape=(max_len,)),
    layers.Embedding(input_dim=vocab_size, output_dim=dim_embed, mask_zero=True),
    layers.LSTM(units=dim_lstm),
    layers.Dense(3, activation='softmax'),  # one unit per class id 0..2
])

In [11]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 52, 256)           25195264  
                                                                 
 lstm (LSTM)                 (None, 256)               525312    
                                                                 
 dense (Dense)               (None, 3)                 771       
                                                                 
Total params: 25,721,347
Trainable params: 25,721,347
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Targets are one-hot vectors, hence categorical cross-entropy; accuracy is
# reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [13]:
# Train with validation after every epoch.
# - `restore_best_weights=True`: when early stopping triggers, roll the model
#   back to the epoch with the lowest val_loss instead of keeping the weights
#   from the last (worse) epoch.
# - The returned History is kept so the per-epoch metrics can be inspected or
#   plotted later; previously it was only echoed as an object repr.
history = model.fit(
    x_tr,
    y_tr,
    validation_data=(x_vl, y_vl),
    epochs=5,
    batch_size=traning_batch,
    callbacks=[
        EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True),
    ],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f23a246ad90>

In [30]:
# Maps the model's output index to a human-readable label.
id2sentiment = {
    0: "negative",
    1: "neutral",
    2: "positive",
}


def predict_sentiment(text):
    """Return the predicted sentiment label for a single piece of text.

    The text is converted to word ids with the notebook-level ``tokenizer``,
    post-padded / post-truncated to ``max_len`` and passed to ``model``.

    Args:
        text: The sentence to classify.

    Returns:
        The ``id2sentiment`` entry ("negative", "neutral" or "positive") for
        the class with the highest predicted probability.
    """
    seqs = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seqs, maxlen=max_len, padding="post", truncating="post")
    # verbose=0 keeps a single-sentence prediction from printing a progress bar.
    probs = model.predict(padded, verbose=0)[0]
    # Plain int key; also avoids the old local name `id`, which shadowed the builtin.
    class_id = int(np.argmax(probs))
    return id2sentiment[class_id]

In [31]:
# Try the helper on a sample sentence.
predict_sentiment("why are you complaining modi busy helping and start new businesses has time and money left actually has little money left but needs for adverts about himself")



'positive'

In [32]:
	# Try the helper on a second sample sentence.
	# NOTE(review): this cell is indented with a leading tab; it presumably only
	# runs because IPython dedents the cell — consider removing the tab.
	predict_sentiment("for baby with constipation use betel leaf stems inserted into anus how about elders who suffer constipation because modi see the same treatment done here publicly  two videos")



'neutral'