In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Load the sentence/label CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../../data/100_sentiment_analysis_sentences.csv')

In [5]:
# Summarise the loaded frame: columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    100 non-null    object
 1   label   100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB


In [6]:
# Encode the string sentiment labels as the integer class ids expected by
# `sparse_categorical_crossentropy`.
label_to_id = {'NEGATIVE': 0, 'NEUTRAL': 1, 'POSITIVE': 2}

# Assign the result back instead of calling `replace(..., inplace=True)` on the
# column selection: that form acts on an intermediate object, emits a
# FutureWarning in pandas 2.x and no longer updates `df` under Copy-on-Write.
# Re-running this cell is safe because already-encoded integers match no key.
# The int64 cast makes an unexpected label value fail here rather than during training.
df['label'] = df['label'].replace(label_to_id).astype('int64')

In [7]:
# Unigram + bigram TF-IDF with sublinear term-frequency scaling. English stop
# words are removed and terms seen in fewer than 5 documents are dropped.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True,
)

# Learn the vocabulary and turn every sentence into a sparse TF-IDF row vector.
features = tfidf.fit_transform(df['text'])
labels = df['label']

# Report (number of rows, number of TF-IDF features).
n_rows, n_features = features.shape
print("Each of the %d label is represented by %d features (TF-IDF score of unigrams and bigrams)" % (n_rows, n_features))

Each of the 100 label is represented by 61 features (TF-IDF score of unigrams and bigrams)


In [8]:
# Dense frame with one column per TF-IDF term, aligned on the original row index,
# plus the encoded label as the last column.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is the supported replacement.
df_tfidf = pd.DataFrame(
    features.toarray(),
    columns=tfidf.get_feature_names_out(),
    index=df.index,
)
df_tfidf['label'] = df['label']

# Column overview of the assembled frame.
df_tfidf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 62 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   2030               100 non-null    float64
 1   2035               100 non-null    float64
 2   access             100 non-null    float64
 3   access reuters     100 non-null    float64
 4   agency             100 non-null    float64
 5   battery            100 non-null    float64
 6   black              100 non-null    float64
 7   brand              100 non-null    float64
 8   california         100 non-null    float64
 9   car                100 non-null    float64
 10  carb               100 non-null    float64
 11  com                100 non-null    float64
 12  com register       100 non-null    float64
 13  company            100 non-null    float64
 14  department         100 non-null    float64
 15  did                100 non-null    float64
 16  electric           100 non-

In [9]:
# Shape of the TF-IDF matrix: (number of sentences, number of features).
features.shape

(100, 61)

In [10]:
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Bidirectional,Dropout,LSTM

In [11]:
# Small feed-forward classifier: TF-IDF vector -> 12 -> 8 -> 3-way softmax.
model = Sequential([
    Dense(12, activation='relu', input_shape=(features.shape[1],)),
    Dense(8, activation='relu'),
    Dense(3, activation='softmax'),
])

# Targets are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


In [12]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 12)                744       
                                                                 
 dense_1 (Dense)             (None, 8)                 104       
                                                                 
 dense_2 (Dense)             (None, 3)                 27        
                                                                 
Total params: 875
Trainable params: 875
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Check the container type returned by the vectorizer; later cells call
# .toarray() on it before handing the data to Keras.
type(features)

scipy.sparse._csr.csr_matrix

In [14]:
# Densify the sparse TF-IDF matrix once and reuse it for both fit and evaluate.
X_dense = features.toarray()

# Train for 10 epochs in mini-batches of 10 rows.
model.fit(X_dense, labels, batch_size=10, epochs=10)

# NOTE(review): evaluation reuses the rows the model was trained on, so the
# figure printed below is training accuracy, not a held-out estimate.
_, accuracy = model.evaluate(X_dense, labels)
print('Accuracy: %.2f' % (accuracy*100))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 59.00


In [15]:
# Softmax class probabilities predicted for the first row of the training matrix.
model.predict(features[0].toarray())



array([[0.3192676 , 0.30794823, 0.37278417]], dtype=float32)

In [16]:
# Dense view of the first row's TF-IDF vector (the input to the prediction above).
features[0].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.61294011, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70746152, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.35185596, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

In [17]:
import pickle

# Persist the trained network under ../models.
model.save("../models/tfidf_custom_dl_keras")

# Persist the fitted vectorizer next to the model so inference reuses the same
# vocabulary. The `with` block closes the handle, which guarantees the pickle is
# fully flushed to disk before the next cell reads it back.
with open("../models/tfidf_vectorizer_custom_dl_keras.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

INFO:tensorflow:Assets written to: ../models/tfidf_custom_dl_keras\assets


In [18]:
import pickle

# Load through the same Keras distribution that built and saved the model
# (tensorflow.keras) rather than the standalone `keras` package.
from tensorflow import keras

saved_model = keras.models.load_model("../models/tfidf_custom_dl_keras")

# SECURITY: unpickling can execute arbitrary code. Only load pickle files that
# this notebook (or another trusted source) produced.
with open('../models/tfidf_vectorizer_custom_dl_keras.pkl', 'rb') as vectorizer_file:
    saved_tfidf = pickle.load(vectorizer_file)

In [19]:
# Inference code: vectorise raw text with the reloaded vectorizer, then classify
# it with the reloaded model.
test_input = saved_tfidf.transform(["test_input"]).toarray()

# One row of class probabilities per input text.
new_prediction = saved_model.predict(test_input)
print(new_prediction)

# Take the argmax along the class axis. A bare `argmax()` indexes into the
# flattened (n_inputs, n_classes) array and is only correct for a single input.
# Class ids follow the label encoding: 0 = NEGATIVE, 1 = NEUTRAL, 2 = POSITIVE.
predicted_class_id = new_prediction.argmax(axis=1)[0].item()
print(predicted_class_id)

[[0.32634345 0.33264267 0.34101385]]
2
