In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Dropout, Dense, Input, concatenate, GRU
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split

In [2]:
# Load data
data = pd.read_csv("/content/pre_processed_final.csv")
print(data.shape)

(28777, 3)


In [3]:
data = data[["sentiment", "text"]]
df = data.dropna()
df.head()

Unnamed: 0,sentiment,text
0,1.0,is lookin 4ward to a long weekend really dont...
1,1.0,myweakness is music and i live to meet the pe...
2,1.0,figured out the internet on my new ipod
3,1.0,cant wait to worship with you guys tonight itl...
4,1.0,congrats james im sure the book is going to b...


In [4]:
# Preprocessing
df['sentiment'] = df['sentiment'].replace({1: 0, 0: 1, -1: 2})
sentences = df['text'].tolist()
labels = df['sentiment'].tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = df['sentiment'].replace({1: 0, 0: 1, -1: 2})


In [5]:
# Tokenization
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 30652 unique tokens.


In [6]:
# Convert text data to sequence
X = tokenizer.texts_to_sequences(sentences)
X = pad_sequences(X, padding='post', maxlen=100)

In [7]:
# Convert labels to categorical format
y = tf.keras.utils.to_categorical(labels, num_classes=3)

In [8]:
# Split into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Define model architecture
inputs = Input(shape=(100,))
embedding = Embedding(input_dim=5000, output_dim=100, input_length=100)(inputs)
conv1 = Conv1D(filters=128, kernel_size=3, padding='valid', activation='relu', strides=1)(embedding)
pool1 = MaxPooling1D(pool_size=2)(conv1)
conv2 = Conv1D(filters=64, kernel_size=3, padding='valid', activation='relu', strides=1)(pool1)
pool2 = MaxPooling1D(pool_size=2)(conv2)
gru = GRU(64)(pool2)
dense1 = Dense(64, activation='relu')(gru)
dropout = Dropout(0.5)(dense1)
outputs = Dense(3, activation='softmax')(dropout)

In [10]:
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 100)          500000    
                                                                 
 conv1d (Conv1D)             (None, 98, 128)           38528     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 49, 128)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 47, 64)            24640     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 23, 64)           0         
 1D)                                                         

In [11]:
# Train model
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fdcb936fc70>

In [13]:
from sklearn.metrics import classification_report

# Predict on test data
predictions = model.predict(X_test)
predicted_labels = np.argmax(predictions, axis=1)
print(predicted_labels)
print(classification_report(y_test.argmax(axis=1), predicted_labels))

[0 0 1 ... 0 0 1]
              precision    recall  f1-score   support

           0       0.66      0.72      0.69      2049
           1       0.60      0.61      0.61      1827
           2       0.74      0.65      0.69      1804

    accuracy                           0.66      5680
   macro avg       0.67      0.66      0.66      5680
weighted avg       0.67      0.66      0.66      5680



In [15]:
import pickle

# Save the model using pickle
with open('cnn.pkl', 'wb') as file:
    pickle.dump(model, file)