In [1]:
#make sure you have these versions of TensorFlow and Keras

In [2]:
import tensorflow as tf
print(tf.__version__)

2.15.0


In [3]:
import keras
keras.__version__

'2.15.0'

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
train=pd.read_csv('/content/drive/Shareddrives/ presentation/train.csv')
val=pd.read_csv('/content/drive/Shareddrives/ presentation/val.csv')
test=pd.read_csv('/content/drive/Shareddrives/ presentation/test.csv')





In [6]:
val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  999 non-null    object
 1   content    999 non-null    object
dtypes: object(2)
memory usage: 15.7+ KB


In [7]:
data = pd.concat([train ,  val , test])
data.columns = [ "sentiment","content"]

In [8]:
unique_sentiments = data['sentiment'].unique()
print(unique_sentiments)

['neutral' 'worry' 'surprise' 'hate' 'empty' 'happiness' 'sadness' 'fun'
 'enthusiasm' 'love' 'relief' 'boredom' 'anger']


In [9]:
sentiment_mapping = {
    'empty': 'negative',
    'sadness': 'negative',
    'worry': 'negative',
    'hate': 'negative',
    'boredom': 'negative',
    'anger': 'negative',
    'enthusiasm': 'positive',
    'neutral': 'neutral',
    'surprise': 'neutral',
    'love': 'positive',
    'fun': 'positive',
    'happiness': 'positive',
    'relief': 'positive'
}

# Create a new col for the mapping
data['label'] = data['sentiment'].map(sentiment_mapping)

In [10]:
# Drop the original column, 'sentiment'
data = data.drop(columns=['sentiment'])

In [11]:
unique_sentiments = data['label'].unique()
print(unique_sentiments)

['neutral' 'negative' 'positive']


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40000 entries, 0 to 1998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  40000 non-null  object
 1   label    40000 non-null  object
dtypes: object(2)
memory usage: 937.5+ KB


In [13]:
data.rename(columns={'content': 'text'}, inplace=True)
#data.rename(columns={'sentiment': 'label'}, inplace=True)

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40000 entries, 0 to 1998
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  object
dtypes: object(2)
memory usage: 937.5+ KB


In [15]:
data.shape

(40000, 2)

In [16]:
data.isna().any(axis=1).sum()

0

In [17]:
#text preprocessing
ps = PorterStemmer()

def preprocess(line):
    review = re.sub('[^a-zA-Z]', ' ', line) #leave only characters from a to z
    review = review.lower() #lower the text
    review = review.split() #turn string into list of words
    #apply Stemming
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')] #delete stop words like I, and ,OR   review = ' '.join(review)
    #trun list into sentences
    return " ".join(review)


In [18]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
data['text']=data['text'].apply(lambda x: preprocess(x))


In [20]:
data['label']
#data['text']

0        neutral
1        neutral
2       negative
3       negative
4        neutral
          ...   
1994    negative
1995    negative
1996     neutral
1997     neutral
1998    negative
Name: label, Length: 40000, dtype: object

In [21]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
data['N_label'] = label_encoder.fit_transform(data['label'])


In [22]:
data['text']

0       came hous morn stink cat poo cat busi littl bi...
1       bah bk recept comp aint work sun got hat hip h...
2       bbc malaria parasit becom resist drug http tru...
3       morn appear bit sore head perhap bag pork scra...
4                book appoint give blood pm nd june scare
                              ...                        
1994          think hire one tranlsat one ever understand
1995                     http twitpic com w see mile away
1996                              desper go venic beinnal
1997                                  sleep work tomorrow
1998               ouch epic bruis toe lump ankl bad time
Name: text, Length: 40000, dtype: object

In [23]:
# Creating the Bag of Words model by applying Countvectorizer -convert textual data to numerical data


from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000,ngram_range=(1,3))#example: the course was long-> [the,the course,the course was,course, course was, course was long,...]

data_cv = cv.fit_transform(data['text']).toarray()




In [24]:
data_cv

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [25]:
from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test=data_cv,test_cv,train['N_label'],test['N_label']
X_train, X_test, y_train, y_test =train_test_split(data_cv, data['N_label'], test_size=0.25, random_state=42)


In [27]:
# first neural network with keras tutorial
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import f1_score
import numpy as np
# load the dataset
# split into input (X) and output (y) variables
# define the keras model
model = Sequential()
model.add(Dense(12, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(3, activation='softmax'))
# compile the keras model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset
model.fit(X_train, y_train, epochs=10, batch_size=600)
# evaluate the keras model
_, accuracy = model.evaluate(X_train, y_train)
print('Accuracy: %.2f' % (accuracy*100))

y_pred_train = np.argmax(model.predict(X_train), axis=1)
f1_train = f1_score(y_train, y_pred_train, average='micro')
print('F1 Score on Training Set: %.4f' % f1_train)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 72.71
F1 Score on Training Set: 0.7271


In [28]:
_, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 55.52


In [29]:
import numpy as np
text='You are a good boy'
text=preprocess(text)
array = cv.transform([text]).toarray()
pred = model.predict(array)
a=np.argmax(pred, axis=1)
label_encoder.inverse_transform(a)[0]



'positive'

In [None]:
tf.keras.models.save_model(model,'my_model.h5')


In [31]:
import pickle
pickle.dump(label_encoder, open('encoder.pkl', 'wb'))
pickle.dump(cv, open('CountVectorizer.pkl', 'wb'))


In [32]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assuming X_test and y_test are your test dataset
# Evaluate the model on the test set
_, test_accuracy = model.evaluate(X_test, y_test)
print('Test Accuracy: %.2f' % (test_accuracy * 100))

# Make predictions on the test set
y_pred_test = np.argmax(model.predict(X_test), axis=1)

# Calculate accuracy, precision, recall, and F1 score
test_accuracy = accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test, average='micro')
test_recall = recall_score(y_test, y_pred_test, average='micro')
test_f1 = f1_score(y_test, y_pred_test, average='micro')

print('Test Accuracy: %.2f' % (test_accuracy * 100))
print('Test Precision: %.2f' % (test_precision* 100))
print('Test Recall: %.2f' % (test_recall* 100))
print('Test F1 Score: %.4f' % (test_f1* 100))


Test Accuracy: 55.52
Test Accuracy: 55.52
Test Precision: 55.52
Test Recall: 55.52
Test F1 Score: 55.5200
