# Importing Required Packages

In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Flatten


# Importing data and preprocessing

In [3]:
df=pd.read_csv("C:/Users/asus/Desktop/train.csv",encoding='ISO-8859-1')

In [4]:
df

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26
...,...,...,...,...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,night,31-45,Ghana,31072940,227540.0,137
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,morning,46-60,Greece,10423054,128900.0,81
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,noon,60-70,Grenada,112523,340.0,331
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,night,70-100,Guatemala,17915568,107160.0,167


In [5]:
df1=['text','sentiment']

In [6]:
df2=df[df1]
df2

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
...,...,...
27476,wish we could come see u on Denver husband l...,negative
27477,I`ve wondered about rake to. The client has ...,negative
27478,Yay good for both of you. Enjoy the break - y...,positive
27479,But it was worth it ****.,positive


In [7]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))#removes common word like 'and','the','is' etc.. 


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])  # Remove punctuation
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text


In [9]:
# Fill any missing values with an empty string using .loc
df2.loc[:, 'text'] = df2['text'].fillna('')

# Apply preprocessing using .loc
df2.loc[:, 'text'] = df2['text'].astype(str).apply(preprocess_text)

# Check the first few rows
print(df2.head())


                                       text sentiment
0                        id responded going   neutral
1                   sooo sad miss san diego  negative
2                             boss bullying  negative
3                     interview leave alone  negative
4  sons couldnt put releases already bought  negative


In [10]:
df2

Unnamed: 0,text,sentiment
0,id responded going,neutral
1,sooo sad miss san diego,negative
2,boss bullying,negative
3,interview leave alone,negative
4,sons couldnt put releases already bought,negative
...,...,...
27476,wish could come see u denver husband lost job ...,negative
27477,ive wondered rake client made clear net dont f...,negative
27478,yay good enjoy break probably need hectic week...,positive
27479,worth,positive


# Tokenizing and Padding

In [11]:
# Tokenization
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df2['text'])
sequences = tokenizer.texts_to_sequences(df2['text'])

# Padding
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

# Check the result
print(padded_sequences[:5])

[[ 197    1   11    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]
 [ 312   52   36 1344 2109    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]
 [1182    1    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]
 [ 998  245  390    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]
 [2640  286  224    1  119  448    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]]


# Encoding

In [12]:
from sklearn.preprocessing import LabelEncoder

# Label encoding
label_encoder = LabelEncoder()
df2['sentiment'] = label_encoder.fit_transform(df2['sentiment'])

# Check the encoded labels
print(df2['sentiment'].unique())



[1 0 2]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['sentiment'] = label_encoder.fit_transform(df2['sentiment'])


# Model Building

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

# Model parameters
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 16
input_length = padded_sequences.shape[1]

# Building the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(3, activation='softmax'))  # 3 output units for 3 classes

# Compiling the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()




# Train the Model


In [14]:
# Training the model
history = model.fit(padded_sequences, df2['sentiment'], epochs=10, validation_split=0.2, batch_size=32)


Epoch 1/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.5023 - loss: 0.9729 - val_accuracy: 0.6880 - val_loss: 0.7262
Epoch 2/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.7527 - loss: 0.6152 - val_accuracy: 0.6922 - val_loss: 0.7270
Epoch 3/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.8411 - loss: 0.4371 - val_accuracy: 0.6773 - val_loss: 0.8319
Epoch 4/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.9043 - loss: 0.2835 - val_accuracy: 0.6593 - val_loss: 1.0356
Epoch 5/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.9414 - loss: 0.1852 - val_accuracy: 0.6494 - val_loss: 1.2874
Epoch 6/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.9560 - loss: 0.1371 - val_accuracy: 0.6413 - val_loss: 1.5363
Epoch 7/10
[1m687/687[0m 

# Evaluate the Model


In [16]:
# Evaluate the model on the training data
loss, accuracy = model.evaluate(padded_sequences, df2['sentiment'], verbose=0)

# Print the accuracy
print(f"Training Accuracy: {accuracy * 100:.2f}%")


Training Accuracy: 91.07%


# Prediction

In [17]:
# Example text for prediction
new_text = ["I love this product!"]
new_seq = tokenizer.texts_to_sequences(new_text)
new_pad = pad_sequences(new_seq, maxlen=max_length, padding='post', truncating='post')

# Prediction
prediction = model.predict(new_pad)
predicted_label = label_encoder.inverse_transform([prediction.argmax(axis=1)[0]])
print(predicted_label)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 185ms/step
['positive']
