In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the labelled training set (CSV in the working directory) into a DataFrame
TRAIN_CSV = 'train.csv'
train_data = pd.read_csv(TRAIN_CSV)

In [3]:
# Preview the first five rows of the training frame
train_data.head(n=5)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [4]:
# Column dtypes and non-null counts; in the recorded run `text` and
# `selected_text` each have one missing value (27480 of 27481 non-null).
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


In [5]:
# Read the test set (CSV in the working directory) into a DataFrame
TEST_CSV = 'test.csv'
test_data = pd.read_csv(TEST_CSV)

In [6]:
# Preview the first five rows of the test frame
test_data.head(n=5)

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


In [7]:
# Column dtypes and non-null counts; the recorded run shows no missing values
# in any of the three test columns.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3534 entries, 0 to 3533
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   textID     3534 non-null   object
 1   text       3534 non-null   object
 2   sentiment  3534 non-null   object
dtypes: object(3)
memory usage: 83.0+ KB


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [9]:
# Baseline: bag-of-words counts fed to a multinomial Naive Bayes classifier.

# Replace missing values with empty strings.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated in recent pandas releases and leaves the frame unchanged under
# Copy-on-Write, which would let NaN reach the vectorizer here and the
# tokenizer in later cells.
train_data['text'] = train_data['text'].fillna('')
test_data['text'] = test_data['text'].fillna('')

# Split the data into training and validation sets (80/20, fixed seed)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_data['text'], train_data['sentiment'], random_state=42, test_size=0.2
)

# Convert text data to numerical features using CountVectorizer.
# The vocabulary is learned from the training split only; the validation
# split is merely transformed with it.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)

# Train a Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train, train_labels)

# Make predictions on the validation set
predictions = classifier.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(val_labels, predictions)
print(f'Accuracy: {accuracy:.2f}')

# Print classification report (per-class precision / recall / F1)
print(classification_report(val_labels, predictions))

# Use the trained model to predict sentiments on the test data
X_test = vectorizer.transform(test_data['text'])
test_predictions = classifier.predict(X_test)

# Add predictions to the test_data DataFrame
test_data['predicted_sentiment'] = test_predictions

# Save the results to a new CSV file
test_data.to_csv('sentiment_predictions.csv', index=False)

Accuracy: 0.65
              precision    recall  f1-score   support

    negative       0.69      0.58      0.63      1562
     neutral       0.59      0.69      0.63      2230
    positive       0.72      0.67      0.69      1705

    accuracy                           0.65      5497
   macro avg       0.67      0.64      0.65      5497
weighted avg       0.66      0.65      0.65      5497



In [12]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
# Neural model: embedding -> average pooling -> 3-way softmax, trained with an
# 80/20 train/validation split, then used to label the test set.

# Seed the random number generators so repeated runs start from the same state
tf.keras.utils.set_random_seed(42)

# Tokenize and pad the text data
# NOTE(review): the vocabulary is built from every row of train_data['text'],
# including rows that land in the validation split below, so the validation
# score is slightly optimistic. Left as-is because later cells reuse this
# tokenizer for the model trained on the full data.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

# Convert text to sequences of integer word ids
train_sequences = tokenizer.texts_to_sequences(train_data['text'])
test_sequences = tokenizer.texts_to_sequences(test_data['text'])

# Pad (or truncate) sequences at the end to a fixed length
max_length = 128
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

# Define a simple sentiment analysis model
# input_dim is the vocabulary size + 1 because word ids start at 1 and 0 is the padding value
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(3, activation='softmax')
])

# Compile the model (sparse loss: labels are integer codes, not one-hot vectors)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Prepare labels: integer codes for training plus the code -> name lookup
# taken from the same categorical, so the two can never disagree
sentiment_categorical = train_data['sentiment'].astype('category')
sentiment_names = sentiment_categorical.cat.categories.to_numpy()
train_labels = sentiment_categorical.cat.codes

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_padded, train_labels, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train, epochs=3, batch_size=32, validation_data=(X_val, y_val))

# Make predictions on the test data (one row of class probabilities per text)
test_predictions = model.predict(test_padded)
test_labels_predicted = tf.argmax(test_predictions, axis=1)

# Map predicted labels back to sentiment categories.
# Previously the raw integer codes were stored, so the saved CSV contained
# 0/1/2 instead of the sentiment names.
predicted_sentiments = sentiment_names[test_labels_predicted.numpy()]

# Add predictions to the test_data DataFrame
test_data['predicted_sentiment'] = predicted_sentiments

# Save the results to a new CSV file
test_data.to_csv('test_sentiment_predictions.csv', index=False)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [14]:
# Fit the model on the complete training set (no validation hold-out) and save it.

# Encode every training text with the already-fitted tokenizer, then pad or
# truncate at the end so all rows share one fixed length
max_length = 128
train_sequences = tokenizer.texts_to_sequences(train_data['text'])
train_padded = pad_sequences(
    train_sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

# Integer class codes derived from the sentiment column
train_labels = train_data['sentiment'].astype('category').cat.codes

# Assemble the network one layer at a time: embedding -> average pooling -> softmax
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=128,
        input_length=max_length,
    )
)
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(3, activation='softmax'))

# Sparse loss because the labels are integer codes rather than one-hot vectors
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on all rows
model.fit(train_padded, train_labels, epochs=3, batch_size=32)

# Persist the trained model to disk
model.save('sentiment_analysis_model.h5')

Epoch 1/3
Epoch 2/3
Epoch 3/3


  saving_api.save_model(


In [15]:
# Reload the saved model from disk
MODEL_PATH = 'sentiment_analysis_model.h5'
loaded_model = tf.keras.models.load_model(MODEL_PATH)

In [16]:
# Class names in code order, derived from the same categorical conversion that
# produced the integer training labels, so index i always names code i.
sentiment_categories = list(train_data['sentiment'].astype('category').cat.categories)


def predict_sentiments(texts):
    """Predict a sentiment name for every text in ``texts``.

    The texts are encoded with the notebook-level ``tokenizer``, padded or
    truncated at the end to ``max_length`` tokens, scored by ``loaded_model``,
    and the highest-scoring class index of each row is translated into its
    name via ``sentiment_categories``. All four names are read from the
    notebook namespace at call time.

    Parameters
    ----------
    texts : iterable of str
        Raw texts to classify.

    Returns
    -------
    list of str
        One category name per input text, in input order.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    class_scores = loaded_model.predict(padded)
    class_indices = tf.argmax(class_scores, axis=1).numpy()
    return [sentiment_categories[index] for index in class_indices]


# Use the loaded model to predict sentiments for train data
train_predicted_sentiments = predict_sentiments(train_data['text'])

# Add predicted sentiments to the train_data DataFrame
train_data['predicted_sentiment'] = train_predicted_sentiments

# Display the train_data DataFrame with predicted sentiments
print(train_data[['text', 'predicted_sentiment']])

# Use the loaded model to predict sentiments for test data
test_predicted_sentiments = predict_sentiments(test_data['text'])

# Add predicted sentiments to the test_data DataFrame
test_data['predicted_sentiment'] = test_predicted_sentiments

# Display the test_data DataFrame with predicted sentiments
print(test_data[['text', 'predicted_sentiment']])

                                                    text predicted_sentiment
0                    I`d have responded, if I were going             neutral
1          Sooo SAD I will miss you here in San Diego!!!            negative
2                              my boss is bullying me...            negative
3                         what interview! leave me alone             neutral
4       Sons of ****, why couldn`t they put them on t...             neutral
...                                                  ...                 ...
27476   wish we could come see u on Denver  husband l...             neutral
27477   I`ve wondered about rake to.  The client has ...             neutral
27478   Yay good for both of you. Enjoy the break - y...            positive
27479                         But it was worth it  ****.             neutral
27480     All this flirting going on - The ATG smiles...            positive

[27481 rows x 2 columns]
                                                  

In [17]:
# Ad-hoc check of the loaded model on two hand-written sentences
new_text_data = ["I love to eat", "This is disappointing."]

# Encode with the already-fitted tokenizer, then pad or truncate at the end to max_length
new_padded = pad_sequences(
    tokenizer.texts_to_sequences(new_text_data),
    maxlen=max_length,
    padding='post',
    truncating='post',
)

# Score the texts and keep the index of the highest-scoring class per row
predictions = loaded_model.predict(new_padded)
predicted_labels = tf.argmax(predictions, axis=1).numpy()

# Translate class indices into names.
# NOTE(review): this order is assumed to match the integer codes the model was
# trained on (category codes of train_data['sentiment']) — confirm.
sentiment_categories = ["negative", "neutral", "positive"]
predicted_sentiments = []
for label in predicted_labels:
    predicted_sentiments.append(sentiment_categories[label])

# Report each input next to its predicted sentiment
for position, text in enumerate(new_text_data):
    sentiment = predicted_sentiments[position]
    print(f"Text: {text}\nPredicted Sentiment: {sentiment}\n")

Text: I love to eat
Predicted Sentiment: positive

Text: This is disappointing.
Predicted Sentiment: negative

