In [10]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import emoji

# Download stopwords
nltk.download('stopwords')

# Sample data
tweets = [
    "I love this movie! #awesome 😊",
    "This is a terrible movie... :( http://example.com",
    "Just watched the new movie @user 🎥"
]

# Function to convert emojis to text
def convert_emojis(text):
    return emoji.demojize(text)

# Preprocessing function
def preprocess_tweet(tweet):
    # Convert emojis to text
    tweet = convert_emojis(tweet)
    # Remove URLs
    tweet = re.sub(r"http\S+", "", tweet)
    # Remove mentions
    tweet = re.sub(r"@\S+", "", tweet)
    # Remove hashtags
    tweet = re.sub(r"#\S+", "", tweet)
    # Remove special characters and numbers
    tweet = re.sub(r"[^A-Za-z\s]", "", tweet)
    # Convert to lowercase
    tweet = tweet.lower()
    # Tokenize and remove stopwords
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(tweet)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return " ".join(tokens)

# Preprocess tweets
cleaned_tweets = [preprocess_tweet(tweet) for tweet in tweets]

# Tokenization
vocab_size = 5000
max_length = 50
embedding_dim = 100
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(cleaned_tweets)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(cleaned_tweets)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Display preprocessed tweets
print("Cleaned Tweets:", cleaned_tweets)
print("Padded Sequences:", padded_sequences)

Cleaned Tweets: ['love movie smilingfacewithsmilingeyes', 'terrible movie', 'watched new movie moviecamera']
Padded Sequences: [[3 2 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [5 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [6 7 2 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shrikantvarma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import emoji
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

# Download stopwords
nltk.download('stopwords')

# Load the dataset
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Select necessary columns
df = df[['text', 'target']]

# Convert target labels to binary (0 = negative, 4 = positive)
df['target'] = df['target'].replace(4, 1)

# Function to convert emojis to text
def convert_emojis(text):
    return emoji.demojize(text)

# Preprocessing function
def preprocess_tweet(tweet):
    # Convert emojis to text
    tweet = convert_emojis(tweet)
    # Remove URLs
    tweet = re.sub(r"http\S+", "", tweet)
    # Remove mentions
    tweet = re.sub(r"@\S+", "", tweet)
    # Remove hashtags
    tweet = re.sub(r"#\S+", "", tweet)
    # Remove special characters and numbers
    tweet = re.sub(r"[^A-Za-z\s:]", "", tweet)
    # Convert to lowercase
    tweet = tweet.lower()
    # Tokenize and remove stopwords
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(tweet)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return " ".join(tokens)

# Preprocess tweets
df['cleaned_text'] = df['text'].apply(preprocess_tweet)

# Tokenization
vocab_size = 10000
max_length = 100
embedding_dim = 100
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(df['cleaned_text'])
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(df['cleaned_text'])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['target'], test_size=0.2, random_state=42)

# Model
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    Bidirectional(LSTM(32, dropout=0.2, recurrent_dropout=0.2)),
    Dense(24, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Summary
model.summary()

# Train the model
history = model.fit(X_train, y_train, epochs=3, validation_data=(X_test, y_test), batch_size=128, verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=2)
print(f'Accuracy: {accuracy}')

# Predict on new data
sample_tweets = ["I love this product!", "This is the worst service ever.", "The new functionality to search is pretty cool"]
sample_sequences = tokenizer.texts_to_sequences([preprocess_tweet(tweet) for tweet in sample_tweets])
sample_padded = pad_sequences(sample_sequences, maxlen=max_length, padding='post')
predictions = model.predict(sample_padded)

print("Predictions:", predictions)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shrikantvarma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          1000000   
                                                                 
 bidirectional (Bidirectiona  (None, 100, 128)         84480     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 24)                1560      
                                                                 
 dropout (Dropout)           (None, 24)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 2

2024-05-27 14:13:04.683854: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/3
Epoch 3/3
10000/10000 - 265s - loss: 0.4340 - accuracy: 0.7972 - 265s/epoch - 26ms/step
Accuracy: 0.7972375154495239
Predictions: [[0.97376347]
 [0.02353369]]


In [14]:
model.save('sentiment_analysis_model.h5')

In [16]:
# Predict on new data
sample_tweets = ["I love this product!", "This is the worst service ever.", "The new functionality to search is pretty cool","I hate it"]
sample_sequences = tokenizer.texts_to_sequences([preprocess_tweet(tweet) for tweet in sample_tweets])
sample_padded = pad_sequences(sample_sequences, maxlen=max_length, padding='post')
predictions = model.predict(sample_padded)

print("Predictions:", predictions)

Predictions: [[0.97376347]
 [0.02353368]
 [0.9787137 ]
 [0.10517763]]


In [17]:
sample_tweets = ["The movie was very touching and heart whelming", 
            "I have never seen an awesome movie like this", 
            "the movie plot was great but it had terrible acting",
           "I thought the movie would be boring",
           "I thought the movie would be boring, but it was surprisingly good!",
               "The movie was an absolute masterpiece, with stunning visuals and a compelling storyline.",
    "I loved every minute of the film; the acting was top-notch, and the soundtrack was beautiful.",
    "The plot was incredibly dull, and the characters were poorly developed. A complete waste of time.",
    "I found the movie to be a predictable and uninspired rehash of better films.",
    "The movie had an interesting concept, but the execution fell flat, and the pacing was all over the place.",
    "While the cinematography was breathtaking, the plot was convoluted and hard to follow.",
    "This film exceeded all my expectations with its brilliant script and stellar performances. A must-watch!",
    "An emotionally gripping story that kept me on the edge of my seat. Highly recommend it.",
    "The special effects were outdated, and the dialogue was cringeworthy. I couldn't wait for it to end.",
    "Despite a few decent scenes, the overall experience was boring and forgettable.",
    "The first half of the movie was engaging, but it lost momentum in the second half and ended on a weak note.",
    "I appreciated the artistic direction, but the acting was subpar, and the story lacked coherence."
]
sample_sequences = tokenizer.texts_to_sequences([preprocess_tweet(tweet) for tweet in sample_tweets])
sample_padded = pad_sequences(sample_sequences, maxlen=max_length, padding='post')
predictions = model.predict(sample_padded)

print("Predictions:", predictions)

Predictions: [[0.62145734]
 [0.83895487]
 [0.0557374 ]
 [0.4164418 ]
 [0.78586394]
 [0.90520096]
 [0.90960026]
 [0.03631585]
 [0.8024434 ]
 [0.35514972]
 [0.5321297 ]
 [0.72439975]
 [0.917778  ]
 [0.5559894 ]
 [0.49044085]
 [0.22875585]
 [0.7758942 ]]
