## Install the packages

In [None]:
# Install third-party dependencies for this notebook.
# `%pip` (rather than `!pip`) installs into the environment of the running
# kernel, so the packages are importable by the cells that follow.
%pip install tensorflow
%pip install keras
%pip install Keras-Preprocessing
%pip install wordcloud


## Import packages and load data.

In [None]:
# Import necessary libraries
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sb
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.layers import Dense

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation / FutureWarning
# messages raised by the libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Load CSV data
# Reads 'vaccination_tweets.csv' relative to the current working directory
# into `df`, the DataFrame used by the cells that follow.
df = pd.read_csv('vaccination_tweets.csv')
# Last expression of the cell: display the first rows as a quick sanity check.
df.head()


## Perform exploratory data analysis (EDA).

In [None]:
# Summary statistics produced by DataFrame.describe(), printed under a heading.
print("Basic Statistics:", df.describe(), sep="\n")

In [None]:
# Example: Calculate sum of values for each numeric column
# numeric_only=True restricts the reduction to numeric/boolean columns.
# Without it, pandas >= 2.0 also tries to "sum" the text columns, which
# either concatenates every string or fails on columns mixing str and NaN.
print("Sum of Values:")
print(df.sum(numeric_only=True))

In [None]:
# Example: Calculate average of values for each numeric column
# numeric_only=True is required: since pandas 2.0 the default no longer drops
# non-numeric columns, so a bare df.mean() raises TypeError on text columns.
print("Average of Values:")
print(df.mean(numeric_only=True))

In [None]:
# Example: Calculate maximum value for each numeric column
# numeric_only=True avoids comparing strings with NaN in the text columns
# (nulls have not been filled yet at this point), which raises TypeError
# in pandas >= 2.0.
print("Maximum Value:")
print(df.max(numeric_only=True))


In [None]:
# Example: Calculate minimum value for each numeric column
# numeric_only=True avoids comparing strings with NaN in the text columns
# (nulls have not been filled yet at this point), which raises TypeError
# in pandas >= 2.0.
print("Minimum Value:")
print(df.min(numeric_only=True))

In [None]:
# Per-column count of non-null entries, printed under a heading.
print("Count of Non-null Values:", df.count(), sep="\n")

## Data Preprocessing

In [None]:
# Number of missing (NaN/None) entries in each column
print(df.isna().sum())


In [None]:
# Replace null values in 'user_location', 'user_description', 'hashtags', and 'source' columns
# The result is assigned back to `df` rather than calling
# df[col].fillna(..., inplace=True): that form operates on an intermediate
# Series and, under pandas Copy-on-Write, leaves `df` unchanged.
text_columns = ['user_location', 'user_description', 'hashtags', 'source']
df[text_columns] = df[text_columns].fillna('')

In [None]:
# Re-count missing entries per column after the fill step
print(df.isna().sum())

In [None]:
# Convert text to lowercase
df['text'] = df['text'].str.lower()
# Remove punctuation (every character that is neither a word character nor whitespace).
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, so the original call removed nothing.
# The raw string keeps \w and \s from being parsed as (invalid) string escapes.
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# Bar chart of the 30 most frequent values in the 'source' column
plt.figure(figsize=(20, 10))
df['source'].value_counts().nlargest(30).plot.bar()
# Tilt the tick labels so long source names stay readable
plt.xticks(rotation=45)
plt.show()

In [None]:
# Pairwise correlations between the follower / friend / favourite / retweet counts
heatmap_data = df.loc[:, ['user_followers', 'user_friends', 'user_favourites', 'retweets', 'favorites']]
heatmap = sb.heatmap(heatmap_data.corr(), cmap='coolwarm', annot=True)

# Title and axis labels, set on the axes returned by seaborn
heatmap.set_title('Correlation Heatmap')
heatmap.set_xlabel('User Metrics')
heatmap.set_ylabel('User Metrics')

# Render the figure
plt.show()


In [None]:


# Extract hashtags from tweets: one row per hashtag.
# - fillna('') keeps this cell working even if nulls were not filled earlier.
# - strip() removes whitespace and any list-style brackets/quotes left around
#   each piece after splitting on ',', so equal tags are counted together.
# - empty strings (tweets without hashtags) are dropped; otherwise '' would be
#   counted as if it were a hashtag.
hashtags = (
    df['hashtags']
    .fillna('')
    .str.split(',')
    .explode()
    .reset_index(drop=True)
    .str.strip(" []'\"")
)
hashtags = hashtags[hashtags != '']

# Count frequency of hashtags
hashtag_counts = hashtags.value_counts().head(10)  # Change 'head(10)' to desired number of top hashtags

# Create pie chart
plt.figure(figsize=(10, 5))
hashtag_counts.plot(kind='pie', autopct='%1.1f%%', startangle=90, explode=[0.1]*len(hashtag_counts))  # Use 'kind='pie'' for pie chart
plt.title('Top 10 Hashtags')
plt.axis('equal')  # Equal aspect ratio to make the pie chart a circle
plt.legend(hashtag_counts.index, loc='upper right')  # Show legend with hashtag labels
plt.show()


In [None]:
from wordcloud import WordCloud

# Build one space-separated string from the 'hashtags' column.
# The frame is named `df` in this notebook; the original `data` was never
# defined and raised NameError. dropna()/astype(str) keep str.join from
# failing on missing or non-string entries.
hashtags = ' '.join(df['hashtags'].dropna().astype(str))

# Create a WordCloud object
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(hashtags)

# Create a figure and axis
fig, ax = plt.subplots(figsize=(10, 5))

# Plot the word cloud
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title('Most Common Hashtags in COVID Vaccine Discussions')
ax.axis('off')

# Show the word cloud
plt.show()

## VADER sentiment analyzer

In [None]:
# Load the NLTK VADER sentiment analyzer
# Fetch the 'vader_lexicon' resource before constructing the analyzer.
nltk.download('vader_lexicon')
# Shared analyzer instance; get_sentiment_score reads it as a global.
sia = SentimentIntensityAnalyzer()


# Define functions for sentiment scoring
def get_sentiment_score(text):
    """Return the 'compound' polarity score VADER assigns to ``text``.

    Relies on the module-level ``sia`` analyzer being created beforehand.
    """
    return sia.polarity_scores(text)['compound']

def get_sentiment_label(score):
    """Translate a compound score into a sentiment class name.

    Returns 'positive' for scores of at least 0.05, 'negative' for scores of
    at most -0.05, and 'neutral' for everything else.
    """
    if score >= 0.05:
        return 'positive'
    return 'negative' if score <= -0.05 else 'neutral'

# Calculate sentiment scores and labels
df['sentiment_score'] = df['text'].apply(get_sentiment_score)
df['sentiment_label'] = df['sentiment_score'].apply(get_sentiment_label)

# Bar chart of how many tweets fall into each sentiment class
sentiment_counts = df['sentiment_label'].value_counts()
sentiment_counts.plot(kind='bar', rot=0)
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment Label')
plt.ylabel('Count')
plt.show()

# Data interpretation: share of all tweets in each class.
# value_counts() omits labels that never occur, so .get(label, 0) is used
# instead of [] to report 0% rather than raise KeyError for an absent class.
positive_percentage = (sentiment_counts.get('positive', 0) / df.shape[0]) * 100
negative_percentage = (sentiment_counts.get('negative', 0) / df.shape[0]) * 100
neutral_percentage = (sentiment_counts.get('neutral', 0) / df.shape[0]) * 100

print(f'Positive percentage: {positive_percentage:.2f}%')
print(f'Negative percentage: {negative_percentage:.2f}%')
print(f'Neutral percentage: {neutral_percentage:.2f}%')



## Keras Model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
# Tokenizer is imported from the standalone Keras-Preprocessing package (the
# same one that provides pad_sequences below): `keras.preprocessing.text` is
# not available in Keras 3, so the original import fails on current installs.
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

# Load data from CSV
# This re-reads the raw file and rebinds `df`, so the cleaning and sentiment
# columns added by earlier cells are not present from here on.
df = pd.read_csv('vaccination_tweets.csv')

# Split data into training and testing sets (80/20, fixed seed for a repeatable split)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Preprocess text data
# The vocabulary is fitted on the training split only; the test split is
# encoded with that same vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['text'])
train_sequences = tokenizer.texts_to_sequences(train_df['text'])
test_sequences = tokenizer.texts_to_sequences(test_df['text'])
# Pad both splits to the longest training sequence
max_len = max(len(seq) for seq in train_sequences)
train_data = pad_sequences(train_sequences, maxlen=max_len)
test_data = pad_sequences(test_sequences, maxlen=max_len)

# Convert target variable to numerical values
# assign() builds new frames instead of writing into the objects returned by
# train_test_split, which avoids pandas' SettingWithCopy behaviour.
# NOTE(review): map() yields NaN for any value other than True/False —
# confirm 'is_retweet' is strictly boolean and contains both classes.
label_map = {False: 0, True: 1}
train_df = train_df.assign(is_retweet=train_df['is_retweet'].map(label_map))
test_df = test_df.assign(is_retweet=test_df['is_retweet'].map(label_map))

# Define Keras model: embedding -> LSTM -> single sigmoid unit for binary output
model = Sequential()
# input_dim is vocabulary size + 1 because index 0 is used for padding
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_len))
model.add(LSTM(units=64))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(train_data, train_df['is_retweet'], epochs=10, batch_size=32)

# Evaluate the model on the held-out split
loss, accuracy = model.evaluate(test_data, test_df['is_retweet'])
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')
