In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _subdirs, files in os.walk('/kaggle/input'):
    # Print the full path of every file found under the input directory.
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import string 
string.punctuation
import nltk
from nltk.corpus import stopwords
import re
import unicodedata

In [None]:
# Load NLTK's English stop-word list as a set for fast membership tests.
# If the corpus is not installed locally, NLTK raises LookupError; in that
# case download it once and retry.

try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# **Loading Data**

In [None]:
# Read the tweets CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="/kaggle/input/joe-biden-tweets/JoeBidenTweets.csv")
data.head(n=5)

# **Visualization**

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Join tweets with a space so the last word of one tweet does not fuse with
# the first word of the next one. Missing values are dropped first because
# str.join only accepts strings.
sentences = data['tweet'].dropna().astype(str).tolist()
joined_sentences = ' '.join(sentences)

plt.figure(figsize = (15,15))
plt.imshow(WordCloud(colormap='Dark2').generate(joined_sentences))
plt.axis('off')  # pixel coordinates carry no meaning for a word cloud
plt.show()

# **Data Pre-Processing**

In [None]:
#Dropping irrelevant columns from the data.
# errors='ignore' keeps this cell safe to re-run: on a second execution the
# columns are already gone and drop() would otherwise raise KeyError.
data = data.drop(
    columns=['id', 'url', 'timestamp', 'replies', 'retweets', 'likes', 'quotes'],
    errors='ignore',
)

In [None]:
#Function to perform data pre-processing 

# Lazily-filled cache of NLTK's English stop words, so the corpus is read once
# instead of once per word.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocessing(text, stop_words=None):
    """Clean one tweet and return its remaining words as a list.

    Steps, in order: remove URLs, e-mail addresses and the stand-alone
    retweet marker "RT"; lowercase; drop non-ASCII characters (emoji etc.);
    drop ASCII punctuation and digits; split on whitespace; drop stop words.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. NaN) yields an empty list.
    stop_words : collection of str, optional
        Words to discard. Defaults to NLTK's English stop-word list.

    Returns
    -------
    list of str
        The cleaned, lower-cased tokens in their original order.
    """
    if not isinstance(text, str):
        return []
    if stop_words is None:
        stop_words = _english_stopwords()

    # URLs, e-mail addresses and the retweet marker must be removed BEFORE
    # lowercasing and punctuation stripping: afterwards "://", "@", "." and the
    # upper-case "RT" no longer exist, so these patterns could never match.
    cleaned = re.sub(r'(?:https?://|www\.)\S+', ' ', text)
    cleaned = re.sub(r'[a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+', ' ', cleaned)
    # Word boundaries keep "RT" inside other words (e.g. "SMART") intact.
    cleaned = re.sub(r'\bRT\b', ' ', cleaned)

    cleaned = cleaned.lower()
    cleaned = cleaned.encode('ascii', 'ignore').decode('ascii')
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))
    cleaned = re.sub(r'[0-9]', '', cleaned)

    return [word for word in cleaned.split() if word not in stop_words]

In [None]:
# preprocessing() returns a list of tokens; join them back into a single
# space-separated string. Casting the list with astype(str) would store its
# repr instead ("['joe', 'biden']"), leaving quotes and brackets in the text
# that the sentiment scorer and tokenizer read later.
data['processed_tweet'] = data['tweet'].apply(preprocessing).apply(' '.join)

In [None]:
#Assigning polarity scores using TextBlob 

from textblob import TextBlob


def _polarity_of(tweet_text):
    """Return TextBlob's sentiment polarity for one processed tweet string."""
    return TextBlob(tweet_text).sentiment.polarity


data['polarity'] = data['processed_tweet'].apply(_polarity_of)

In [None]:
#Label each tweet from its polarity score: '1' (positive) when the score is
#above zero, '0' (negative) when it is zero or below; rows matching neither
#test receive 'no_label'.

scores = data['polarity']
choiceList = ['1', '0']
conditionList = [scores > 0, scores <= 0]
data['target'] = np.select(condlist=conditionList, choicelist=choiceList, default='no_label')

In [None]:
# Preview the first rows, now including the processed_tweet, polarity and target columns.
data.head()

In [None]:
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout, Activation, Bidirectional, SimpleRNN
from sklearn.model_selection import train_test_split

# **Tokenizing**

In [None]:
#Fit a Keras Tokenizer on the processed tweets and convert every tweet into a
#sequence of word ids. Shorter tweets are then padded at the end ('post') so
#every row has a fixed length of 22; longer ones are cut by pad_sequences'
#default truncation rule.

tweet_texts = data['processed_tweet'].values

tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweet_texts)
word_index = tokenizer.word_index

sentence = tokenizer.texts_to_sequences(tweet_texts)
padding = pad_sequences(sentence, maxlen=22, padding='post')

In [None]:
#Vocabulary size: one entry per word known to the tokenizer, plus one extra
#slot (the embedding matrix built later is indexed directly by these word ids).

vocab_size = 1 + len(word_index)

In [None]:
#Features are the padded id sequences; labels are the one-hot encoded target
#column. 22% of the rows are held out for testing, with a fixed seed so the
#split is repeatable.

X = padding
y = pd.get_dummies(data['target']).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.22,
)

# **GloVe Word Embedding**

In [None]:
#Initializing GloVe word embedding 

# Maps each token in the GloVe file to its float32 vector.
embeddings_index = dict()
# The context manager closes the file even if parsing fails part-way, and the
# explicit encoding keeps non-ASCII tokens readable whatever the platform default is.
with open('../input/glove-twitter/glove.twitter.27B.200d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

In [None]:
# Weight matrix for the embedding layer: row k holds the GloVe vector of the
# word whose tokenizer id is k. Words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 200))
for token, row in tokenizer.word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

# **Deep Learning / Neural Network Model**

In [None]:
embed_dim = 200
lstm_out = 128

# Two stacked LSTM layers on top of frozen, pre-computed embeddings, followed
# by dropout and a 2-way softmax output.
model = Sequential()
model.add(Embedding(vocab_size, embed_dim,input_length = X.shape[1], weights=[embedding_matrix],trainable=False))
# return_sequences=True hands the full sequence to the second LSTM.
model.add(LSTM(lstm_out, return_sequences=True))
model.add(LSTM(lstm_out))
model.add(Dropout(0.3))
model.add(Dense(2))
model.add(Activation('softmax'))
# The targets are one-hot vectors and the output is a softmax over classes,
# so categorical cross-entropy is the matching loss.
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line.
model.summary()

In [None]:
# Train for 10 epochs; the held-out split is passed as validation data so its
# metrics are reported alongside the training metrics.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    verbose=1,
)

# **Logistic Regression**

In [None]:
#Train Test split for Logistic Regression
#Uses the raw target column (not one-hot) as labels. Note that this reassigns
#X, y and the four split variables used by the neural-network cells above.

X, y = padding, data['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.22,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on the padded id sequences.
lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)

In [None]:
# Report mean accuracy on the training rows and on the held-out rows.
for caption, (features, labels) in (
    ("LR Score of Training Set", (X_train, y_train)),
    ("LR Score of Test Set", (X_test, y_test)),
):
    print(caption, lr.score(features, labels))

In [None]:
# Labels the logistic regression assigns to the test rows, and the true labels
# they are compared against.
predicted = lr.predict(X_test)
expected = y_test

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2. Guard the import so
# this cell still runs on newer releases; the name is not used below.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None
from sklearn import metrics

# Rows and columns are ordered positive ('1') first, then negative ('0').
cm = metrics.confusion_matrix(expected, predicted, labels = ['1','0'])
print(cm)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Draw the confusion matrix; the tick labels follow the same positive-first
# order that was used when cm was computed.
disp = ConfusionMatrixDisplay(cm, display_labels=[1, 0]).plot()

plt.show()

In [None]:
from sklearn import metrics

# Text report of the per-class metrics for the logistic regression predictions.
report = metrics.classification_report(y_true=expected, y_pred=predicted)
print(report)