In [21]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
import pandas as pd
import numpy as np

In [23]:
df = pd.read_csv('/content/drive/MyDrive/AI and ML/Week 8/Copy of trum_tweet_sentiment_analysis.csv')

In [24]:
df_text=df[['text']]

In [25]:
df_text.dropna()

Unnamed: 0,text
0,RT @JohnLeguizamo: #trump not draining swamp b...
1,ICYMI: Hackers Rig FM Radio Stations To Play A...
2,Trump protests: LGBTQ rally in New York https:...
3,"""Hi I'm Piers Morgan. David Beckham is awful b..."
4,RT @GlennFranco68: Tech Firm Suing BuzzFeed fo...
...,...
1850118,Everytime im like 'How the fuck I follow Melan...
1850119,RT @imgur: The Trump Handshake. https://t.co/R...
1850120,"""Greenspan warns Trump's policies risk inflati..."
1850121,RT @FasinatingLogic: We must also #INVESTIGATE...


In [26]:
import re
def remove_urls(text):
  """
  This function will try to remove URL present in out dataset and replace it with space using regex library.
  Input Args:
  text: strings of text that may contain URLs.
  Output Args:
  text: URLs replaces with text
  """
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
  return url_pattern.sub(r'', text)


In [27]:
def remove_emoji(string):
  """
  This function will replace the emoji in string with whitespace
  """
  emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
  return emoji_pattern.sub(r' ', string)

In [28]:
def removeunwanted_characters(document):
  """
  This function will remove all the unwanted characters from the input dataset.
  Input Args:
  documet: A text data to be cleaned.
  Return:
  A cleaned document.
  """
  # remove user mentions
  document = re.sub("@[A-Za-z0-9_]+"," ", document)
  # remove hashtags
  document = re.sub("#[A-Za-z0-9_]+","", document)
  # remove punctuation
  document = re.sub("[^0-9A-Za-z ]", "" , document)
  #remove emojis
  document = remove_emoji(document)
  # remove double spaces
  document = document.replace('  ',"")
  return document.strip()

In [29]:
import nltk
nltk.download('punkt_tab')
from nltk import word_tokenize

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [30]:
from nltk.tokenize import RegexpTokenizer

from nltk.tokenize import RegexpTokenizer

def remove_punct(text):
  """
  This function removes the punctutations present in our text data.
  Input Args:
  text: text data.
  Returns:
  text: cleaned text.
  """
  tokenizer = RegexpTokenizer(r"\w+")
  lst=tokenizer.tokenize(' '.join(text))
  return lst


In [31]:
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
custom_stopwords = ['@', 'RT']
stop_words.update(custom_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:

def remove_stopwords(text_tokens):
  """
  This function removes all the stopwords present in out text tokens.
  Input Args:
  text_tokens: tokenize input of our datasets.
  Returns:
  result_tokens: list of token without stopword.
  """

  result_tokens = []
  for token in text_tokens:
    if token not in stop_words:
       result_tokens.append(token)
  return result_tokens

In [33]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

def lemmatization(token_text):
  """
  This function performs the lemmatization operations as explained above.
  Input Args:
  token_text: list of tokens.
  Returns:
  lemmatized_tokens: list of lemmatized tokens.
  """
  lemma_tokens = []
  wordnet = WordNetLemmatizer()
  lemmatized_tokens = [wordnet.lemmatize(token, pos = 'v') for token in token_text]

  return lemmatized_tokens




[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [34]:
from nltk.stem import PorterStemmer

def stemming(text):
  """
  This function performs stemming operations.
  Input Args:
  token_text: list of tokenize text.
  Returns:
  stemm_tokes: list of stemmed tokens.
  """
  porter = PorterStemmer()
  stemm_tokens = []
  for word in text:
    stemm_tokens.append(porter.stem(word))
  return stemm_tokens

In [35]:
def lower_order(text):
  """
  This function converts all the text in input text to lower order.
  Input Args:
  token_text : input text.
  Returns:
  small_order_text : text converted to small/lower order.
  """
  small_order_text = text.lower()
  return small_order_text

# Test:
sample_text = "This Is some Normalized TEXT"
sample_small = lower_order(sample_text)
print(sample_small)


this is some normalized text


In [36]:
def text_cleaning_pipeline(dataset, rule = "lemmatize"):
  """
  This...
  """
  # Convert the input to small/lower order.
  data = lower_order(dataset)
  # Remove URLs
  data = remove_urls(dataset)
  # Remove emojis
  data = remove_emoji(dataset)
  # Remove all other unwanted characters.
  data = removeunwanted_characters(dataset)
  # Create tokens.
  tokens = data.split()
  # Remove stopwords:
  tokens = remove_stopwords(tokens)
  # Stemming or Lemmatization:
  if rule == "lemmatize":
    tokens = lemmatization(tokens)
  elif rule == "stem":
    tokens = stemming(tokens)
  else:
    print("Pick between lemmatize or stem")


  return " ".join(tokens)



In [37]:
# Apply text cleaning to all rows in the text column
df['cleaned_text'] = df['text'].apply(text_cleaning_pipeline)
df = df[['cleaned_text', 'Sentiment']]
df.dropna(inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [38]:
!pip install tensorflow
!pip install Keras



In [39]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['Sentiment'], test_size=0.2, random_state=42)

# Tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding (95th percentile)
import numpy as np

seq_lengths = [len(seq) for seq in X_train_seq]
max_len = int(np.percentile(seq_lengths, 95))

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

In [40]:
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint

# RNN Model
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_len),
    SimpleRNN(64),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Callbacks
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_accuracy', save_best_only=True, mode='max')
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train
history = model.fit(X_train_pad, y_train, epochs=10, batch_size=64,
                    validation_split=0.2, callbacks=[checkpoint, early_stop])




Epoch 1/10
[1m18500/18502[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - accuracy: 0.9052 - loss: 0.2354



[1m18502/18502[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m414s[0m 22ms/step - accuracy: 0.9052 - loss: 0.2354 - val_accuracy: 0.9472 - val_loss: 0.1481
Epoch 2/10
[1m18501/18502[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - accuracy: 0.9526 - loss: 0.1318



[1m18502/18502[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m427s[0m 21ms/step - accuracy: 0.9526 - loss: 0.1318 - val_accuracy: 0.9507 - val_loss: 0.1391
Epoch 3/10
[1m18499/18502[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - accuracy: 0.9606 - loss: 0.1120



[1m18502/18502[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m431s[0m 21ms/step - accuracy: 0.9606 - loss: 0.1120 - val_accuracy: 0.9529 - val_loss: 0.1437
Epoch 4/10
[1m18499/18502[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.9649 - loss: 0.1009



[1m18502/18502[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 21ms/step - accuracy: 0.9649 - loss: 0.1009 - val_accuracy: 0.9537 - val_loss: 0.1327
Epoch 5/10
[1m18502/18502[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m391s[0m 21ms/step - accuracy: 0.9669 - loss: 0.0956 - val_accuracy: 0.9516 - val_loss: 0.1359
Epoch 6/10
[1m18502/18502[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.9682 - loss: 0.0919



[1m18502/18502[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m385s[0m 21ms/step - accuracy: 0.9682 - loss: 0.0919 - val_accuracy: 0.9545 - val_loss: 0.1329
Epoch 7/10
[1m18502/18502[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m446s[0m 21ms/step - accuracy: 0.9687 - loss: 0.0901 - val_accuracy: 0.9524 - val_loss: 0.1348


In [41]:

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = (model.predict(X_test_pad) > 0.5).astype("int32")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


[1m11564/11564[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 3ms/step
Accuracy: 0.953622052564016
Confusion Matrix:
 [[238937   9626]
 [  7535 113927]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.97    248563
           1       0.92      0.94      0.93    121462

    accuracy                           0.95    370025
   macro avg       0.95      0.95      0.95    370025
weighted avg       0.95      0.95      0.95    370025

