<a href="https://colab.research.google.com/github/sean-halpin/ml_tweepy_proj/blob/main/svm_text_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime and switch the working directory
# to the notebook folder, so the relative CSV paths read later resolve.
from google.colab import drive
drive.mount('/content/drive/')
import os
os.chdir("/content/drive/My Drive/Colab Notebooks")

In [None]:
!pip install vaderSentiment
!pip install spellchecker
!pip install pyspellchecker
!pip install plot_keras_history

In [None]:
import numpy as np
import pandas as pd 
import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize 
from nltk.stem.porter import *
from nltk.stem.wordnet import WordNetLemmatizer
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

In [None]:
stop_words = set(stopwords.words('english'))
def no_stopwords(text):
  """Return `text` with English stop words removed.

  The text is split with NLTK's `word_tokenize`; every token that appears in
  the module-level `stop_words` set is dropped and the remaining tokens are
  joined back together with single spaces.

  Args:
    text: the string to filter.

  Returns:
    The filtered tokens joined by a single space.
  """
  # One pass is enough: the filtered list used to be built twice, with the
  # first result thrown away.
  return " ".join(w for w in word_tokenize(text) if w not in stop_words)

In [None]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of `text` and re-join with spaces."""
    tokens = w_tokenizer.tokenize(text)
    lemmas = map(lemmatizer.lemmatize, tokens)
    return " ".join(lemmas)

In [None]:
# Punctuation characters to strip from tweets; `output` is read by
# prepare_text_df when it builds its translation table.
output= string.punctuation
print('list of punctuations:', output)

In [None]:
def prepare_text_df(df):
  """Clean the `tweet` column and derive a `lemmatized_tweet` column.

  Steps, in order:
    1. delete text matching the noise pattern (URLs, @-handles, `&amp;`
       sequences and HTML-like tags), ignoring case,
    2. lower-case the text and strip surrounding whitespace,
    3. delete every character contained in the module-level `output` string,
    4. lemmatize and remove stop words, storing the result in
       `lemmatized_tweet`,
    5. drop rows whose `lemmatized_tweet` duplicates another row, keeping the
       last occurrence.

  The `tweet` and `lemmatized_tweet` columns are assigned on the frame that is
  passed in, so the caller's DataFrame is modified as well.

  Args:
    df: DataFrame with a string column named `tweet`.

  Returns:
    The de-duplicated DataFrame.
  """
  # Raw string so the `\S` escapes reach the regex engine untouched and do not
  # trigger invalid-escape warnings.
  noise_pattern = r'http\S+|www.\S+|@.\S+|&amp;.\S+|<.*?>'
  # regex=True must be explicit: pandas 2.x defaults to regex=False, which
  # would treat the whole pattern as a literal string and remove nothing.
  df['tweet'] = df['tweet'].str.replace(noise_pattern, '', case=False, regex=True)
  df['tweet'] = df['tweet'].str.lower()
  df['tweet'] = df['tweet'].str.strip()
  df['tweet'] = df['tweet'].str.translate(str.maketrans('', '', output))
  df['lemmatized_tweet'] = df['tweet'].apply(lemmatize_text).apply(no_stopwords)
  df = df.drop_duplicates('lemmatized_tweet', keep='last')
  return df

In [None]:
def random_under_sample(X,y):
  """Resample `X` / `y` with imblearn's RandomUnderSampler (random_state=0).

  Prints the sorted per-class counts of the resampled labels and returns the
  resampled features and labels, each flattened to one dimension.
  """
  sampler = RandomUnderSampler(random_state=0)
  X_balanced, y_balanced = sampler.fit_resample(X,y)
  class_counts = Counter(y_balanced)
  print(sorted(class_counts.items()))
  return X_balanced.flatten(), y_balanced.flatten()

In [None]:
def load_data_musk():
  """Load `tweets_annotated.1650577206.elonmusk.csv`, clean it and under-sample it.

  Returns:
    DataFrame with the columns `lemmatized_tweet` and `sentiment`.
  """
  cleaned = prepare_text_df(pd.read_csv("tweets_annotated.1650577206.elonmusk.csv"))
  # Both arrays are reshaped to column vectors before being resampled.
  features = cleaned.lemmatized_tweet.values.reshape(-1,1)
  targets = cleaned.sentiment.values.reshape(-1,1)
  balanced_X, balanced_y = random_under_sample(features, targets)
  return pd.DataFrame({'lemmatized_tweet': balanced_X, 'sentiment': balanced_y})

In [None]:
df_resampled = load_data_musk()

In [None]:
df_resampled.head()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the balanced tweets for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(df_resampled.lemmatized_tweet, df_resampled.sentiment, test_size=0.2, random_state=32)

# Text Classifier

In [None]:
# Pipeline: token counts (capped at 100 features) -> TF-IDF weighting ->
# linear classifier trained by SGD with hinge loss and L2 penalty.
# NOTE(review): max_iter=3 with tol=None presumably means exactly three passes
# over the training data with no early stopping — confirm this is enough.
text_clf = Pipeline([
    ('vect', CountVectorizer(max_features=100)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=3, tol=None)),
])

text_clf.fit(X_train, y_train)

predicted = text_clf.predict(X_test)
# Fraction of test tweets whose predicted label equals the true label.
np.mean(predicted == y_test)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import seaborn as sns
def clf_metrics(y_test, predicted, labels=None):
  """Print a classification report and draw a confusion-matrix heatmap.

  Args:
    y_test: true class labels.
    predicted: predicted class labels, same length as `y_test`.
    labels: optional explicit class order for the matrix rows and columns.
      When omitted, the sorted set of labels occurring in `y_test` or
      `predicted` is used, so the tick labels always name the classes that
      are actually plotted (string labels or integer class ids alike).
  """
  if labels is None:
    observed = np.concatenate(
        [np.asarray(y_test).ravel(), np.asarray(predicted).ravel()])
    # Same sorted-union order that confusion_matrix would pick on its own.
    labels = np.unique(observed).tolist()
  else:
    labels = list(labels)

  print(metrics.classification_report(y_test, predicted))
  # Passing `labels` pins the row/column order to the tick labels below.
  c_matrix = metrics.confusion_matrix(y_test, predicted, labels=labels)
  ax = sns.heatmap(c_matrix, annot=True,
                   xticklabels=labels,
                   yticklabels=labels,
                   cbar=True, cmap='Blues', fmt='g')
  ax.set_xlabel("Prediction")
  ax.set_ylabel("Actual")

In [None]:
clf_metrics(y_test, predicted)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve


def plot_learning_curve(
    estimator,
    title,
    X,
    y,
    axes=None,
    ylim=None,
    cv=None,
    n_jobs=None,
    train_sizes=np.linspace(0.1, 1.0, 5),
):
    """Plot training and cross-validation score against training-set size.

    Args:
        estimator: estimator handed to `learning_curve`.
        title: title of the plot.
        X: input data handed to `learning_curve`.
        y: targets handed to `learning_curve`.
        axes: Axes to draw on; when None a new 20x5 figure is created.
        ylim: optional (min, max) pair applied to the y axis.
        cv: forwarded to `learning_curve`.
        n_jobs: forwarded to `learning_curve`.
        train_sizes: forwarded to `learning_curve`.

    Returns:
        The `matplotlib.pyplot` module, so the caller can call `.show()`.
    """
    if axes is None:
        _, ax = plt.subplots(1, 1, figsize=(20, 5))
    else:
        # Draw on the caller's Axes; without this binding a supplied `axes`
        # would leave `ax` undefined and raise NameError below.
        ax = axes

    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")

    # Fit times are not plotted, so they are not requested.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
    )

    # (mean, std, colour, legend label) for each curve, averaged over the folds.
    curves = [
        (np.mean(train_scores, axis=1), np.std(train_scores, axis=1),
         "r", "Training score"),
        (np.mean(test_scores, axis=1), np.std(test_scores, axis=1),
         "g", "Cross-validation score"),
    ]

    # Plot learning curve: +/- one standard deviation bands first, lines on top.
    ax.grid()
    for mean, std, colour, _ in curves:
        ax.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        ax.plot(train_sizes, mean, "o-", color=colour, label=label)
    ax.legend(loc="best")
    return plt


# NOTE(review): the curve is cross-validated on the held-out test split
# (X_test / y_test) only — confirm this is intended rather than the training
# split or the full balanced data set.
plot_learning_curve(
    text_clf, "LC ", X=X_test, y=y_test,ylim=(0.1, 1.01), cv=5, n_jobs=-1
)

plt.show()

# SVM Classification Performance against another topic

In [None]:
def load_data_formula_one():
  """Load `tweets_annotated.1650575029.formulaone.csv`, clean it and under-sample it.

  Returns:
    DataFrame with the columns `lemmatized_tweet` and `sentiment`.
  """
  cleaned = prepare_text_df(pd.read_csv("tweets_annotated.1650575029.formulaone.csv"))
  # Both arrays are reshaped to column vectors before being resampled.
  features = cleaned.lemmatized_tweet.values.reshape(-1,1)
  targets = cleaned.sentiment.values.reshape(-1,1)
  balanced_X, balanced_y = random_under_sample(features, targets)
  frame = pd.DataFrame({'lemmatized_tweet': balanced_X, 'sentiment': balanced_y})
  return frame

In [None]:
df_f1_resampled = load_data_formula_one()

In [None]:
predicted_f1 = text_clf.predict(df_f1_resampled.lemmatized_tweet)

In [None]:
clf_metrics(df_f1_resampled.sentiment, predicted_f1)

# VADER

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
analyzer = SentimentIntensityAnalyzer()
def vader_sentiment(sentence):
  """Classify `sentence` as 'positive', 'negative' or 'neutral' with VADER.

  Reads the compound score from the module-level `analyzer` and applies the
  thresholds documented by the VADER authors: a score >= 0.05 is positive,
  a score <= -0.05 is negative, anything in between is neutral.

  Args:
    sentence: the text to score.

  Returns:
    One of the strings 'positive', 'negative' or 'neutral'.
  """
  compound = analyzer.polarity_scores(sentence)['compound']
  # Inclusive bounds: a score of exactly +/-0.05 belongs to the polar class.
  if compound >= 0.05:
    return 'positive'
  if compound <= -0.05:
    return 'negative'
  return 'neutral'

In [None]:
# X_test holds the lemmatized, stop-word-filtered tweet text from the SVM split.
# NOTE(review): VADER presumably relies on punctuation, capitalisation and
# negation words that the cleaning removed — confirm whether raw tweets should
# be scored instead.
vader_predicted = X_test.apply(vader_sentiment)

In [None]:
clf_metrics(y_test, vader_predicted)

In [None]:
# Number of test tweets annotated as neutral that VADER labelled positive.
len(X_test[(y_test == "neutral") & (vader_predicted == "positive")])

# Deep Neural Network Classifier

In [None]:
import numpy as np
import pandas as pd
import re
import gensim
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [None]:
df_resampled = load_data_musk()

In [None]:
def labels_categorical(data):
  """One-hot encode sentiment labels.

  Class indices: 'neutral' -> 0, 'negative' -> 1, 'positive' -> 2.

  Args:
    data: sequence (list, array or Series) of sentiment strings.

  Returns:
    The float32 one-hot matrix produced by `tf.keras.utils.to_categorical`
    with three classes, one row per input label.

  Raises:
    ValueError: if `data` contains any label other than the three above.
      Skipping such labels silently would make the result shorter than
      `data` and misalign it with the feature rows.
  """
  class_index = {'neutral': 0, 'negative': 1, 'positive': 2}
  raw_labels = np.asarray(data).ravel()

  # Validate before encoding so a bad label fails loudly and early.
  unknown = sorted({str(label) for label in raw_labels if label not in class_index})
  if unknown:
    raise ValueError("Unknown sentiment label(s): {}".format(unknown))

  y = np.array([class_index[label] for label in raw_labels])
  return tf.keras.utils.to_categorical(y, len(class_index), dtype="float32")

In [None]:
labels = labels_categorical(df_resampled.sentiment)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers

# max_words caps the tokenizer vocabulary (and sizes the model's embedding
# input); max_len is the length every sequence is padded to.
max_words = 500
max_len = 200

# Fit the vocabulary on the cleaned tweets, turn each tweet into a sequence of
# word indices and pad the sequences to max_len.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df_resampled.lemmatized_tweet)
sequences = tokenizer.texts_to_sequences(df_resampled.lemmatized_tweet)
tweets = pad_sequences(sequences, maxlen=max_len)
print(tweets)

In [None]:
# 70/30 split of the padded sequences and one-hot labels. This rebinds
# X_train / X_test / y_train / y_test, which the SVM and VADER cells above
# also use, so those cells must be re-run from their own split beforehand.
X_train, X_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.3, random_state=32)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
from keras.models import Sequential
from keras import layers
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from keras.layers import Embedding
# NOTE(review): embedding_layer is not referenced by any other visible cell —
# looks like a leftover; confirm before removing.
embedding_layer = Embedding(128, 16)

# Embedding (max_words indices -> 12-d vectors) -> LSTM with 12 units and
# dropout 0.4 -> 3-way softmax over the sentiment classes.
model1 = Sequential()
model1.add(layers.Embedding(max_words, 12))
model1.add(layers.LSTM(12,dropout=0.4, return_sequences=False))
model1.add(layers.Dense(3,activation='softmax'))

model1.compile(optimizer='rmsprop',loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
model1.summary()

In [None]:
# Checkpoint the model with the best validation accuracy so it is not lost if
# later epochs get worse. The deprecated `period=1` argument is omitted: the
# default save_freq='epoch' already checks after every epoch.
# NOTE(review): validation_data is the test split, so the checkpoint is chosen
# on the same data it is later evaluated on — consider a separate validation split.
checkpoint1 = ModelCheckpoint("best_model1.hdf5", monitor='val_accuracy', verbose=1, save_best_only=True, mode='auto', save_weights_only=False)
history = model1.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test), callbacks=[checkpoint1])

In [None]:
import keras

# Reload the checkpoint file that ModelCheckpoint wrote during training.
best_model = keras.models.load_model("best_model1.hdf5")

In [None]:
# Loss and accuracy of the restored checkpoint on the 30% held-out split.
test_loss, test_acc = best_model.evaluate(X_test, y_test, verbose=2)
print('Model accuracy: ',test_acc)

In [None]:
predictions = best_model.predict(X_test)

In [None]:
# Map class indices back to the names used by labels_categorical
# (0 = neutral, 1 = negative, 2 = positive) so the report and the heatmap
# ticks name the right classes. argmax is taken on the raw probabilities:
# rounding first turns every row with no probability above 0.5 into all zeros,
# which argmax then reports as class 0.
class_names = np.array(['neutral', 'negative', 'positive'])
clf_metrics(class_names[y_test.argmax(axis=1)], class_names[predictions.argmax(axis=1)])

In [None]:
from plot_keras_history import show_history, plot_history
import matplotlib.pyplot as plt

# Display the training history, then (presumably) render it again to
# standard.png via the `path` argument and close the current figure.
show_history(history)
plot_history(history, path="standard.png")
plt.close()