<a href="https://colab.research.google.com/github/sean-halpin/ml_tweepy_proj/blob/main/svm_text_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install vaderSentiment
!pip install spellchecker
!pip install pyspellchecker

In [None]:
import os
from google.colab import drive

# Mount Google Drive inside the Colab VM, then switch the working directory to
# the notebooks folder so that relative file paths are resolved from there.
drive.mount('/content/drive/')
os.chdir("/content/drive/My Drive/Colab Notebooks")

In [None]:
import numpy as np
import pandas as pd 
import re
import string
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources used further down:
#   'stopwords' -> stopwords.words('english')
#   'punkt'     -> word_tokenize
#   'wordnet'   -> WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize 
# NOTE(review): wildcard import; nothing from nltk.stem.porter appears to be used
# in the cells visible here — confirm before removing.
from nltk.stem.porter import *
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
# Load the annotated tweets. Later cells read the 'tweet' and 'sentiment' columns.
# The path is relative, so it is resolved against the current working directory.
df = pd.read_csv("tweets_annotated.1650577206.elonmusk.csv")

In [None]:
from spellchecker import SpellChecker

# Single module-level checker instance, reused by spelling_checks().
spelling = SpellChecker()
def spelling_checks(text):
    """Return `text` with words the spell checker does not know replaced by its best guess.

    The text is split on whitespace and re-joined with single spaces.

    Args:
        text: The string to correct.

    Returns:
        The corrected string. A word is left unchanged when the checker knows it
        or when the checker has no suggestion for it.
    """
    words = text.split()
    typo_words = spelling.unknown(words)
    corrected = []
    for word in words:
        # unknown() may report words in lower case, so compare case-insensitively
        # as well; otherwise capitalised typos would never be corrected.
        if word in typo_words or word.lower() in typo_words:
            suggestion = spelling.correction(word)
            # correction() can return None when there is no candidate; keep the
            # original word so the final join does not fail on a non-string.
            corrected.append(word if suggestion is None else suggestion)
        else:
            corrected.append(word)
    return " ".join(corrected)

In [None]:
# English stop words held in a set; no_stopwords() tests membership once per token.
stop_words = set(stopwords.words('english'))
def no_stopwords(text):
  """Return `text` with every token found in `stop_words` removed.

  The text is tokenized with `word_tokenize`; the surviving tokens are joined
  with single spaces. The comparison is exact (case-sensitive).

  Args:
    text: The string to filter.

  Returns:
    The space-joined tokens that are not in `stop_words`.
  """
  # Single pass; the original built the same list twice and discarded the first.
  return " ".join(w for w in word_tokenize(text) if w not in stop_words)

In [None]:
# Module-level tokenizer and lemmatizer instances shared by lemmatize_text().
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of `text` and re-join with single spaces."""
    tokens = w_tokenizer.tokenize(text)
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

In [None]:
# Characters treated as punctuation; used later to strip them from the tweets.
output= string.punctuation
print('list of punctuations:', output)

In [None]:
# Remove URLs, @mentions, "&amp;" entities and HTML tags.
# regex=True is explicit because pandas >= 2.0 defaults Series.str.replace to
# literal matching, which would silently leave this pattern unmatched.
# The raw string avoids invalid-escape warnings for "\S"; the pattern is unchanged.
# NOTE(review): the '.' after 'www' is unescaped and '@.\S+' needs at least two
# characters after '@' — confirm this is intended.
df['tweet'] = df['tweet'].str.replace(
    r'http\S+|www.\S+|@.\S+|&amp;.\S+|<.*?>', '', case=False, regex=True)
df['tweet'] = df['tweet'].str.lower()
df['tweet'] = df['tweet'].str.strip()
# Delete every character listed in `output` (string.punctuation).
df['tweet'] = df['tweet'].str.translate(str.maketrans('', '', output))
# Lemmatize first, then drop stop words; the result is the model's text feature.
df['lemmatized_tweet'] = df['tweet'].apply(lemmatize_text).apply(no_stopwords)

In [None]:
# Keep one row per distinct 'tweet' value (the last occurrence wins).
df = df.drop_duplicates(subset='tweet', keep='last')

In [None]:
# X is reshaped to 2-D (n_samples, 1) because the resampler expects a feature matrix.
X = df.lemmatized_tweet.values.reshape(-1, 1)
# y stays 1-D (n_samples,): a column vector here triggers scikit-learn's
# DataConversionWarning and is flattened anyway.
y = df.sentiment.values

In [None]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
# Randomly under-sample the data; random_state=0 makes the draw repeatable.
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X,y)
# Show the number of rows per sentiment label after resampling.
print(sorted(Counter(y_resampled).items()))

In [None]:
# Rebuild a DataFrame from the resampled arrays; X_resampled is flattened back
# to one text value per row.
df_resampled = pd.DataFrame ({ 'lemmatized_tweet': X_resampled.flatten(), 'sentiment': y_resampled})

In [None]:
# Preview the first rows of the resampled frame.
df_resampled.head()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the resampled rows for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(df_resampled.lemmatized_tweet, df_resampled.sentiment, test_size=0.3, random_state=32)

In [None]:
# Token counts -> TF-IDF weighting -> linear classifier trained with SGD.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Hinge loss with an L2 penalty. With tol=None there is no tolerance-based
    # early stop, so training is bounded by max_iter epochs.
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

# Fit on the training texts and labels.
text_clf.fit(X_train, y_train)

predicted = text_clf.predict(X_test)
# Fraction of held-out tweets whose predicted label equals the annotation (accuracy).
np.mean(predicted == y_test)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import seaborn as sns
def clf_metrics(y_test, predicted, labels=('negative', 'neutral', 'positive')):
  """Print a classification report and draw a labelled confusion-matrix heatmap.

  Args:
    y_test: True labels.
    predicted: Predicted labels, aligned with `y_test`.
    labels: Class names in the order used for the matrix rows/columns and for
      the axis tick labels. Defaults to the three sentiment classes.

  Returns:
    None. The report is printed and the heatmap is drawn with seaborn.
  """
  class_names = list(labels)
  print(metrics.classification_report(y_test, predicted))
  # Pass the label order explicitly so the matrix always lines up with the tick
  # labels, even when a class is absent from both y_test and predicted.
  c_matrix = metrics.confusion_matrix(y_test, predicted, labels=class_names)
  ax = sns.heatmap(c_matrix, annot=True,
                   xticklabels=class_names,
                   yticklabels=class_names,
                   cbar=True, cmap='Blues', fmt='g')
  ax.set_xlabel("Prediction")
  ax.set_ylabel("Actual")

In [None]:
# Report and confusion matrix for the SGD pipeline's predictions on the held-out split.
clf_metrics(y_test, predicted)

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Single analyzer instance, reused by every vader_sentiment() call.
analyzer = SentimentIntensityAnalyzer()
def vader_sentiment(sentence, threshold=0.05):
  """Map VADER's compound score for `sentence` to a sentiment label.

  Args:
    sentence: Text to score.
    threshold: Magnitude the compound score must reach to count as non-neutral.
      Defaults to 0.05.

  Returns:
    'positive' when compound >= threshold, 'negative' when compound <= -threshold,
    otherwise 'neutral'.
  """
  comp = analyzer.polarity_scores(sentence)['compound']
  # Bounds are inclusive, following the thresholds in the VADER documentation;
  # the previous strict comparisons labelled scores of exactly +/-0.05 neutral.
  if comp >= threshold:
    return 'positive'
  if comp <= -threshold:
    return 'negative'
  return 'neutral'

In [None]:
# Label every held-out text with VADER for comparison against the trained model.
# NOTE(review): X_test holds the lower-cased, punctuation-stripped, stop-word-filtered
# text. VADER is documented to use punctuation, capitalisation and negation cues,
# so scoring the raw tweets would likely be a fairer baseline — confirm.
vader_predicted = X_test.apply(vader_sentiment)

In [None]:
# Same report and confusion matrix, this time for the VADER labels.
clf_metrics(y_test, vader_predicted)

In [None]:
# Show full cell text when displaying tweets. None means "no limit"; the old -1
# sentinel is rejected by pandas >= 2.0. The full option name avoids relying on
# pattern matching of the abbreviated key.
pd.set_option('display.max_colwidth', None)

In [None]:
# Number of held-out tweets annotated "neutral" that VADER labelled "positive".
len(X_test[(y_test == "neutral") & (vader_predicted == "positive")])

In [None]:
# Display those tweets (annotated "neutral", VADER "positive") for manual inspection.
X_test[(y_test == "neutral") & (vader_predicted == "positive")]