# Initial Requirements

In [1]:
# Mount Google Drive into the Colab environment so the tweet files stored
# there can be read from this notebook.
from google.colab import drive
drive.mount('/content/gdrive')

# Copy the two raw tweet dumps from Drive into the current working directory
# under shorter names (before_debate.txt / after_debate.txt).
!cp '/content/gdrive/My Drive/Colab Notebooks/Twitter_Analysis/OLD_tweets_with_hashtag_AIDebate.txt' before_debate.txt
!cp '/content/gdrive/My Drive/Colab Notebooks/Twitter_Analysis/tweets_with_hashtag_AIDebate1.txt' after_debate.txt




# Set the environment variable that selects Keras's backend to Theano.
# This runs before the emotion predictor is imported in a later cell.
import os; os.environ['KERAS_BACKEND'] = 'theano'

# Fetch the emotion predictor model repository.
# NOTE: when the directory already exists (e.g. on a re-run), git reports
# "destination path ... already exists" and does not clone again.
!git clone https://github.com/nikicc/twitter-emotion-recognition.git 
!ls # list of files

# Change the notebook's working directory to the cloned repository.
# Presumably this is what lets the later `from emotion_predictor import ...`
# resolve, and it is where requirements.txt is looked up below.
%cd twitter-emotion-recognition/

# Install the library versions listed in the repository's requirements.txt.
!pip install -r requirements.txt





# Install the language-detection library (imported later as `langdetect`).
!pip install langdetect

# Download the complete NLTK 'all' data collection. Later cells use
# nltk's word_tokenize and the English stopwords corpus.
import nltk
!python -m nltk.downloader all

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
fatal: destination path 'twitter-emotion-recognition' already exists and is not an empty directory.
after_debate.txt   gdrive	sample_data
before_debate.txt  newfile.txt	twitter-emotion-recognition
/content/twitter-emotion-recognition
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to /ro

# Emotion Analysis Part

In [2]:



# analyze each tweet
import datetime
from emotion_predictor import EmotionPredictor



class TweetEmotion:
  """Thin wrapper around ``EmotionPredictor`` that scores one tweet at a time.

  The predictor is built once in ``__init__`` and reused by every call.
  """

  # Emotion columns read from the predictor output, in the order they
  # appear in the Series returned by ``probability_of_emotion``.
  EMOTIONS = ('Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise')

  def __init__(self):
    """Load the emotion prediction model (classification='ekman', setting='mc')."""
    self.model = EmotionPredictor(classification='ekman', setting='mc')

  def what_emotion(self, tweet):
    """Return the predictor's class prediction for a single tweet.

    The tweet is wrapped in a one-element list because the predictor is
    called with a list of tweets; its result is returned unchanged.
    """
    return self.model.predict_classes([tweet])

  def probability_of_emotion(self, tweet):
    """Return the per-emotion probabilities of a single tweet.

    Args:
      tweet: text of one tweet.

    Returns:
      A ``pandas.Series`` indexed by the names in ``EMOTIONS`` whose values
      are Python floats. Returning a Series lets ``Series.apply`` expand
      the result into one column per emotion.
    """
    # Imported locally so the method does not depend on a later notebook
    # cell having already executed ``import pandas as pd``.
    import pandas as pd

    probability = self.model.predict_probabilities([tweet])
    # Assumes the predictor returns a table with one row per input tweet
    # (a DataFrame, per the library's README) — TODO confirm if the
    # predictor version changes. Reading scalars from the first row avoids
    # calling float() on a one-element Series, which newer pandas deprecates.
    first_row = probability.iloc[0]
    return pd.Series({emotion: float(first_row[emotion]) for emotion in self.EMOTIONS})



# tweet_emotion = TweetEmotion()






Using Theano backend.


# Tweet Cleaning and Sentiment Analysis

In [0]:
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import langdetect

def sentiment_and_freq(tweet):
  """Detect language, score sentiment and flag speaker mentions for one tweet.

  Args:
    tweet: text of one tweet. A non-string value (e.g. NaN from an empty
      CSV cell) is treated as an empty tweet.

  Returns:
    A ``pandas.Series`` with four entries:
      is_english        -- True when langdetect reports 'en' for the raw tweet
      polarity_of_tweet -- TextBlob sentiment polarity of the cleaned tweet
      has_yoshua        -- True when any of 'yoshua', 'bengio', 'yoshuabengio'
                           occurs in the cleaned tweet
      has_gary          -- True when any of 'gary', 'marcus', 'garymarcus'
                           occurs in the cleaned tweet
  """
  # Local imports keep the function usable even when the notebook-level
  # pandas import (in a later cell) has not run yet.
  import string
  import pandas as pd
  from langdetect.lang_detect_exception import LangDetectException
  from nltk.corpus import stopwords

  # Missing values are not strings and would crash the tokenizer below.
  if not isinstance(tweet, str):
    tweet = ''

  # Language check on the raw text. langdetect raises when the text has no
  # usable features (empty, emoji-only, URL-only, ...); such a tweet is
  # counted as non-English instead of aborting the whole DataFrame.apply.
  # NOTE(review): langdetect is documented as non-deterministic unless
  # DetectorFactory.seed is set — consider seeding it once in a setup cell.
  try:
    is_english = langdetect.detect(tweet) == 'en'
  except LangDetectException:
    is_english = False

  # Clean the tweet: tokenize, lower-case, strip punctuation characters,
  # keep purely alphabetic tokens, then drop English stop words.
  tokens = [w.lower() for w in word_tokenize(tweet)]
  table = str.maketrans('', '', string.punctuation)
  stripped = [w.translate(table) for w in tokens]
  words = [word for word in stripped if word.isalpha()]
  stop_words = set(stopwords.words('english'))
  words = [w for w in words if w not in stop_words]

  # Re-join the surviving tokens and score sentiment on the cleaned text.
  cleaned_sentence = TreebankWordDetokenizer().detokenize(words)
  polarity_of_tweet = TextBlob(cleaned_sentence).sentiment.polarity

  # Speaker mentions: first name, family name, or both fused.
  yoshua_name = ['yoshua' , 'bengio', 'yoshuabengio']
  gary_name = ['gary' , 'marcus', 'garymarcus']

  # Substring test against the cleaned sentence (not a whole-token match),
  # so a name embedded in a longer token also counts as a mention.
  has_yoshua = any(name in cleaned_sentence for name in yoshua_name)
  has_gary = any(name in cleaned_sentence for name in gary_name)

  return pd.Series({'is_english': is_english, 'polarity_of_tweet':polarity_of_tweet, 'has_yoshua':has_yoshua, 'has_gary':has_gary})




# Analyze All Tweets

In [4]:
import json
import pandas as pd
# import TweetEmotion

# Path (on the mounted Drive) of the CSV file holding the tweets.
file = '/content/gdrive/My Drive/Colab Notebooks/Twitter_Analysis/ALL_tweets_with_hashtag_AIDebate.csv'

# Load the CSV into a pandas DataFrame; the code below reads its
# 'date' and 'text' columns.
df = pd.read_csv(file)

# Parse the 'date' strings (format YYYY-MM-DD HH:MM:SS) into datetimes.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S')

# Build the emotion prediction wrapper once; it is reused for every tweet.
tweet_emotion = TweetEmotion()

# Score every tweet: probability_of_emotion returns a Series per tweet,
# which is assigned to one column per emotion.
df[['Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise']] = df['text'].apply(tweet_emotion.probability_of_emotion)

# For every tweet: language flag, sentiment polarity, and whether each
# speaker's name is mentioned; again one column per returned entry.
df[['is_english',  'polarity_of_tweet',  'has_yoshua',  'has_gary']] = df['text'].apply(sentiment_and_freq)
                     

# Show the parsed timestamps as a quick check of the date conversion.
print(df['date'])


0     2019-12-31 17:27:05
1     2019-12-31 17:11:53
2     2019-12-31 17:04:32
3     2019-12-31 14:36:41
4     2019-12-31 14:00:51
              ...        
435   2019-12-22 18:59:43
436   2019-12-22 18:30:11
437   2019-12-22 15:06:56
438   2019-12-22 15:01:19
439   2019-12-22 14:59:50
Name: date, Length: 440, dtype: datetime64[ns]


# Save the Results

In [0]:
# Destination on the mounted Drive for the analyzed tweets.
save_path = '/content/gdrive/My Drive/Colab Notebooks/Twitter_Analysis/Analyzed_tweets.csv'
# Write the DataFrame, including the columns added by the analysis cell,
# as CSV. to_csv is called with defaults, so the row index is written too.
df.to_csv(save_path)