In [0]:
# created by Steve

import os
import pandas as pd
import re
import string
import datetime


from textblob import TextBlob

#!pip install tweet-preprocessor
#import preprocessor as p

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('rslp')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.stem.porter import *

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [0]:
# Happy Emoticons
emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])

# sad Emoticons
emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])

#Emoji 
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

#combinamos
emoticons = emoticons_happy.union(emoticons_sad)

In [0]:
# declaramos funçoes
def remove_emoticons(s):
  tweet_tokenizer = TweetTokenizer()
  tokens = tweet_tokenizer.tokenize(s)
  s = " ".join([word for word in tokens if word not in emoticons])
  return s
  
def remove_punct(s):
    s = "".join([char for char in s if char not in string.punctuation])
    s = re.sub('[0-9]+', '', s)
    return s

def remove_unuseful(s):
    s = re.sub(r'http\S+', '', s)
    s = re.sub('(RT|via)((?:\\b\\W*@\\w+)+)', ' ', s)
    s = re.sub(r'@\S+', '', s)
    s = re.sub('&amp', '', s)
    s = re.sub(r'[^\x00-\x7F]+',' ', s)

    return s
  
def remove_stopwords(text, lang,  are_tweets = False, domain_stopwords=[]):
  
  stop_words = nltk.corpus.stopwords.words(lang) # lang='portuguese' or lang='english'
  
  s = str(text).lower() # tudo para caixa baixa
  #table = str.maketrans({key: None for key in string.punctuation})
  #s = s.translate(table) # remove pontuacao
  
  tokens = 0
  if are_tweets:
    tweet_tokenizer = TweetTokenizer()
    tokens = tweet_tokenizer.tokenize(s)
  else:
    tokens = word_tokenize(s) #obtem tokens
  
  v = [i for i in tokens if not i in stop_words and not i in domain_stopwords and not i.isdigit()] # remove stopwords
  s = ""
  for token in v:
    s += token+" "
  return s.strip()

# stemming
def stemming(text,lang, are_tweets=False):

  if lang=='portuguese':
    stemmer = nltk.stem.RSLPStemmer() # stemming para portuguese
  else:
    stemmer = PorterStemmer() # stemming para ingles

  if are_tweets:
    tweet_tokenizer = TweetTokenizer()
    tokens = tweet_tokenizer.tokenize(text)
  else:
    tokens = word_tokenize(s) #obtem tokens
    
  sentence_stem = ''
  doc_text_stems = [stemmer.stem(i) for i in tokens]
  for stem in doc_text_stems:
    sentence_stem += stem+" "
    
  return sentence_stem.strip()

def preprocess_text(text, lang='english', are_tweets = False, domain_stopwords=[]):
  text = remove_stopwords(text, lang, True, domain_stopwords)
  text = remove_emoticons(text)
  text = remove_unuseful(text)
  text = remove_punct(text)
  #text = stemming(text, lang, True)

  if are_tweets:
    tweet_tokenizer = TweetTokenizer()
    tokens = tweet_tokenizer.tokenize(text)
  else:
    tokens = word_tokenize(text) #obtem tokens)

  return ' '.join(tokens)

def process_tweets(dataset, lang):   

  tweets = []
  for index, row in dataset.iterrows():

    clean_tweet = preprocess_text(row['text'], are_tweets = True)

    #The polarity score is a float within the range [-1.0, 1.0]. 
    #The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.
    blob = TextBlob(clean_tweet)
    Sentiment = blob.sentiment

    polarity = Sentiment.polarity
    subjectivity = Sentiment.subjectivity

    new_entry = [row['date'], ['text'], clean_tweet, polarity, subjectivity]

    tweets.append(new_entry)
    
  return tweets

In [0]:
orig_text = "The #IndustryMarketplace is the turning point for #Industry40 and brings us one step closer to a fully autonomous and decentralized #MachineEconomy. Learn more about the platform &amp; its architecture. https://t.co/pDWtdGlAAu Participate now at https://t.co/m0yHmT1OXu"
print(orig_text)
text = preprocess_text(orig_text, are_tweets=True)
print(orig_text)
print("%s polarity: %s subjectivity %s" % (text, TextBlob(text).polarity, TextBlob(text).subjectivity))


text = "I do not like a iphone :) :-) ;) =D"
text = preprocess_text(text,'english', True)
text = stemming(text, 'english', True)
print("%s | polarity: %s subjectivity %s" % (text, TextBlob(text).polarity, TextBlob(text).subjectivity))

text = "I like a iphone,%.!>! :) :-) ;) =D"
text = preprocess_text(text,'english', True)
print("%s | polarity: %s subjectivity %s" % (text, TextBlob(text).polarity, TextBlob(text).subjectivity))


like iphon d | polarity: 0.0 subjectivity 0.0
like iphone d | polarity: 0.0 subjectivity 0.0


In [0]:
from google.colab import drive
drive.mount('/content/drive')

#columns of the csv file
COLS = ['date', 'text', 'clean_text', 'polarity','subjectivity']

dataset = pd.read_csv('/content/drive/My Drive/data/tweets_iota.csv', encoding='utf-8')

tweets = process_tweets(dataset, 'english')
#tweets = get_tweets(username)
df = pd.DataFrame(tweets, columns=COLS)

df['date'] = pd.to_datetime(df["date"])

# indexamos 
df['date'] = df["date"].apply( lambda df : 
datetime.datetime(year=df.year, month=df.month, day=df.day))	
df.set_index(df["date"],inplace=True)


sentimento = df.resample('D').mean()
#df['polarity'].resample('D', how='mean')

# remplaza valores NaN por cero en dias que nao teve tweets
sentimento.fillna(0, inplace=True)
sentimento.head()






**Visualizamos**

In [0]:
fig, ax = plt.subplots(figsize=(15,7))
#df.groupby(['date']).mean()['polarity'].plot(ax=ax)
df.resample('D').mean()['polarity'].plot(ax=ax)
#df.resample('D').mean()['subjectivity'].plot(ax=ax)

In [0]:
# grabamos a .csv
sentimento.to_csv('sentimento_iota.csv', columns=['polarity', 'subjectivity'], index=True, encoding="utf-8")
print("dados salvos ...")
