In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint
from datetime import datetime
import collections
import re

import nltk
nltk.download('all')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, pos_tag

from wordcloud import WordCloud

In [None]:
# Labelled tweet dump: ';'-separated, re-indexed by its `id` column.
TWEETS_CSV = '/content/drive/MyDrive/Colab Notebooks/3rd project/data/eng/tweets_labelled_09042020_16072020.csv'

eng = pd.read_csv(TWEETS_CSV, sep=';')
eng = eng.set_index('id')
eng.shape

(5000, 3)

In [None]:
# Keep only the tweets that carry a sentiment label.  The explicit .copy()
# makes `eng` an independent frame, so the columns added to it later
# (text2, dic_s, 0/2) are written to it directly instead of to a slice of the
# loaded frame, which pandas reports as SettingWithCopyWarning.
eng = eng[eng['sentiment'].notnull()].copy()

In [None]:
# A whitespace-separated token is a ticker when the whole token is '$'
# followed by capital letters (e.g. $AMD) or is exactly $ES_F; it is a hashtag
# when the whole token is '#' followed by word characters.
ticker_pattern = re.compile(r'(^\$[A-Z]+|^\$ES_F)')
ht_pattern = re.compile(r'#\w+')

ticker_dic = collections.defaultdict(int)  # ticker without the leading '$' -> count
ht_dic = collections.defaultdict(int)      # lower-cased hashtag (with '#') -> count

for text in eng['text']:
    for word in text.split():
        if ticker_pattern.fullmatch(word) is not None:
            ticker_dic[word[1:]] += 1
        else:
            # The hashtag test must run for tokens that are NOT tickers; when it
            # was nested under the ticker branch it could never succeed, because
            # a token starting with '$' cannot also match '#\w+'.
            word = word.lower()
            if ht_pattern.fullmatch(word) is not None:
                ht_dic[word] += 1

In [None]:
# Compiled regular expressions shared by the text-cleaning helpers.
charonly = re.compile(r'[^a-zA-Z\s]')   # anything that is not an ASCII letter or whitespace
handle_pattern = re.compile(r'@\w+')    # '@' followed by word characters
# NOTE(review): the last range (U+24C2..U+1F251) is very wide; besides emoji it
# also spans e.g. the CJK and Hangul blocks. Kept unchanged here.
emoji_pattern = re.compile("["
                        u"\U0001F600-\U0001F64F"  # emoticons
                        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                        u"\U0001F680-\U0001F6FF"  # transport & map symbols
                        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                        u"\U00002702-\U000027B0"
                        u"\U000024C2-\U0001F251"
                        "]+", flags=re.UNICODE)
# Raw strings: '\(' and '\.' are regex escapes, not Python string escapes, and
# non-raw literals containing them trigger invalid-escape warnings.
url_pattern = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
pic_pattern = re.compile(r'pic\.twitter\.com/.{10}')
special_code = re.compile(r'(&amp;|&gt;|&lt;)')   # HTML entities left in the tweet text
tag_pattern = re.compile(r'<.*?>')                # shortest '<...>' span

# NLTK's English stop words plus the retweet markers spelled out below.
STOPWORDS = {
    *stopwords.words('english'),
    'rt', 'retweet', 'RT', 'Retweet', 'RETWEET',
}

# Shared WordNet lemmatizer instance used by tokenize_stem.
lemmatizer = WordNetLemmatizer()

def hashtag(phrase):
    """Return `phrase` with every `ht_pattern` match replaced by a single space."""
    without_hashtags = re.sub(ht_pattern, ' ', phrase)
    return without_hashtags

# Ticker symbols inside running text: '$' not preceded by a word character,
# followed by ES_F or by capital letters, ending at a word boundary.  The
# module's `ticker_pattern` is anchored with '^' for whole-token matching, so
# with .sub() it only ever removed a ticker at the very start of the phrase
# (and cut '$ES_F' down to '_F').
_TICKER_IN_TEXT = re.compile(r'(?<!\w)\$(?:ES_F|[A-Z]+)\b')

def remove_ticker(phrase):
    """Return `phrase` with every ticker symbol (e.g. ``$AMD``, ``$ES_F``) deleted.

    Tickers are removed wherever they occur in the phrase; surrounding
    whitespace is left as is.  Text such as ``$5`` is not a ticker and is kept.
    """
    return _TICKER_IN_TEXT.sub('', phrase)
    
def specialcode(phrase):
    """Return `phrase` with each `special_code` match (&amp;, &gt;, &lt;) replaced by a space."""
    return re.sub(special_code, ' ', phrase)

def emoji(phrase):
    """Return `phrase` with each run of `emoji_pattern` characters replaced by one space."""
    without_emoji = re.sub(emoji_pattern, ' ', phrase)
    return without_emoji

def url(phrase):
    """Return `phrase` with every `url_pattern` match (http/https links) deleted."""
    return re.sub(url_pattern, '', phrase)

def pic(phrase):
    """Return `phrase` with every `pic_pattern` match (pic.twitter.com links) deleted."""
    without_pics = re.sub(pic_pattern, '', phrase)
    return without_pics

def html_tag(phrase):
    """Return `phrase` with every `tag_pattern` match (a '<...>' span) replaced by a space."""
    return re.sub(tag_pattern, ' ', phrase)

def handle(phrase):
    """Return `phrase` with every `handle_pattern` match ('@' plus word characters) deleted."""
    without_handles = re.sub(handle_pattern, '', phrase)
    return without_handles

def decontracted(phrase):
    """Expand common English contractions in `phrase` and return the result.

    Besides contractions, two normalisations are applied: the stand-alone
    word ``DIS`` becomes ``Disney`` and the stand-alone word ``has`` becomes
    ``have``.  Both are matched as whole words only, so words that merely
    contain those letters ("DISCOUNT", "hashtag", "purchase") are untouched.
    Contractions are recognised with a straight apostrophe (').
    """
    # Irregular forms first, before the generic "n't" rule below can hit them.
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # DIS, ticker symbol of Disney, is interpreted as the plural of "DI"
    # in WordCloud, so it is converted to Disney.
    phrase = re.sub(r"\bDIS\b", "Disney", phrase)

    # General rules.
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"(he|He)\'s", "he is", phrase)
    phrase = re.sub(r"(she|She)\'s", "she is", phrase)
    phrase = re.sub(r"(it|It)\'s", "it is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\bhas\b", "have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

def onlychar(phrase):
    """Return `phrase` with every `charonly` match (non-letter, non-whitespace) deleted."""
    letters_only = re.sub(charonly, '', phrase)
    return letters_only

def remove_stopwords(phrase):
    """Return `phrase` without its stop words, joined by single spaces.

    `phrase` is converted with ``str()`` and split on whitespace.  Each word
    is compared against `STOPWORDS` in lower case, so capitalised stop words
    ("The", "How", "RT") are dropped as well; kept words retain their
    original casing.
    """
    kept = [word for word in str(phrase).split()
            if word.lower() not in STOPWORDS]
    return " ".join(kept)

def tokenize_stem(phrase):
    """Tokenize `phrase` with `word_tokenize` and return the processed tokens joined by spaces.

    Despite the name, each token is passed through `lemmatizer.lemmatize`
    (called with the token only), not through a stemmer.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(phrase))

In [None]:
def arrange_text(ds):
    """Add a cleaned copy of ``ds['text']`` to `ds` as column ``text2`` (in place).

    The cleaning helpers are applied in the order listed in `steps`.
    `decontracted` runs before `onlychar`: `onlychar` deletes apostrophes,
    so running it first would leave no contraction for `decontracted` to
    expand.

    Parameters
    ----------
    ds : DataFrame with a string column ``text``.

    Returns
    -------
    None; `ds` is modified in place.
    """
    steps = (
        emoji, handle, specialcode, hashtag, url, pic, html_tag,
        decontracted, onlychar, tokenize_stem, remove_stopwords,
    )
    cleaned = ds['text']
    for step in steps:
        cleaned = cleaned.apply(step)
    ds['text2'] = cleaned

In [None]:
# Build the cleaned `text2` column on `eng` (arrange_text assigns it on the
# frame it is given), then preview the first rows.
arrange_text(eng)
eng.head()

Unnamed: 0_level_0,created_at,text,sentiment,text2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
77522,2020-04-15 01:03:46+00:00,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",positive,Yo Enter WIN Monarch Tokens US Stock Market Cr...
661634,2020-06-25 06:20:06+00:00,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,negative,surcharge fuel removed The surcharge Rs impose...
413231,2020-06-04 15:41:45+00:00,Net issuance increases to fund fiscal programs...,positive,Net issuance increase fund fiscal program yiel...
760262,2020-07-03 19:39:35+00:00,RT @bentboolean: How much of Amazon's traffic ...,positive,How much Amazons traffic served Fastly Help u ...
830153,2020-07-09 14:39:14+00:00,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,positive,AMD Ryzen desktop CPUs looking great track launch


In [None]:
# Encode the sentiment labels as integers: positive -> 0, neutral -> 1, negative -> 2.
label_codes = {'positive': 0, 'neutral': 1, 'negative': 2}
for label, code in label_codes.items():
    eng = eng.replace({'sentiment': label}, {'sentiment': code})

eng.head()

Unnamed: 0_level_0,created_at,text,sentiment,text2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
77522,2020-04-15 01:03:46+00:00,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",0,Yo Enter WIN Monarch Tokens US Stock Market Cr...
661634,2020-06-25 06:20:06+00:00,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,2,surcharge fuel removed The surcharge Rs impose...
413231,2020-06-04 15:41:45+00:00,Net issuance increases to fund fiscal programs...,0,Net issuance increase fund fiscal program yiel...
760262,2020-07-03 19:39:35+00:00,RT @bentboolean: How much of Amazon's traffic ...,0,How much Amazons traffic served Fastly Help u ...
830153,2020-07-09 14:39:14+00:00,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,0,AMD Ryzen desktop CPUs looking great track launch


In [None]:
# Add a placeholder column `dic_s` filled with empty strings; a later cell
# overwrites it with the labels returned by get_sentiment.
eng['dic_s'] = ""
eng.head()

Unnamed: 0_level_0,created_at,text,sentiment,text2,dic_s
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
77522,2020-04-15 01:03:46+00:00,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",0,Yo Enter WIN Monarch Tokens US Stock Market Cr...,
661634,2020-06-25 06:20:06+00:00,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,2,surcharge fuel removed The surcharge Rs impose...,
413231,2020-06-04 15:41:45+00:00,Net issuance increases to fund fiscal programs...,0,Net issuance increase fund fiscal program yiel...,
760262,2020-07-03 19:39:35+00:00,RT @bentboolean: How much of Amazon's traffic ...,0,How much Amazons traffic served Fastly Help u ...,
830153,2020-07-09 14:39:14+00:00,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,0,AMD Ryzen desktop CPUs looking great track launch,


* https://techblog-history-younghunjo1.tistory.com/111

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Drop literal '<br />' markers, then turn every non-letter character into a space.
non_letters = re.compile("[^a-zA-Z]")
eng['text2'] = eng['text2'].str.replace('<br />', ' ')
eng['text2'] = eng['text2'].apply(lambda tweet: non_letters.sub(' ', tweet))

# Score one sample tweet (id 77522) to inspect the shape of VADER's output.
sample_text = eng['text2'][77522]
print(sample_text)

senti_analyzer = SentimentIntensityAnalyzer()
senti_scores = senti_analyzer.polarity_scores(sample_text)
print(senti_scores)

Yo Enter WIN Monarch Tokens US Stock Market Crashes LEARN PT WATCH video
{'neg': 0.0, 'neu': 0.726, 'pos': 0.274, 'compound': 0.6739}


In [None]:
# Compound score of the sample tweet scored in the previous cell.
senti_scores['compound']

0.6739

In [None]:
# Integer-encoded ground-truth labels as an array (0 positive, 1 neutral, 2 negative).
eng['sentiment'].values

array([0, 2, 0, ..., 1, 0, 2])

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.metrics import recall_score, f1_score

def get_sentiment(review, threshold=0.2, analyzer=None):
    """Classify `review` with VADER's compound score.

    Parameters
    ----------
    review : str
        Text to score.
    threshold : float, default 0.2
        Half-width of the neutral band around zero.
    analyzer : object, optional
        Anything with a ``polarity_scores(text)`` method returning a mapping
        with a ``'compound'`` entry.  When omitted, one
        ``SentimentIntensityAnalyzer`` is created on first use and reused for
        all later calls instead of being rebuilt for every review.

    Returns
    -------
    int
        0 (positive) if compound > threshold, 2 (negative) if
        compound < -threshold, otherwise 1 (neutral).
    """
    if analyzer is None:
        # Lazily build the default analyzer once and cache it on the function.
        analyzer = getattr(get_sentiment, '_analyzer', None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            get_sentiment._analyzer = analyzer

    compound_score = analyzer.polarity_scores(review)['compound']
    if compound_score > threshold:
        return 0
    if compound_score < -threshold:
        return 2
    return 1

# Label every cleaned tweet with the VADER rule and compare with the ground truth.
eng['dic_s'] = eng['text2'].apply(get_sentiment)

y_target, y_pred = eng['sentiment'].values, eng['dic_s'].values

print('### VADER 예측 성능 평가')
print(confusion_matrix(y_target, y_pred))
print('정확도 : ', accuracy_score(y_target, y_pred))
# Precision, recall and F1 are not printed for this three-class comparison:
# precision_score / recall_score / f1_score default to average='binary' and
# would need an explicit `average=` argument here.

### VADER 예측 성능 평가
[[368 142  18]
 [ 79 331  14]
 [ 71 122 155]]
정확도 :  0.6569230769230769


* threshold 0.05
[[396 108  24]
[ 92 307  25]
[ 81  78 189]]
정확도 :  0.6861538461538461
* threshold 0.1
[[390 115  23]
 [ 88 312  24]
 [ 78  88 182]]
정확도 :  0.68
* threshold 0.2
[[368 142  18]
 [ 79 331  14]
 [ 71 122 155]]
정확도 :  0.6569230769230769

In [None]:
# Preview the frame with the predicted labels.
# NOTE(review): the saved output of this cell already shows the `0/2` column,
# which is only created in a later cell, so the notebook appears to have been
# executed out of order. Re-run top to bottom on a fresh kernel to confirm.
eng.head()

Unnamed: 0_level_0,created_at,text,sentiment,text2,dic_s,0/2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
77522,2020-04-15 01:03:46+00:00,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",0,Yo Enter WIN Monarch Tokens US Stock Market Cr...,0,0
661634,2020-06-25 06:20:06+00:00,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,2,surcharge fuel removed The surcharge Rs impose...,0,2
413231,2020-06-04 15:41:45+00:00,Net issuance increases to fund fiscal programs...,0,Net issuance increase fund fiscal program yiel...,2,0
760262,2020-07-03 19:39:35+00:00,RT @bentboolean: How much of Amazon's traffic ...,0,How much Amazons traffic served Fastly Help u ...,0,0
830153,2020-07-09 14:39:14+00:00,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,0,AMD Ryzen desktop CPUs looking great track launch,0,0


* compound 값이 threshold와 정확히 같을 때만 neutral로 지정했기 때문에 neutral이 나오기 어려움
* neutral로 분류할 compound 값의 범위(예: -threshold 이상 +threshold 이하)를 기준으로 정해야 할 듯

In [None]:
# Distribution of the predicted labels stored in `dic_s`.
eng['dic_s'].value_counts()

0    569
1    493
2    238
Name: dic_s, dtype: int64

In [None]:
# Start the binary target column `0/2` as a copy of the integer sentiment
# labels; the neutral label is merged into 0 in a following cell.
eng['0/2'] = eng['sentiment']
eng.head()

Unnamed: 0_level_0,created_at,text,sentiment,text2,dic_s,0/2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
77522,2020-04-15 01:03:46+00:00,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",0,Yo Enter WIN Monarch Tokens US Stock Market Cr...,0,0
661634,2020-06-25 06:20:06+00:00,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,2,surcharge fuel removed The surcharge Rs impose...,0,2
413231,2020-06-04 15:41:45+00:00,Net issuance increases to fund fiscal programs...,0,Net issuance increase fund fiscal program yiel...,2,0
760262,2020-07-03 19:39:35+00:00,RT @bentboolean: How much of Amazon's traffic ...,0,How much Amazons traffic served Fastly Help u ...,0,0
830153,2020-07-09 14:39:14+00:00,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,0,AMD Ryzen desktop CPUs looking great track launch,0,0


In [None]:
# In column `0/2` only, map the neutral label (1) to 0.
eng = eng.replace({'0/2' : {1:0}})

In [None]:
# Class counts of the binary target column.
eng['0/2'].value_counts()

0    952
2    348
Name: 0/2, dtype: int64

In [None]:
# Binary evaluation (labels 0 and 2).  The target column `0/2` has neutral (1)
# merged into 0, so the predictions are collapsed the same way; otherwise the
# two sides have different label sets and the binary metrics below raise.
y_target = eng['0/2']
y_pred = eng['dic_s'].replace({1: 0})

# Neither label is 1 (sklearn's default pos_label), so the class treated as
# "positive" by precision/recall/F1 must be named explicitly: 2 = negative sentiment.
POS_LABEL = 2

print(confusion_matrix(y_target, y_pred))
print('정확도 : ', accuracy_score(y_target, y_pred))
print('정밀도 : ', precision_score(y_target, y_pred, pos_label=POS_LABEL))
print('재현율 : ', recall_score(y_target, y_pred, pos_label=POS_LABEL))
print('F1 score : ', f1_score(y_target, y_pred, pos_label=POS_LABEL))