In [18]:
# !pip install tweepy==3.10.0

In [None]:
# https://realpython.com/twitter-bot-python-tweepy/

In [19]:
import copy
import os
import re
import time
from datetime import datetime, date, timedelta
from string import punctuation

import numpy as np
import pandas as pd
import tweepy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn import metrics
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [20]:
# Read the training CSV, supplying the column names explicitly via `names`,
# and discard any row that contains a missing value.
labels = ['polarity', 'id', 'date', 'query', 'user', 'text']
data = pd.read_csv(
    "../Data/training.1600000.processed.noemoticon.csv",
    names=labels,
    encoding='latin-1',
).dropna()

In [21]:
# Keep only the tweet text and its polarity, then recode polarity 4 -> 1 so the
# labels are 0 (negative tweet) and 1 (positive tweet).
# .copy() makes `data` an independent frame rather than a slice of the original.
data = data[['text', 'polarity']].copy()
# Assign the result back explicitly: calling replace(..., inplace=True) on the
# attribute-accessed column is a chained assignment, which is not guaranteed to
# update `data` itself (and does not under pandas Copy-on-Write).
data['polarity'] = data['polarity'].replace(4, 1)

In [22]:
# Build the stop-word list from NLTK's English stop words.
stops = stopwords.words("english")

# Also register apostrophe-free spellings (e.g. "dont" for "don't") as stop words.
no_quotes = [re.sub(r'\'', '', word) for word in stops if "'" in word]
stops.extend(no_quotes)


In [23]:
# Strip tweet-specific noise: HTML entities, @mentions, URLs, hashtags, etc.
def clean_string(string):
    """Normalise the raw text of a tweet for vectorisation.

    Steps, in order: remove HTML entities (``&amp;`` style), remove
    ``@user`` mentions, remove http/https/ftp URLs, lower-case, remove
    ``#hashtags``, collapse any run of a repeated character to two
    occurrences, replace every non-letter with a space, drop words of one or
    two characters, and collapse runs of whitespace into a single space.

    Args:
        string: Raw tweet text.

    Returns:
        The cleaned, lower-case text. Leading/trailing single spaces are not
        stripped.
    """
    # Remove HTML entities such as "&amp;"
    tmp = re.sub(r'\&\w*;', '', string)
    # Remove @user mentions
    tmp = re.sub(r'@(\w+)', '', tmp)
    # Remove links. Match up to the next whitespace so that URLs containing
    # '-', '_', '?', '=', '%', ... are removed completely instead of leaving
    # their tail behind as stray words.
    tmp = re.sub(r'(http|https|ftp)://\S+', '', tmp)
    # Lower-case
    tmp = tmp.lower()
    # Remove hashtags
    tmp = re.sub(r'#(\w+)', '', tmp)
    # Collapse runs of a repeated character to exactly two ("loooove" -> "loove")
    tmp = re.sub(r'(.)\1{1,}', r'\1\1', tmp)
    # Replace anything that is not a letter with a space
    tmp = re.sub("[^a-zA-Z]", " ", tmp)
    # Remove words of one or two characters
    tmp = re.sub(r'\b\w{1,2}\b', '', tmp)
    # Collapse runs of whitespace into a single space
    tmp = re.sub(r'\s\s+', ' ', tmp)
    return tmp

In [24]:
# # 詞幹提取(stemming):將單詞還原為字根 EX:love、loving、loved都視為love
# stemmer = PorterStemmer()
# print(stemmer.stem('working'))

In [25]:
# Remove punctuation and stop words, then stem every remaining word.
def preprocess(string):
    """Strip punctuation, drop stop words and stem the remaining words.

    Args:
        string: Text to process (typically the output of ``clean_string``).

    Returns:
        The stemmed, lower-case words joined by single spaces. Empty tokens
        produced by consecutive or leading/trailing whitespace are dropped.
    """
    stemmer = PorterStemmer()
    # Remove punctuation characters
    removed_punc = ''.join(char for char in string if char not in punctuation)

    cleaned = []
    # split() with no argument skips empty tokens caused by repeated spaces
    for word in removed_punc.split():
        # Lower-case BEFORE the stop-word lookup so that capitalised stop
        # words ("The", "And", ...) are filtered out as well.
        word = word.lower()
        if word not in stops:
            cleaned.append(stemmer.stem(word))
    return ' '.join(cleaned)

In [26]:
# Shuffle all rows (frac=1 samples 100% of the frame) with a fixed seed so the
# training slice taken later is the same on every Restart & Run All, then
# clean and stem every tweet.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data['text'] = data['text'].apply(clean_string).apply(preprocess)

In [None]:
# Train a voting-classifier ensemble.
# Tweets are vectorised with TF-IDF over 1- to 3-grams, keeping at most
# 20,000 features; only the first `train_size` rows are used for training.
train_size = 10000

# Initialise the TF-IDF vectoriser
tf = TfidfVectorizer(max_features=20000,
                     ngram_range=(1, 3),
                     stop_words='english')
# Learn the vocabulary on the whole corpus and vectorise it in a single pass
transformed = tf.fit_transform(data.text)
# Convert the training slice of the sparse matrix to a dense numpy array
x_data = transformed[:train_size].toarray()
y_data = data.polarity[:train_size].values

# NOTE(review): the first estimator is labelled 'LR' but is a decision tree;
# the label is kept as-is in case it is referenced elsewhere.
# random_state pins the tree so repeated runs give the same model.
voting = VotingClassifier([('LR', DecisionTreeClassifier(random_state=42)),
                           ('NB', MultinomialNB()),
                           ('Ridge', RidgeClassifier())])

voting.fit(x_data, y_data)



In [None]:
# Twitter API credentials are read from environment variables so that no
# secret is stored in the notebook. Export TWITTER_API_KEY,
# TWITTER_API_KEY_SECRET, TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_TOKEN_SECRET
# before running this cell; a missing variable raises KeyError.
# NOTE: any credentials that were previously hard-coded here should be treated
# as leaked and regenerated.
api_key = os.environ['TWITTER_API_KEY']
api_key_secret = os.environ['TWITTER_API_KEY_SECRET']
access_token = os.environ['TWITTER_ACCESS_TOKEN']
access_token_secret = os.environ['TWITTER_ACCESS_TOKEN_SECRET']

auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit=True asks tweepy to wait when the rate limit is reached
# rather than failing the request.
api = tweepy.API(auth, wait_on_rate_limit = True)

In [None]:
# Poll Twitter in an endless loop for English tweets mentioning "Biden".
# Each tweet whose text differs from the previously seen one is printed, run
# through clean_string() and preprocess(), vectorised with the fitted TF-IDF
# vectoriser, and classified by the trained voting model.
# Stop the loop by interrupting the kernel.

prev_text = ""
while True:
    # NOTE(review): `rpp` looks like a legacy search parameter; it is kept
    # unchanged here -- confirm it is still accepted by the installed tweepy.
    for tweet in api.search(q="Biden", lang="en", rpp=10, count = 1):
        text = f"{tweet.text}"
        if text == prev_text:
            # Same tweet as last time: nothing new to classify
            continue
        prev_text = text
        print("Text:\n", text)
        cleaned = preprocess(clean_string(text))
        test_data = tf.transform([cleaned]).toarray()
        print("\nPrediction:", voting.predict(test_data)[0])
        print("-------------------------")
    # Pause between polls unconditionally. Previously the sleep only ran after
    # a new tweet, so a repeated tweet caused back-to-back API requests.
    time.sleep(5)