In [10]:
from Twitter_keeper import PullTweetsData
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import re
import emoji
import numpy as np
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import thai_stopwords
import pandas as pd

class FindTopWord(PullTweetsData):
    """Count how often each "/"-separated token occurs across a set of texts."""

    def tokenize(self, d):
        """Split ``d`` on "/" and return the non-empty pieces as a list."""
        return [piece for piece in d.split("/") if piece]

    def prepared_Text(self, text_list):
        """Return ``self.preprocessText(text)`` for every item of ``text_list``.

        ``preprocessText`` is not defined in this class; presumably it is
        inherited from ``PullTweetsData`` -- confirm its return format matches
        what ``tokenize`` expects ("/"-separated tokens).
        """
        return [self.preprocessText(text) for text in text_list]

    def MostWordFinder(self, tweets_list, top_n=10):
        """Return the most frequent tokens in ``tweets_list``.

        Args:
            tweets_list: iterable of strings; each is tokenized with
                ``self.tokenize``.
            top_n: number of rows to keep after sorting by count (descending).
                Pass ``None`` to get every token.

        Returns:
            DataFrame with columns ``word`` and ``count``, sorted by ``count``
            from highest to lowest. Empty (same columns) when ``tweets_list``
            is empty.
        """
        tweets_list = list(tweets_list)
        if not tweets_list:
            # CountVectorizer raises on an empty corpus; report "no words" instead.
            return pd.DataFrame(columns=['word', 'count'])

        vectorizer = CountVectorizer(tokenizer=self.tokenize)
        transformed_data = vectorizer.fit_transform(tweets_list)
        keyword_df1 = pd.DataFrame({
            'word': vectorizer.get_feature_names_out(),
            # Column sums of the document-term matrix = total count per token.
            'count': np.ravel(transformed_data.sum(axis=0)),
        })
        # The sorted/truncated frame used to be computed and then discarded,
        # so callers received the unsorted full table.
        keyword_df1 = keyword_df1.sort_values(by=['count'], ascending=False)
        if top_n is not None:
            keyword_df1 = keyword_df1.head(top_n)
        return keyword_df1



class SentimentAnalyze(PullTweetsData):
    """Bag-of-words Multinomial Naive Bayes sentiment classifier.

    Training data is read from ``general-amy.csv``; the code below uses its
    ``text`` and ``sentiment`` columns.
    """

    # Characters stripped from the text before tokenization.
    _DROPPED_CHARS = ("?", ".", ";", ":", "!", '"', "ๆ", "ฯ")

    def __init__(self):
        # NOTE(review): PullTweetsData.__init__ is not called here -- confirm
        # the inherited remove* helpers need no instance state.
        self.__df_train = pd.read_csv("general-amy.csv")
        self.__vectorizer = CountVectorizer()
        self.__model = MultinomialNB()
        # Set once the training text has been preprocessed (see run_prep_train).
        self.__train_text_ready = False

    def preprocess_train_text(self, text):
        """Clean ``text`` and return its tokens joined by single spaces.

        Applies ``removeLink``, ``removeEmoji`` and ``removeSpecialChar``
        (not defined in this class; presumably inherited from
        ``PullTweetsData``), drops the characters in ``_DROPPED_CHARS`` and
        tokenizes with pythainlp's ``newmm`` engine.
        """
        text = self.removeLink(text)
        text = self.removeEmoji(text)
        text = self.removeSpecialChar(text)
        final = "".join(u for u in text if u not in self._DROPPED_CHARS)
        return " ".join(word_tokenize(final, engine="newmm"))

    def run_prep_train(self):
        """Preprocess the training ``text`` column in place (only once).

        The column is overwritten, so without the guard every call to
        ``training_model`` would tokenize the already-tokenized text again.
        """
        if self.__train_text_ready:
            return
        self.__df_train['text'] = self.__df_train['text'].apply(self.preprocess_train_text)
        self.__train_text_ready = True

    def split_training(self, test_size=0.2, random_state=None):
        """Vectorize the training text and split it into train/test sets.

        The vectorizer is fitted on the whole ``text`` column before the split.

        Args:
            test_size: forwarded to ``train_test_split`` (default 0.2).
            random_state: forwarded to ``train_test_split``; pass an int to
                make the split reproducible. ``None`` keeps a random split.
        """
        self.X = self.__vectorizer.fit_transform(self.__df_train['text'])
        self.y = self.__df_train['sentiment']
        # Split the data into training and testing sets
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=random_state
        )

    def training_model(self):
        """Preprocess the training data, split it and fit the classifier."""
        self.run_prep_train()
        self.split_training()
        # Train the model
        self.__model.fit(self.X_train, self.y_train)

    def evaluating_model(self):
        """Return the accuracy of the fitted model on the held-out test split."""
        y_pred = self.__model.predict(self.X_test)
        return accuracy_score(self.y_test, y_pred)

    def sentiment_analyzer(self, text):
        """Predict the sentiment label for a single raw ``text``.

        ``training_model`` must have been called first so that the vectorizer
        and the model are fitted.
        """
        # Only the preprocessed text is vectorized; the previous extra
        # transform of the raw text produced a value that was never used.
        new_bag_of_word = self.__vectorizer.transform([self.preprocess_train_text(text)])
        new_pred = self.__model.predict(new_bag_of_word)
        return new_pred[0]

class main():
    """Entry point combining tweet loading, top-word counting and sentiment prediction."""

    def __init__(self):
        self.find_top_word = FindTopWord()
        self.sentiment_analyze = SentimentAnalyze()
        self.pull_tweets = PullTweetsData()
        # The sentiment model is trained lazily, on the first prediction request.
        self._sentiment_trained = False
        # Accuracy returned by evaluating_model() after the latest training run.
        self.sentiment_accuracy = None

    def load_sample_tweets(self,author="",keyword="",hashtag="",location="",text="",fromTime="",toTime=""):
        """Connect to the ``tweets_JP`` collection and return ``find_multi`` for the given filters."""
        self.pull_tweets.connectToDB("twitter_keeper","tweets_JP")
        return self.pull_tweets.find_multi(author,keyword,hashtag,location,text,fromTime,toTime)

    def tweets_find_top_word(self,author="",keyword="",hashtag="",location="",text="",fromTime="",toTime=""):
        """Load the matching tweets, preprocess them and return the word-count table."""
        # NOTE(review): prepared_Text is called on a PullTweetsData instance, but the
        # only definition visible in this notebook is FindTopWord.prepared_Text --
        # confirm PullTweetsData provides one.
        tweets_list = self.pull_tweets.prepared_Text(self.load_sample_tweets(author,keyword,hashtag,location,text,fromTime,toTime))
        return self.find_top_word.MostWordFinder(tweets_list)

    def tweets_sentiment_analyzer(self,author="",keyword="",hashtag="",location="",text="",fromTime="",toTime="",retrain=False):
        """Predict a sentiment label for every tweet matching the filters.

        The model is trained on the first call only; later calls reuse it.
        Pass ``retrain=True`` to train again.

        Returns:
            DataFrame with columns ``text`` and ``sentiment``, one row per
            tweet, indexed 0..n-1 (empty when nothing matches).
        """
        if retrain or not self._sentiment_trained:
            self.sentiment_analyze.training_model()
            self.sentiment_accuracy = self.sentiment_analyze.evaluating_model()
            self._sentiment_trained = True

        tweets_list = self.load_sample_tweets(author,keyword,hashtag,location,text,fromTime,toTime)
        # Collect rows first and build the frame once, rather than growing it
        # with pd.concat on every iteration.
        records = [
            {
                'text': tweet['text'],
                'sentiment': self.sentiment_analyze.sentiment_analyzer(tweet['text']),
            }
            for tweet in tqdm(tweets_list)
        ]
        return pd.DataFrame(records, columns=['text', 'sentiment'])

# Build the facade; main.__init__ constructs FindTopWord, SentimentAnalyze
# (which reads general-amy.csv) and PullTweetsData.
TweetsCounter = main()

In [14]:
import tweepy
from dotenv import load_dotenv
import os
import pandas as pd
from Twitter_keeper import PullTweetsData
from threading import Thread
from tqdm import tqdm
from IPython.display import display,HTML

# Authenticate to Twitter
# Load variables from a local .env file (if present) so os.getenv can see them;
# load_dotenv was imported for this but never called.
load_dotenv()
api_key = os.getenv('API_KEY')
api_key_secret = os.getenv('API_KEY_SECRET')
access_token = os.getenv('ACCESS_TOKEN')
access_token_secret = os.getenv('ACCESS_TOKEN_SECRET')

Puller = PullTweetsData()
Puller.getAccessToAPI(api_key, api_key_secret)
Puller.setUserAuthentication(access_token, access_token_secret)
Puller.getTwitterAPI()
Puller.connectToDB("twitter_keeper", "tweets_JP")

# Place ids for get_place_trends (presumably WOEIDs -- confirm).
# Only TH_Bangkok is used in this cell.
TH_Bangkok = 1225448
JP_Tokyo = 1118370


# Keep the ten trends with the highest reported tweet volume; trends without
# a tweet_volume value are skipped.
trends = Puller._PullTweetsData__api.get_place_trends(TH_Bangkok)
top50 = trends[0]['trends']
new_list = [d for d in top50 if d.get('tweet_volume') is not None]
sorted_list = sorted(new_list, key=lambda x: x['tweet_volume'], reverse=True)
top10 = sorted_list[0:10]
names = [d['name'] for d in top10]
print(names)
# A bare expression in the middle of a cell renders nothing; display it explicitly.
display(pd.DataFrame(top10))

# Pull one tweet per trending name, each in its own thread.
pull_threads = []
for i in tqdm(names):
    print(i)
    t1 = Thread(target=Puller.pullTweets, args=(i, 1))
    t1.start()
    pull_threads.append(t1)
# Wait for every pull to finish so code that runs next and reads the database
# does not race with the pulls still in flight.
for t1 in pull_threads:
    t1.join()
# Example of the list produced on an earlier run:
# names = ['#APOMeetandGive2023', '#Nnattawin', '#GenshinImpact', '#FreenBeckyXHOKAClifton9', '#สบู่ทับทิมนัมเบอร์วันXอิงล็อต', '#jhopeINTHEBOX', '#WelcomeJINYOUNGtoThailand', '#JDENTXZeeNunew', '#DestinyClinicYinWar', '#beckysangels']

def dfTitle(name):
    """Build a one-row frame used as a section header in the combined result.

    The row holds ``name`` in the ``text`` column and the string "None" in
    the ``sentiment`` column, at index 0.
    """
    header_row = {"text": [name], "sentiment": ["None"]}
    return pd.DataFrame(data=header_row, index=[0])

TweetsCounter = main()
# Gather, per trending name, a title row followed by that name's predictions,
# then concatenate once instead of re-copying the growing frame on every pass.
# The leading empty frame keeps the columns defined even when `names` is empty.
senti_frames = [pd.DataFrame({'text':[],'sentiment':[]})]
for i in tqdm(names):
    df = TweetsCounter.tweets_sentiment_analyzer(text="",keyword=i)
    senti_frames.append(dfTitle(i))
    senti_frames.append(df)
resultSenti = pd.concat(senti_frames)
resultSenti



['#APOMeetandGive2023', '#Nnattawin', '#GenshinImpact', '#FreenBeckyXHOKAClifton9', '#สบู่ทับทิมนัมเบอร์วันXอิงล็อต', '#jhopeINTHEBOX', '#WelcomeJINYOUNGtoThailand', '#DestinyClinicYinWar', '#JDENTXZeeNunew', '#beckysangels']


100%|██████████| 10/10 [00:00<00:00, 802.31it/s]


#APOMeetandGive2023
#Nnattawin
#GenshinImpact
#FreenBeckyXHOKAClifton9
#สบู่ทับทิมนัมเบอร์วันXอิงล็อต
#jhopeINTHEBOX
#WelcomeJINYOUNGtoThailand
#DestinyClinicYinWar
#JDENTXZeeNunew
#beckysangels


100%|██████████| 1/1 [00:00<00:00, 984.35it/s]
0it [00:00, ?it/s]/10 [00:02<00:26,  2.96s/it]
100%|██████████| 1/1 [00:00<00:00, 1550.57it/s]
0it [00:00, ?it/s]/10 [00:09<00:22,  3.27s/it]
100%|██████████| 1/1 [00:00<00:00, 1110.78it/s]
100%|██████████| 1/1 [00:00<00:00, 1283.84it/s]
100%|██████████| 1/1 [00:00<00:00, 1024.75it/s]
100%|██████████| 1/1 [00:00<00:00, 1293.34it/s]
100%|██████████| 1/1 [00:00<00:00, 1420.35it/s]
100%|██████████| 1/1 [00:00<00:00, 1335.34it/s]
100%|██████████| 10/10 [00:51<00:00,  5.14s/it]


Unnamed: 0,text,sentiment
0,#APOMeetandGive2023,
0,วันนี้ไม่ได้เล่นเเท็กเลย โปน่ารักมากก ขอบคุณอะ...,neg
0,#Nnattawin,
0,#GenshinImpact,
0,venti in pink:) \n\n#venti #GenshinImpact #原神 ...,pos
0,#FreenBeckyXHOKAClifton9,
0,#สบู่ทับทิมนัมเบอร์วันXอิงล็อต,
0,กรี้ดๆๆๆอยู่ไม่ได้แล้วบีบแก้มกันด้วยอ่ะเขินเวอ...,pos
0,#jhopeINTHEBOX,
0,&lt;#jhopeINTHEBOX&gt; documentary VOD is now ...,neg


In [None]:
from IPython.display import display,HTML

# Predict sentiment for the stored tweets matching one hashtag and show the table.
df = TweetsCounter.tweets_sentiment_analyzer(keyword="#ใส่นัวแฟมิลี่", text="")
df

In [4]:
# Scratch check of how pd.concat stacks frames that share the same columns.
df1 = pd.DataFrame(
    data=dict(text=["A0", "A1", "A2", "A3"], sentiment=["B0", "B1", "B2", "B3"]),
    index=[0, 1, 2, 3],
)

df2 = pd.DataFrame(
    data=dict(text=["A4", "A5", "A6", "A7"], sentiment=["B4", "B5", "B6", "B7"]),
    index=[4, 5, 6, 7],
)

df3 = pd.DataFrame(
    data=dict(text=["A8", "A9", "A10", "A11"], sentiment=["B8", "B9", "B10", "B11"]),
    index=[8, 9, 10, 11],
)

frames = [df1, df2, df3]
# Start from an empty frame and append only the first two frames, one at a time.
result = pd.DataFrame({'text':[],'sentiment':[]})
for part in frames[:2]:
    result = pd.concat([result, part])
result

Unnamed: 0,text,sentiment
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4
5,A5,B5
6,A6,B6
7,A7,B7
