In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import numpy.ma as ma

def call_dataset(game_name, data_dir='datasets'):
    """Load one game's tweet CSV and return it cleaned and sorted by time.

    The file read is ``{data_dir}/{game_name}_tweets_datatset.csv``.

    Processing steps, in order:
      1. Drop tweets whose ``text`` contains one of the auto-generated
         YouTube share phrases or the word ``Giveaway`` (case-sensitive).
         Rows with a missing ``text`` are dropped as well.
      2. If the last header was read as ``'sentiment\\r'`` (CRLF file parsed
         with ``lineterminator='\\n'``), strip the ``\\r`` characters and
         store the result as ``sentiment``.
      3. Drop the first column of the file.
      4. Parse ``sentiment scores`` from a bracketed, space-separated string
         into a NumPy array, and derive ``sentiment score`` (arg-max index)
         and ``sentiment confidence`` (value at that index).
      5. Convert ``tweet id`` to ``str`` and ``created at`` to ``datetime``;
         add a ``date`` column holding ``datetime.date`` objects.
      6. Sort by ``created at`` and reset the index.

    Parameters
    ----------
    game_name : str
        Prefix of the CSV file name, e.g. ``'xy'``.
    data_dir : str, default 'datasets'
        Directory that contains the CSV files.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, ordered from earliest to latest tweet.

    Raises
    ------
    FileNotFoundError
        If the CSV file does not exist.
    KeyError
        If one of the required columns (``text``, ``sentiment scores``,
        ``tweet id``, ``created at``) is missing.
    ValueError
        If a ``created at`` value does not match ``%Y-%m-%d %H:%M:%S``
        (after removing an optional ``+00:00`` suffix).
    """
    # 'datatset' (sic) is how the file names are spelled in the path below;
    # it is a runtime string, so it is kept as-is.
    data = pd.read_csv(f'{data_dir}/{game_name}_tweets_datatset.csv', lineterminator='\n')

    # One combined scan instead of five. The phrases contain no regex
    # metacharacters, so joining them with '|' is a plain alternation.
    excluded_phrases = (
        'I liked a YouTube',
        'I liked a @YouTube video',
        'I added a video to a @YouTube',
        'I added a video to a YouTube',
        'Giveaway',
    )
    # na=True marks rows with missing text as excluded, which matches the
    # previous `contains(...) == False` filter (NaN == False is False).
    is_excluded = data['text'].str.contains('|'.join(excluded_phrases), na=True).astype(bool)
    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the frame returned by read_csv.
    data = data[~is_excluded].copy()

    # removing \r in 'sentiment'
    if 'sentiment\r' in data.columns:
        data['sentiment'] = data['sentiment\r'].str.replace('\r', '', regex=False)
        data = data.drop(columns=['sentiment\r'])

    # removing first column
    data = data.drop(columns=data.columns[0])

    # changing 'sentiment scores' from str to ndarray
    # (x[1:-1] strips the first and last character, i.e. the surrounding brackets)
    data['sentiment scores'] = data['sentiment scores'].apply(
        lambda raw: np.fromstring(raw[1:-1], sep=' ')
    )

    # changing 'tweet id' to str
    # NOTE(review): if the ids are parsed as float64, values above 2**53 have
    # already lost precision before this point — confirm against the CSV.
    data['tweet id'] = data['tweet id'].apply(lambda tweet_id: str(int(tweet_id)))

    # adding sentiment scores: index of the largest score and its value
    top_class = [scores.argmax() for scores in data['sentiment scores']]
    data['sentiment score'] = top_class
    data['sentiment confidence'] = [
        scores[idx] for scores, idx in zip(data['sentiment scores'], top_class)
    ]

    # changing 'created at' date from str to datetime
    created = data['created at'].str.replace(r'\+00:00$', '', regex=True)
    data['created at'] = pd.to_datetime(created, format='%Y-%m-%d %H:%M:%S')
    data['date'] = data['created at'].dt.date

    # sorting from earliest to latest
    return data.sort_values(by='created at').reset_index(drop=True)

In [2]:
# Load and clean the tweet dataset of every game; each name below is bound to
# the DataFrame returned by call_dataset for the matching file prefix.
# The calls are kept as separate statements so that frames loaded before a
# failing call remain available in the kernel.
xy = call_dataset('xy')
oras = call_dataset('oras')
sunmoon = call_dataset('sunmoon')
ultrasm = call_dataset('ultrasm')
letsgo = call_dataset('letsgo')
swsh = call_dataset('swsh')
swshdlc = call_dataset('swshdlc')
bdsp = call_dataset('bdsp')
arceus = call_dataset('arceus')
sv = call_dataset('sv')

In [3]:
def sentiment_figure(game_dataset):
    """Count tweets per (sentiment, date) pair.

    Parameters
    ----------
    game_dataset : pandas.DataFrame
        Frame with at least the columns ``sentiment`` and ``date``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (sentiment, date) combination with the columns
        ``sentiment``, ``date`` and ``size`` (number of rows in that group).
    """
    by_sentiment_and_day = game_dataset.groupby(['sentiment', 'date'], as_index=False)
    return by_sentiment_and_day.size()

In [4]:
# Stack the per-game frames into one frame. `sv` is loaded above but is not
# part of this list — NOTE(review): confirm that the exclusion is intentional.
# Row labels of the individual frames are kept, so index values repeat.
frames_list = [
    xy,
    oras,
    sunmoon,
    ultrasm,
    letsgo,
    swsh,
    swshdlc,
    bdsp,
    arceus,
]
all_games = pd.concat(objs=frames_list)

In [5]:
# Print the column names, non-null counts, dtypes and memory usage of the combined frame.
all_games.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1230286 entries, 0 to 181976
Data columns (total 9 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   tweet id              1230286 non-null  object        
 1   text                  1230286 non-null  object        
 2   created at            1230286 non-null  datetime64[ns]
 3   preprocessed tweets   1230286 non-null  object        
 4   sentiment scores      1230286 non-null  object        
 5   sentiment             1230286 non-null  object        
 6   date                  1230286 non-null  object        
 7   sentiment score       1230286 non-null  int64         
 8   sentiment confidence  1230286 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(6)
memory usage: 93.9+ MB


In [6]:
import numpy.ma as ma

# Daily tweet counts per sentiment label across the combined frame.
all_s = sentiment_figure(all_games)

# Keep only the rows counted for the "Positive" label.
only_p = all_s.loc[all_s['sentiment'] == "Positive"]
x, y = only_p['date'], only_p['size']
# Plotting is currently switched off:
# plt.plot(x,y)
# plt.show()
only_p

Unnamed: 0,sentiment,date,size
3950,Positive,2013-01-05,4
3951,Positive,2013-01-06,4
3952,Positive,2013-01-07,4
3953,Positive,2013-01-08,6175
3954,Positive,2013-01-09,1786
...,...,...,...
5936,Positive,2022-01-24,1848
5937,Positive,2022-01-25,2032
5938,Positive,2022-01-26,2431
5939,Positive,2022-01-27,4946


In [7]:
# Calendar of consecutive days spanning the earliest to the latest date in all_s.
min_date, max_date = all_s['date'].min(), all_s['date'].max()
date_list = pd.date_range(min_date, max_date)


In [13]:
# Print the cumulative row count after each game's frame is added.
total = 0
for variable in (xy, oras, sunmoon, ultrasm, letsgo, swsh, swshdlc, bdsp, arceus):
    total += len(variable)
    print(total)

198330
265334
585764
617161
683919
886446
971196
1048309
1230286


In [16]:
# Print the number of rows of every loaded frame, including `sv`.
for variable in (xy, oras, sunmoon, ultrasm, letsgo, swsh, swshdlc, bdsp, arceus, sv):
    print(len(variable))

198330
67004
320430
31397
66758
202527
84750
77113
181977
102457


In [19]:
# Split the daily counts into one frame per sentiment label.
only_positive = all_s.loc[all_s['sentiment'] == "Positive"]
only_negative = all_s.loc[all_s['sentiment'] == "Negative"]
only_neutral = all_s.loc[all_s['sentiment'] == "Neutral"]

In [4]:
from prophet.plot import plot_plotly, plot_components_plotly
from prophet import Prophet