In [2]:
import ast
import os
from collections import Counter
from datetime import datetime, timezone

import pandas as pd
from sklearn.preprocessing import RobustScaler

In [3]:
def get_time_gap_in_minutes(time_str):
    """Return how many minutes have elapsed between ``time_str`` and now.

    Parameters
    ----------
    time_str : str
        ISO-8601 timestamp. A trailing ``'Z'`` (UTC designator) and explicit
        offsets such as ``'+09:00'`` are both accepted. A timestamp without
        any zone information is assumed to be UTC.

    Returns
    -------
    float
        Elapsed minutes, rounded to two decimals. Negative if ``time_str``
        lies in the future.

    Raises
    ------
    ValueError
        If ``time_str`` is not a valid ISO-8601 timestamp.
    """
    # datetime.fromisoformat() rejects the 'Z' suffix before Python 3.11, so
    # rewrite it as an explicit UTC offset instead of just discarding it.
    if time_str.endswith(('Z', 'z')):
        iso_str = time_str[:-1] + '+00:00'
    else:
        iso_str = time_str

    time_obj = datetime.fromisoformat(iso_str)
    if time_obj.tzinfo is None:
        # No zone given: treat as UTC so the subtraction below is well defined.
        time_obj = time_obj.replace(tzinfo=timezone.utc)

    # Compare against an aware "now"; a naive local now() would skew the gap
    # by the machine's UTC offset.
    now = datetime.now(timezone.utc)
    time_gap_seconds = (now - time_obj).total_seconds()
    return round(time_gap_seconds / 60, 2)

In [4]:
# Data preprocessing

def video_preprocess(filename, input_dir='./DATA/videos', output_dir='./DATA/videos_preprocessed'):
    """Clean one per-channel video CSV, add scaled features, and save the result.

    Reads ``{input_dir}/{filename}.csv``, derives the columns listed below and
    writes the frame to ``{output_dir}/{filename}.csv``.

    Added columns
    -------------
    views_scaled      RobustScaler-transformed ``views``.
    likes_scaled      RobustScaler-transformed ``likes``; ``-1`` where likes are missing.
    update_diff       Hours between a row's ``date`` and the previous row's (0 for the first row).
    time_gap_minutes  Minutes between ``date`` and the current UTC time.
    view_per_minutes  ``views / time_gap_minutes``.
    vpm_scaled        RobustScaler-transformed ``view_per_minutes``.

    Parameters
    ----------
    filename : str
        CSV file name without the ``.csv`` extension.
    input_dir : str, optional
        Folder holding the raw CSVs.
    output_dir : str, optional
        Folder receiving the preprocessed CSVs; created if it does not exist.

    Returns
    -------
    None

    Notes
    -----
    Each scaler is fitted on this single file, so scaled values are not
    comparable across files. ``time_gap_minutes`` depends on when the function
    is run. A file with no usable rows makes the scaler raise ``ValueError``.
    """
    datas = pd.read_csv(os.path.join(input_dir, f'{filename}.csv'))

    # Rows whose view count is the '-' placeholder are dropped. .copy() makes the
    # result an independent frame so the column assignments below are not
    # writes into a slice of the raw data.
    datas = datas[datas['views'] != '-'].copy()
    datas['views'] = datas['views'].astype(int)

    viewScaler = RobustScaler()
    datas['views_scaled'] = viewScaler.fit_transform(datas[['views']].to_numpy()).ravel()

    # Scale likes using only the rows that have a value; everything else gets
    # the -1 sentinel. Non-numeric placeholders (such as the '-' used in the
    # views column) are treated as missing as well.
    likes = pd.to_numeric(datas['likes'], errors='coerce')
    has_likes = likes.notna()
    likes_scaled = pd.Series(-1.0, index=datas.index)
    if has_likes.any():  # fitting a scaler on zero samples would raise
        likeScaler = RobustScaler()
        likes_scaled.loc[has_likes] = likeScaler.fit_transform(
            likes[has_likes].to_numpy().reshape(-1, 1)
        ).ravel()
    datas['likes_scaled'] = likes_scaled

    # Parse the timestamps once, as UTC, and reuse them for both time features.
    dates = pd.to_datetime(datas['date'], format='%Y-%m-%dT%H:%M:%SZ', utc=True)

    # Hours elapsed since the previous row; the first row has no predecessor.
    update_diff = dates.diff().dt.total_seconds() / 3600
    if len(update_diff):
        update_diff.iloc[0] = 0
    datas['update_diff'] = update_diff

    datas['date'] = dates
    # Timestamp.now(tz='UTC') replaces the deprecated Timestamp.utcnow().
    datas['time_gap_minutes'] = (pd.Timestamp.now(tz='UTC') - dates).dt.total_seconds() / 60

    datas['view_per_minutes'] = datas['views'] / datas['time_gap_minutes']
    viewminScaler = RobustScaler()
    datas['vpm_scaled'] = viewminScaler.fit_transform(datas[['view_per_minutes']].to_numpy()).ravel()

    # 'Unnamed: 0' is the index column a previous to_csv() left behind; it may
    # be absent, so do not fail when it is.
    datas = datas.drop(columns=['Unnamed: 0'], errors='ignore')

    os.makedirs(output_dir, exist_ok=True)
    datas.to_csv(os.path.join(output_dir, f'{filename}.csv'))

In [7]:
# Preprocess every raw video CSV found in the input folder.
# Forward slashes work on Windows as well as POSIX and match the path that
# video_preprocess reads from.
path = "./DATA/videos"
file_lst = sorted(os.listdir(path))  # sorted for a reproducible processing order

for filename in file_lst:
    # Split off the real extension instead of chopping four characters, and
    # skip anything that is not a CSV (sub-folders, checkpoints, temp files).
    stem, ext = os.path.splitext(filename)
    if ext.lower() != '.csv':
        continue
    print(filename)
    video_preprocess(stem)

14F_4춘기_20230509.csv
1theK_내돌투어_20230509.csv
AI Jazeera English_Gold Mafia_20230509.csv
AI Jazeera English_Inside Story_20230509.csv
AJ_minecraft_20230509.csv
Alessandra_Road to Eurovision_20230509.csv
AOMG_막내84_20230509.csv
Bangtan TV_슈취타_20230509.csv
Be on Cloud_Be on Cloud Voyage_20230509.csv
Be on Cloud_Be on Game_20230509.csv
Colors of the Game_Colors of the Game_20230509.csv
Curly Tales_The Legends_20230509.csv
Davidsbeenhere_Kenya_20230509.csv
DeeALup_Mass Effect Legendary Edition_20230509.csv
Endless Adventure_50 state camper van road trip_20230509.csv
EO_실리콘밸리댄스_20230509.csv
GAMERIOT_CRIME BOSS ROCKAY CITY_20230509.csv
GLITCH_MURDER DRONE_20230509.csv
GoodTimewithScar_Limited Life_20230509.csv
Grian_Limited Life_20230509.csv
ImDontaiGaming_Resident Evil 4_20230509.csv
IQ Gaming2_Multi Player_20230509.csv
Jet Lag_New Zealand_20230509.csv
Kara and Nate_Japan Travle Vlog_20230509.csv
KBS Kpop_돌박이일_20230509.csv
KBS Kpop_리무진서비스_20230509.csv
KBS KPOP_아이돌 인간극장_20230509.csv
KBS Kpop_은

In [None]:
# Read the "popular right now" snapshot into a DataFrame.
df = pd.read_csv('./DATA/PopularRightNow/PRN_korea_20230402.csv')

# The CSV stores each row's tags as the text of a Python list (or the string
# 'Nan' when there are none), so the text has to be parsed into real lists
# before explode() can split it into one row per tag. `df` itself is left
# untouched so that cells working on the raw strings keep working.
parsed_tags = df['tags'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) and raw != 'Nan' else None
)
tags_df = (
    df.assign(tags=parsed_tags)
    .dropna(subset=['tags'])
    .explode('tags')
)

# Per tag: total views of the videos carrying it, and how many videos carry it.
# Named aggregation is used because the grouping key itself cannot be
# aggregated through a dict spec.
# NOTE(review): assumes 'views' is numeric in this file - confirm.
result = (
    tags_df.groupby('tags')
    .agg(total_views=('views', 'sum'), tag_count=('views', 'size'))
    .sort_values('total_views', ascending=False)
)


In [None]:
import pandas as pd
import ast
import numpy as np


def _as_tag_list(raw):
    """Turn one raw 'tags' cell into a list of tags, or NaN when it has none."""
    if isinstance(raw, (list, tuple)):
        # already parsed
        return list(raw)
    if isinstance(raw, str) and raw != 'Nan':
        return ast.literal_eval(raw)
    # the 'Nan' placeholder, a real NaN, or anything else unparseable
    return np.nan


# Parse into a separate Series rather than overwriting df['tags'], so this cell
# can be re-run and cells that expect the raw tag strings keep working.
tag_lists = df['tags'].apply(_as_tag_list).dropna()

# Flatten every row's tags into one list.
tag_list = [tag for sublist in tag_lists for tag in sublist]

# Count occurrences; Counter keeps first-seen order, like the manual dict loop it replaces.
value_counts = Counter(tag_list)

# One row per distinct tag.
df_counts = pd.DataFrame(list(value_counts.items()), columns=['tag', 'count'])

# print the result
print(df_counts)


                   tag  count
0     YG Entertainment      2
1                   YG      3
2                  와이지      2
3                K-pop      2
4            BLACKPINK      1
...                ...    ...
2948               다이닝      1
2949            미쉐린가이드      1
2950               미쉐린      1
2951          michelin      1
2952             seoul      1

[2953 rows x 2 columns]


In [None]:
import ast

# Preview the tags of the first row.
# .iloc[0] selects by position, so it still works if rows were dropped and
# label 0 no longer exists.
first_tags = df['tags'].iloc[0]

# The cell may hold the raw list text or an already-parsed list, depending on
# which cells ran before this one; handle both.
my_list = ast.literal_eval(first_tags) if isinstance(first_tags, str) else list(first_tags)

my_list

['YG Entertainment',
 'YG',
 '와이지',
 'K-pop',
 'BLACKPINK',
 '블랙핑크',
 '블핑',
 '제니',
 '로제',
 '리사',
 '지수',
 'LISA',
 'JISOO',
 'JENNIE',
 'ROSÉ',
 'BLINK',
 '블링크',
 '지수 꽃',
 'JISOO FLOWER',
 'FLOWER',
 'All Eyes On Me',
 'JISOO All Eyes On Me',
 '지수 All Eyes On Me',
 'JISOO ME',
 'ME',
 '지수 FLOWER',
 'JISOO 꽃']