In [4]:
import pandas as pd
import numpy as np
import collections

In [10]:
# Location of the raw Yelp training CSV, relative to the working directory.
file = "data/yelp/raw_train.csv"
# The CSV is read without a header row; the two columns are named explicitly.
data = pd.read_csv(file,header=None, names=['rating', 'review'])

In [11]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [12]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(560000, 2)

In [31]:
# Count how many reviews carry each rating value.
data["rating"].value_counts()

rating
1    19000
2    19000
Name: count, dtype: int64

In [33]:
# Bucket the review rows by their rating value (rating -> list of row dicts).
# `to_dict(orient="records")` produces every row dict in a single call, which
# avoids building one Series per row the way `iterrows()` does.
by_rating = collections.defaultdict(list)
for record in data.to_dict(orient="records"):
    # Each rating keeps its rows in the order they appear in `data`.
    by_rating[record["rating"]].append(record)

In [34]:
# Display the grouped dictionary.
# NOTE(review): this renders every stored review; a per-rating count would be a
# much lighter sanity check.
by_rating

defaultdict(list,
            {1: [{'rating': 1,
               'review': "Ordered a large Mango-Pineapple smoothie. Stayed in line 5 minutes from the time I ordered. Which was okay, I made sure I started out 10 minutes early for the trip. $3.64 for a large. Got to the window stayed an additional 3 minutes. No big deal. Until...... Ma'am We no have Mango. We have Um.. Strawberry. Reluctantly I said Ok... It's just Strawberry right? Not Strawberry-Banana? Si Strawberry- Banana. Side note...( I don't like Strawberry- Banana) No ma'am I would just like my money back... refund please... she gave me $3.53 back? Wasted my time and made a profit! Geez!!"},
              {'rating': 1,
               'review': "First I will say, this is a nice atmosphere and the food was fine. But for the price, the portion size, the $2 upcharge for brown rice (on top of the price for the lunch) and the very slow service at lunch, this is what brings me to the lower rating. \\n\\nI've been to other Thai places 

In [36]:
# Build a subset: keep the leading 10% of the reviews stored under each rating.
subset_records = []

for rating in sorted(by_rating):  # visit the ratings in ascending order
    reviews = by_rating[rating]  # list of {'rating', 'review'} row dicts
    keep = int(0.1 * len(reviews))  # subset size, truncated toward zero
    subset_records += reviews[:keep]  # append the first `keep` rows, in stored order

# Turn the collected row dicts into a DataFrame.
review_subset = pd.DataFrame(subset_records)

In [37]:
# Display the subset frame built in the previous cell.
review_subset

Unnamed: 0,rating,review
0,1,Ordered a large Mango-Pineapple smoothie. Stay...
1,1,"First I will say, this is a nice atmosphere an..."
2,1,Video link at bottom review. Worst service I h...
3,1,Awful! Floor was so dirty that my shoes almost...
4,1,"Wasn't impressed. Long line outside, and seei..."
...,...,...
3795,2,Really a nice little find. Four stars for the ...
3796,2,I've been 3 times to Castello coffee. It has t...
3797,2,"Good and fast... Plus affordable, but I can o..."
3798,2,For all the people who sit down and don't get ...


In [None]:
import collections
import numpy as np
import pandas as pd
import re

def _clean_text(text):
    """Normalize a review string for whitespace tokenization.

    Lower-cases the text, pads the punctuation marks ``. , ! ?`` with spaces so
    they become separate tokens, and replaces every run of any other
    non-letter characters with a single space.

    Args:
        text: The raw review text; must be a ``str``.

    Returns:
        The cleaned string. Leading and trailing spaces are not stripped.
    """
    text = text.lower()
    # Surround sentence punctuation with spaces.
    text = re.sub(r"([.,!?])", r" \1 ", text)
    # The range has to be A-Z: "A-z" also spans the ASCII characters between
    # "Z" and "a" ([ \ ] ^ _ `), which would then survive the cleanup.
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text


def _group_by_rating(frame):
    """Map each rating value to the list of its row dicts, in row order."""
    grouped = collections.defaultdict(list)
    for record in frame.to_dict(orient="records"):
        grouped[record["rating"]].append(record)
    return grouped


def load_and_preprocess_data(raw_csv, args):
    """Load the raw review CSV, subsample it, split it, and clean the text.

    Each rating is processed separately, so the subset and every split keep
    the rating proportions of the raw file.

    Args:
        raw_csv: Path (or file-like object) of a header-less CSV whose two
            columns are the rating and the review text.
        args: Object exposing the attributes ``proportion_subset_of_train``,
            ``train_proportion``, ``val_proportion``, ``test_proportion`` and
            ``seed``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``rating``, ``review`` (cleaned
        with ``_clean_text``) and ``split`` (``'train'``, ``'val'`` or
        ``'test'``). Rows of a rating that fall beyond the three truncated
        split sizes receive no label and are left out.
    """
    data = pd.read_csv(raw_csv, header=None, names=['rating', 'review'])

    by_rating = _group_by_rating(data)

    final_list = []
    # Seed the global NumPy generator so the shuffles below are repeatable.
    np.random.seed(args.seed)

    for _, item_list in sorted(by_rating.items()):
        # Keep only the leading fraction of this rating's reviews.
        n_subset = int(args.proportion_subset_of_train * len(item_list))
        item_list = item_list[:n_subset]

        np.random.shuffle(item_list)

        n_total = len(item_list)
        n_train = int(args.train_proportion * n_total)
        n_val = int(args.val_proportion * n_total)
        n_test = int(args.test_proportion * n_total)

        # Consecutive slices of the shuffled list become the three splits.
        val_end = n_train + n_val
        test_end = val_end + n_test
        boundaries = (
            ('train', 0, n_train),
            ('val', n_train, val_end),
            ('test', val_end, test_end),
        )
        for split_name, start, stop in boundaries:
            for item in item_list[start:stop]:
                item['split'] = split_name

        # int() truncation can leave a few trailing rows unlabeled; skip them.
        final_list.extend(item_list[:test_end])

    # Explicit columns keep the frame well-formed even when no rows survive.
    final_reviews = pd.DataFrame(final_list, columns=['rating', 'review', 'split'])
    final_reviews['review'] = final_reviews['review'].apply(_clean_text)
    return final_reviews




    





In [1]:
# Environment check: report the installed PyTorch build and whether CUDA is
# usable from this kernel.
import torch
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # CUDA/GPU details are only queried when a device is reported available.
    print(f"PyTorch CUDA Version: {torch.version.cuda}")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")  # device index 0

PyTorch Version: 2.7.1+cpu
CUDA Available: False


In [1]:
# Environment check: report the installed PyTorch build and whether CUDA is
# usable from this kernel.
import torch
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")

PyTorch Version: 2.7.1+cu118
CUDA Available: True
