### GPT API를 사용하여 감정 라벨링 시도

In [30]:
import pandas as pd
df = pd.read_csv('beer_111925.csv')
df

Unnamed: 0,Rating,Review,Beer_name,label
0,0.5,This is like Budlight but with slight corn tas...,Milwaukee's Best Light,Negative
1,1.0,Strong corn flavor. Highly carbonated and no h...,Milwaukee's Best Light,Negative
2,0.5,It just doesn't get worse than this. Brings me...,Milwaukee's Best Light,Negative
3,0.6,Beast Bleu. I shutter to think how many cans ...,Milwaukee's Best Light,Negative
4,1.2,I wouldn't wish this beer on anyones glass. T...,Milwaukee's Best Light,Negative
...,...,...,...,...
111920,0.8,"This beer is nothing... no taste, no kick, no ...",Milwaukee's Best Light,Negative
111921,0.5,ABSOLUTELY DREADFUL!! MY ROOMMATES USED TO BU...,Milwaukee's Best Light,Negative
111922,0.5,Well here is my third and final rating in the ...,Milwaukee's Best Light,Negative
111923,1.1,beast light... the staple of any good college ...,Milwaukee's Best Light,Negative


In [23]:
# 중복값 제거
df.drop_duplicates(subset='Review', inplace = True)

In [25]:
len(df)

111925

In [14]:
beer_name = df['Beer_name'].value_counts()
beer_name_list = beer_name.tolist()

In [84]:
df_sample = df.sample(n=4000) # 샘플링 4,000개 사용

In [85]:
df_sample['label'].value_counts()

Negative    1966
Positive    1317
Neutral      717
Name: label, dtype: int64

In [60]:
import openai

openai.api_key = ""

try:
    models = openai.Model.list()
    print("API key is valid")
except Exception as e:
    print("API key is not valid:", e)

API key is valid


In [61]:
# Review 데이터를 positive, neutral, negative로 라벨링하는 함수
def label_review(review):
    prompt = f"Please classify the following review as positive, neutral, or negative.\nReview: {review}\nSentiment:"
    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=1,
        n=1,
        stop=None,
        temperature=0.7,
    )
    sentiment = response.choices[0].text.strip().lower()
    if sentiment == "positive":
        return "Positive"
    elif sentiment == "negative":
        return "Negative"
    else:
        return "Neutral"

In [62]:
from tqdm.notebook import tqdm

df['gpt_label'] = tqdm(df['Review'].apply(label_review), desc="Labeling Reviews")

Labeling Reviews:   0%|          | 0/1000 [00:00<?, ?it/s]

In [87]:
# df_sample.to_csv('gpt_labling_4000_v2.csv', index = False)

In [46]:
import pandas as pd

df_gpt = pd.read_csv('gpt_labling_4000_v2.csv')
df_gpt

Unnamed: 0,Rating,Review,Beer_name,label,gpt_label
0,4.7,Good god. How the hell is this a shelf beer? V...,AleSmith Speedway Stout,Negative,Positive
1,1.3,"Looks like piss, but has a heavier taste than ...",Bud Ice,Negative,Negative
2,1.8,"To watery, bitter...wouldn't suggest. This is ...",Heineken,Negative,Negative
3,1.0,On Tap. Poured a nice dark brown color with a ...,Guinness Draught,Negative,Negative
4,2.1,Easy going and efficient in its distribution o...,Kirin Ichiban,Neutral,Neutral
...,...,...,...,...,...
3995,3.8,"yes, is till needed to taste this one. i taste...",Hoegaarden,Neutral,Positive
3996,1.6,Had this at a local all you can eat buffet. N...,Tsingtao,Negative,Negative
3997,1.8,Pours a pale golden yellow color with a one fi...,Corona Light,Neutral,Negative
3998,3.7,"Tall can.\nCloudy yellow, with a thin white he...",Toppling Goliath King Sue,Positive,Positive


In [47]:
df_gpt.drop_duplicates(subset='Review', inplace = True)
len(df_gpt)

3984

In [48]:
# vader_labeling = df_test['result'].value_counts()
gpt_labeling = df_gpt['gpt_label'].value_counts()

# print(vader_labeling)
print(gpt_labeling)

Positive    1942
Negative    1366
Neutral      676
Name: gpt_label, dtype: int64


In [51]:
df_gpt.rename(columns = {'gpt_label' : 'MultinomialNB_label'}, inplace = True)

In [31]:
# 원본에서 gpt sample을 제외하고 추출한다.
# 이 데이터는 unlabeled_data이다.

diff_df = df[~df['Review'].isin(df_gpt['Review'])]
len(diff_df)

107941

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.semi_supervised import SelfTrainingClassifier

# 데이터셋 로드
unlabeled_data = diff_df

# 라벨링이 된 데이터셋 로드
labeled_data = df_gpt

# TF-IDF 벡터화
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(labeled_data['Review'])
y_train = labeled_data['gpt_label']

# Naive Bayes 모델 정의
model = MultinomialNB(alpha=0.1)

# Self-training 방법으로 모델 학습
self_training_model = SelfTrainingClassifier(model, max_iter=1000)
self_training_model.fit(X_train, y_train)

# 자동 라벨링을 위한 데이터셋 벡터화
X_unlabeled = vectorizer.transform(unlabeled_data['Review'])

# 모델이 높은 확률로 예측한 샘플에 대해 자동으로 라벨 부여
predicted_labels = self_training_model.predict(X_unlabeled)
unlabeled_data['MultinomialNB_label'] = predicted_labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled_data['MultinomialNB_label'] = predicted_labels


In [37]:
print(unlabeled_data['MultinomialNB_label'].value_counts())
print(len(unlabeled_data))

Positive    64021
Negative    40520
Neutral      3400
Name: MultinomialNB_label, dtype: int64
107941


In [52]:
df_gpt

Unnamed: 0,Rating,Review,Beer_name,label,MultinomialNB_label
0,4.7,Good god. How the hell is this a shelf beer? V...,AleSmith Speedway Stout,Negative,Positive
1,1.3,"Looks like piss, but has a heavier taste than ...",Bud Ice,Negative,Negative
2,1.8,"To watery, bitter...wouldn't suggest. This is ...",Heineken,Negative,Negative
3,1.0,On Tap. Poured a nice dark brown color with a ...,Guinness Draught,Negative,Negative
4,2.1,Easy going and efficient in its distribution o...,Kirin Ichiban,Neutral,Neutral
...,...,...,...,...,...
3995,3.8,"yes, is till needed to taste this one. i taste...",Hoegaarden,Neutral,Positive
3996,1.6,Had this at a local all you can eat buffet. N...,Tsingtao,Negative,Negative
3997,1.8,Pours a pale golden yellow color with a one fi...,Corona Light,Neutral,Negative
3998,3.7,"Tall can.\nCloudy yellow, with a thin white he...",Toppling Goliath King Sue,Positive,Positive


In [53]:
# vader_labeling = df_test['result'].value_counts()
gpt_labeling = df_gpt['MultinomialNB_label'].value_counts()

# print(vader_labeling)
print(gpt_labeling)

Positive    1942
Negative    1366
Neutral      676
Name: MultinomialNB_label, dtype: int64


In [45]:
unlabeled_data

Unnamed: 0,Rating,Review,Beer_name,label,MultinomialNB_label
0,0.5,This is like Budlight but with slight corn tas...,Milwaukee's Best Light,Negative,Negative
1,1.0,Strong corn flavor. Highly carbonated and no h...,Milwaukee's Best Light,Negative,Negative
2,0.5,It just doesn't get worse than this. Brings me...,Milwaukee's Best Light,Negative,Negative
3,0.6,Beast Bleu. I shutter to think how many cans ...,Milwaukee's Best Light,Negative,Negative
4,1.2,I wouldn't wish this beer on anyones glass. T...,Milwaukee's Best Light,Negative,Negative
...,...,...,...,...,...
111920,0.8,"This beer is nothing... no taste, no kick, no ...",Milwaukee's Best Light,Negative,Negative
111921,0.5,ABSOLUTELY DREADFUL!! MY ROOMMATES USED TO BU...,Milwaukee's Best Light,Negative,Negative
111922,0.5,Well here is my third and final rating in the ...,Milwaukee's Best Light,Negative,Negative
111923,1.1,beast light... the staple of any good college ...,Milwaukee's Best Light,Negative,Negative


In [54]:
total_df = pd.concat([unlabeled_data, df_gpt]).drop_duplicates(keep=False)
len(total_df)

111925

In [55]:
total_df

Unnamed: 0,Rating,Review,Beer_name,label,MultinomialNB_label
0,0.5,This is like Budlight but with slight corn tas...,Milwaukee's Best Light,Negative,Negative
1,1.0,Strong corn flavor. Highly carbonated and no h...,Milwaukee's Best Light,Negative,Negative
2,0.5,It just doesn't get worse than this. Brings me...,Milwaukee's Best Light,Negative,Negative
3,0.6,Beast Bleu. I shutter to think how many cans ...,Milwaukee's Best Light,Negative,Negative
4,1.2,I wouldn't wish this beer on anyones glass. T...,Milwaukee's Best Light,Negative,Negative
...,...,...,...,...,...
3995,3.8,"yes, is till needed to taste this one. i taste...",Hoegaarden,Neutral,Positive
3996,1.6,Had this at a local all you can eat buffet. N...,Tsingtao,Negative,Negative
3997,1.8,Pours a pale golden yellow color with a one fi...,Corona Light,Neutral,Negative
3998,3.7,"Tall can.\nCloudy yellow, with a thin white he...",Toppling Goliath King Sue,Positive,Positive


In [58]:
total_df['label'].value_counts()

Negative    55934
Positive    36897
Neutral     19094
Name: label, dtype: int64

In [57]:
total_df.to_csv('MultinomialNB_label.csv', index = False)