In [73]:
!pip install wordcloud



In [74]:
# Mount Google Drive at /content/drive so the dataset paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [82]:
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
import nltk
import re
from collections import Counter
from wordcloud import WordCloud

### 데이터셋 로드

In [76]:
# Load the dataset CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv("/content/drive/MyDrive/빅데이터/project/reddit_ai_data.csv")

In [77]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,subreddit,title,body,created_date
0,ChatGPT,Forming Relationships and dating an AI: Part Two,This is a follow up to my first post \n[https...,2024-12-22 01:43:59
1,ChatGPT,AI can't be random?,"In both, ChatGPT and Gemini, I asked ""Tell me ...",2024-12-22 01:24:55
2,ChatGPT,The moral dilemma of AI powered by nuclear energy,I recently learned something that I am having ...,2024-12-21 23:11:08
3,ChatGPT,AI Incest is a thing,I did now find out about AI incest. For those ...,2024-12-21 23:04:45
4,ChatGPT,How AI Helped Me Navigate a Misdiagnosis.,I know some people post fake “how AI helped me...,2024-12-21 22:50:53


### 전처리

##### 데이터 확인 및 타입 변환

In [78]:
# Inspect column dtypes and non-null counts before converting types.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19418 entries, 0 to 19417
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   subreddit     19418 non-null  object
 1   title         19418 non-null  object
 2   body          19418 non-null  object
 3   created_date  19418 non-null  object
dtypes: object(4)
memory usage: 606.9+ KB


In [79]:
# Convert created_date to datetime so the .dt accessor can be used on it.
df['created_date'] = pd.to_datetime(df['created_date'])

In [80]:
# Create derived year and month columns from created_date.
df['year'] = df['created_date'].dt.year
df['month'] = df['created_date'].dt.month

In [81]:
# Confirm the year and month columns were added.
df.head()

Unnamed: 0,subreddit,title,body,created_date,year,month
0,ChatGPT,Forming Relationships and dating an AI: Part Two,This is a follow up to my first post \n[https...,2024-12-22 01:43:59,2024,12
1,ChatGPT,AI can't be random?,"In both, ChatGPT and Gemini, I asked ""Tell me ...",2024-12-22 01:24:55,2024,12
2,ChatGPT,The moral dilemma of AI powered by nuclear energy,I recently learned something that I am having ...,2024-12-21 23:11:08,2024,12
3,ChatGPT,AI Incest is a thing,I did now find out about AI incest. For those ...,2024-12-21 23:04:45,2024,12
4,ChatGPT,How AI Helped Me Navigate a Misdiagnosis.,I know some people post fake “how AI helped me...,2024-12-21 22:50:53,2024,12


##### Tokenization

In [86]:
# Fetch the NLTK resources used by the preprocessing functions below.
# Packages that are already present are reported as up-to-date and not re-downloaded.
nltk.download('punkt')  # legacy tokenizer models, kept for older NLTK installs
# NLTK releases that ship the '_eng' tagger resource load 'punkt_tab' (not 'punkt')
# inside word_tokenize; without it a fresh kernel fails with LookupError.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')  # model used by pos_tag
nltk.download('wordnet')  # dictionary used by WordNetLemmatizer
nltk.download('stopwords')  # English stopword list

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [87]:
# Lemmatization function
def lemmatize_text(tagged_text):
    """Lemmatize (token, tag) pairs with WordNet, using the tag to pick the part of speech.

    Args:
        tagged_text: iterable of (token, tag) pairs, e.g. the output of pos_tag.

    Returns:
        list: one lemma per input pair, in the original order.
    """
    # First letter of the tag -> WordNet part-of-speech code
    # (N: noun, V: verb, J: adjective, R: adverb).
    wordnet_pos_by_initial = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}
    wnl = WordNetLemmatizer()

    def to_lemma(word, tag):
        wordnet_pos = wordnet_pos_by_initial.get(tag[:1])
        if wordnet_pos is None:
            # Any other tag: fall back to the lemmatizer's default part of speech.
            return wnl.lemmatize(word)
        return wnl.lemmatize(word, pos=wordnet_pos)

    return [to_lemma(word, tag) for word, tag in tagged_text]

# Stopword removal function
def remove_stopword(tokenized_text, stopwords):
    """Return the tokens of tokenized_text that are not contained in stopwords.

    Args:
        tokenized_text: iterable of tokens.
        stopwords: container supporting `in` (a set gives the fastest lookups).

    Returns:
        list: the surviving tokens, in their original order.
    """
    kept = []
    for word in tokenized_text:
        if word in stopwords:
            continue
        kept.append(word)
    return kept

# Text-processing function for the body column
_URL_PATTERN = re.compile(r'http\S+|www\.\S+')  # links, matched on the raw text
_NON_ALPHA_PATTERN = re.compile(r'[^a-zA-Z]')   # digits, punctuation, any non-letter
_ENGLISH_STOPWORDS = None  # filled on the first process_text call, then reused


def process_text(text):
    """Turn one raw post body into a list of lemmatized keyword tokens.

    Steps: lowercase -> strip links -> tokenize -> drop stopwords ->
    strip non-letters -> drop stopwords again -> POS-tag -> lemmatize.

    Args:
        text: the raw text of one post. Non-string values (e.g. NaN for a
            missing body) are treated as empty.

    Returns:
        list: lemmatized tokens in their original order; [] when there is
        nothing to process.
    """
    global _ENGLISH_STOPWORDS

    if not isinstance(text, str):
        return []

    # Build the stopword set once instead of re-reading the corpus for every row.
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    text = text.lower()
    # Links must be removed before tokenization: the tokenizer splits a URL into
    # several pieces, after which a per-token pattern can no longer recognize it.
    text = _URL_PATTERN.sub(' ', text)

    tokens = word_tokenize(text)
    tokens = remove_stopword(tokens, _ENGLISH_STOPWORDS)

    # Strip digits and special characters, then discard tokens that became empty.
    tokens = [_NON_ALPHA_PATTERN.sub('', token) for token in tokens]
    tokens = [token for token in tokens if token]
    # Second pass: clitics such as "'ve" / "'ll" only match the stopword list
    # once their apostrophe has been stripped.
    tokens = remove_stopword(tokens, _ENGLISH_STOPWORDS)

    pos_tags = pos_tag(tokens)
    return lemmatize_text(pos_tags)

In [89]:
# Apply the preprocessing to the body column to create the keyword column
df['keyword'] = df['body'].apply(process_text)

# Check the result
df.head()

Unnamed: 0,subreddit,title,body,created_date,year,month,keyword
0,ChatGPT,Forming Relationships and dating an AI: Part Two,This is a follow up to my first post \n[https...,2024-12-22 01:43:59,2024,12,"[follow, first, post, okay, wow, begin, lots, ..."
1,ChatGPT,AI can't be random?,"In both, ChatGPT and Gemini, I asked ""Tell me ...",2024-12-22 01:24:55,2024,12,"[chatgpt, gemini, ask, tell, random, number, a..."
2,ChatGPT,The moral dilemma of AI powered by nuclear energy,I recently learned something that I am having ...,2024-12-21 23:11:08,2024,12,"[recently, learn, something, trouble, settle, ..."
3,ChatGPT,AI Incest is a thing,I did now find out about AI incest. For those ...,2024-12-21 23:04:45,2024,12,"[find, ai, incest, know, refer, little, explan..."
4,ChatGPT,How AI Helped Me Navigate a Misdiagnosis.,I know some people post fake “how AI helped me...,2024-12-21 22:50:53,2024,12,"[know, people, post, fake, ai, help, story, th..."


In [91]:
# Function that removes one-character words
def remove_single_char_words(tokens):
    """Return the tokens that are longer than one character, in their original order."""
    return list(filter(lambda word: len(word) > 1, tokens))

# Remove one-character words from the keyword column
df['keyword'] = df['keyword'].apply(remove_single_char_words)

##### 데이터 저장

In [93]:
# Save the preprocessed frame to Drive, without the index column.
# NOTE(review): the keyword column holds Python lists, which CSV stores as their
# string repr; a reader has to parse them back (e.g. ast.literal_eval) — confirm
# that downstream notebooks do so.
df.to_csv("/content/drive/MyDrive/빅데이터/project/reddit_ai_data_cleaned.csv", index=False)