In [4]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
import os, glob

# ⚠️ 폴더 이름에 공백이 있어서 끝에 스페이스까지 포함해야 함!
BASE_DIR = "/content/drive/MyDrive/ai_text "

# 폴더 안 파일 확인
!ls -lh "$BASE_DIR"


total 12M
-rw------- 1 root root  12K Nov 30 16:53 ai-text-detector.ipynb
-rw------- 1 root root 9.1M Oct 31  2023 daigt_external_dataset.csv
-rw------- 1 root root 2.9M Nov 25 12:13 daigt_external_dataset.csv.zip


In [6]:
!pip install -q torch torchvision torchaudio
!pip install -q scikit-learn pandas


In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

import torch
import torch.nn as nn
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


Device: cuda


In [8]:
CSV_PATH = os.path.join(BASE_DIR, "daigt_external_dataset.csv")
print("CSV_PATH:", CSV_PATH)
assert os.path.exists(CSV_PATH), "CSV 파일 경로가 잘못됐어요. 이름/폴더 다시 확인!"

df_raw = pd.read_csv(CSV_PATH)
print(df_raw.head())
print(df_raw.columns)



CSV_PATH: /content/drive/MyDrive/ai_text /daigt_external_dataset.csv
             id                                               text  \
0  6060D28C05B6  Some schools in United States ofter classes fr...   
1  60623DB5DE7A  Four-day work week, a remarkable idea to conse...   
2  607A39D981DE  Students and their families should consider an...   
3  60ACDFA1609E  Agree you will never grow if something beyond ...   
4  60AE13D3F07B  I think our character traits are formed by inf...   

                                        instructions  \
0  \nTask: Write a persuasive essay on whether or...   
1  \nTask: Research the advantages and disadvanta...   
2  \nTask: \n\n1. Talk to your parents before tak...   
3  \nTask: Write an essay discussing the benefits...   
4  \nTask: Research and discuss how character tra...   

                                         source_text  
0  \nWhen considering the pros and cons of attend...  
1  \nOne of the primary arguments for implementin...  
2  \nBef

In [9]:
# 컬럼 존재 여부 체크
assert "text" in df_raw.columns and "source_text" in df_raw.columns, "text / source_text 컬럼이 없어요!"

human_texts = df_raw["text"].astype(str).tolist()
ai_texts    = df_raw["source_text"].astype(str).tolist()

print("Human samples:", len(human_texts))
print("AI samples   :", len(ai_texts))

df = pd.DataFrame({
    "text":  human_texts + ai_texts,
    "label": [0]*len(human_texts) + [1]*len(ai_texts)   # 0=Human, 1=AI
})

# 셔플
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()


Human samples: 2421
AI samples   : 2421


Unnamed: 0,text,label
0,\nRegular inactive times are integral to our o...,1
1,\n\nOne famous quote by Michelangelo that rela...,1
2,\nAttending online classes or video conferenci...,1
3,\nMaking your own decisions in life has many a...,1
4,"Dear Principle,\n\nchanging the policy is a go...",0


In [10]:
texts  = df["text"].tolist()
labels = df["label"].tolist()

train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)

print(len(train_texts), len(test_texts))


3873 969


In [11]:
max_features = 2000  # TF-IDF 차원 수 (GPU/속도 보고 조절 가능)

vectorizer = TfidfVectorizer(
    max_features=max_features,
    stop_words=None  # 'english'로 바꾸면 영어 불용어 제거
)

X_train = vectorizer.fit_transform(train_texts).toarray()
X_test  = vectorizer.transform(test_texts).toarray()

y_train = np.array(train_labels)
y_test  = np.array(test_labels)

print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)


Train shape: (3873, 2000)
Test shape : (969, 2000)


In [12]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X).float()
        self.y = torch.from_numpy(y).float()  # BCE 때문에 float

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = TextDataset(X_train, y_train)
test_dataset  = TextDataset(X_test, y_test)

batch_size = 128  # GPU 메모리 상황 보고 조절

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

len(train_dataset), len(test_dataset)


(3873, 969)

In [13]:
latent_dim  = 100
feature_dim = max_features

class Generator(nn.Module):
    def __init__(self, latent_dim, feature_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, feature_dim),
            nn.Tanh()
        )

    def forward(self, z):
        return self.model(z)


class Discriminator(nn.Module):
    def __init__(self, feature_dim):
        super().__init__()

        self.feature_extractor = nn.Sequential(
            nn.Linear(feature_dim, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2),
        )

        # 1) real / fake 판별
        self.adv_head = nn.Sequential(
            nn.Linear(256, 1),
            nn.Sigmoid()
        )

        # 2) AI vs Human 분류 (0=Human, 1=AI)
        self.cls_head = nn.Sequential(
            nn.Linear(256, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        h = self.feature_extractor(x)
        validity = self.adv_head(h)
        cls = self.cls_head(h)
        return validity, cls

G = Generator(latent_dim, feature_dim).to(device)
D = Discriminator(feature_dim).to(device)

criterion_adv = nn.BCELoss()
criterion_cls = nn.BCELoss()

optimizer_G = optim.Adam(G.parameters(), lr=2e-4, betas=(0.5, 0.999))
optimizer_D = optim.Adam(D.parameters(), lr=2e-4, betas=(0.5, 0.999))

print(G)
print(D)


Generator(
  (model): Sequential(
    (0): Linear(in_features=100, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=2000, bias=True)
    (5): Tanh()
  )
)
Discriminator(
  (feature_extractor): Sequential(
    (0): Linear(in_features=2000, out_features=512, bias=True)
    (1): LeakyReLU(negative_slope=0.2)
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): LeakyReLU(negative_slope=0.2)
  )
  (adv_head): Sequential(
    (0): Linear(in_features=256, out_features=1, bias=True)
    (1): Sigmoid()
  )
  (cls_head): Sequential(
    (0): Linear(in_features=256, out_features=1, bias=True)
    (1): Sigmoid()
  )
)


In [14]:
num_epochs = 20  # 처음엔 5 정도로 돌려보고 괜찮으면 늘리기

for epoch in range(num_epochs):
    G.train()
    D.train()

    g_loss_epoch = 0.0
    d_loss_epoch = 0.0

    for real_feats, labels in train_loader:
        real_feats = real_feats.to(device)
        labels = labels.to(device).view(-1, 1)

        bs = real_feats.size(0)

        # ---------- Discriminator 학습 ----------
        optimizer_D.zero_grad()

        valid = torch.ones((bs, 1), device=device)
        fake  = torch.zeros((bs, 1), device=device)

        # (a) real 데이터
        adv_real, cls_real = D(real_feats)
        loss_adv_real = criterion_adv(adv_real, valid)
        loss_cls_real = criterion_cls(cls_real, labels)

        # (b) fake 데이터
        z = torch.randn((bs, latent_dim), device=device)
        fake_feats = G(z)
        adv_fake, _ = D(fake_feats.detach())
        loss_adv_fake = criterion_adv(adv_fake, fake)

        # D 총 loss
        loss_D = loss_adv_real + loss_adv_fake + loss_cls_real
        loss_D.backward()
        optimizer_D.step()

        # ---------- Generator 학습 ----------
        optimizer_G.zero_grad()

        z = torch.randn((bs, latent_dim), device=device)
        gen_feats = G(z)
        adv_gen, _ = D(gen_feats)

        # G는 fake를 real로 보이게 하고 싶음 → target=valid
        loss_G = criterion_adv(adv_gen, valid)
        loss_G.backward()
        optimizer_G.step()

        g_loss_epoch += loss_G.item() * bs
        d_loss_epoch += loss_D.item() * bs

    g_loss_epoch /= len(train_dataset)
    d_loss_epoch /= len(train_dataset)
    print(f"[Epoch {epoch+1}/{num_epochs}] D_loss: {d_loss_epoch:.4f} | G_loss: {g_loss_epoch:.4f}")


[Epoch 1/20] D_loss: 2.0606 | G_loss: 0.7078
[Epoch 2/20] D_loss: 2.0388 | G_loss: 0.7136
[Epoch 3/20] D_loss: 1.9308 | G_loss: 0.7215
[Epoch 4/20] D_loss: 1.7065 | G_loss: 0.7377
[Epoch 5/20] D_loss: 1.5450 | G_loss: 0.7456
[Epoch 6/20] D_loss: 1.4802 | G_loss: 0.7516
[Epoch 7/20] D_loss: 1.4477 | G_loss: 0.7522
[Epoch 8/20] D_loss: 1.4120 | G_loss: 0.7828
[Epoch 9/20] D_loss: 1.4088 | G_loss: 0.7891
[Epoch 10/20] D_loss: 1.3933 | G_loss: 0.8115
[Epoch 11/20] D_loss: 1.3899 | G_loss: 0.8244
[Epoch 12/20] D_loss: 1.3858 | G_loss: 0.8574
[Epoch 13/20] D_loss: 1.3911 | G_loss: 0.8248
[Epoch 14/20] D_loss: 1.3665 | G_loss: 0.8758
[Epoch 15/20] D_loss: 1.3887 | G_loss: 0.8544
[Epoch 16/20] D_loss: 1.3969 | G_loss: 0.8638
[Epoch 17/20] D_loss: 1.3767 | G_loss: 0.8846
[Epoch 18/20] D_loss: 1.3643 | G_loss: 0.8340
[Epoch 19/20] D_loss: 1.3740 | G_loss: 0.8511
[Epoch 20/20] D_loss: 1.3696 | G_loss: 0.8700


In [15]:
D.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for feats, labels in test_loader:
        feats = feats.to(device)
        labels = labels.to(device).view(-1, 1)

        _, cls_out = D(feats)
        preds = (cls_out > 0.5).float()

        all_preds.extend(preds.cpu().numpy().flatten().tolist())
        all_labels.extend(labels.cpu().numpy().flatten().tolist())

print(classification_report(all_labels, all_preds, digits=4))


              precision    recall  f1-score   support

         0.0     0.9959    0.9979    0.9969       485
         1.0     0.9979    0.9959    0.9969       484

    accuracy                         0.9969       969
   macro avg     0.9969    0.9969    0.9969       969
weighted avg     0.9969    0.9969    0.9969       969



In [16]:
def predict_text_origin(text: str):
    # 1) TF-IDF 벡터 변환
    vec = vectorizer.transform([text]).toarray()
    feat = torch.from_numpy(vec).float().to(device)

    D.eval()
    with torch.no_grad():
        _, cls_out = D(feat)
        prob_ai = cls_out.item()
        prob_human = 1.0 - prob_ai

    return {
        "ai_prob": float(prob_ai),
        "human_prob": float(prob_human),
        "label": "AI" if prob_ai > 0.5 else "Human"
    }

test_sentence = "Large language models are trained on huge datasets of text to generate human-like responses."
print(predict_text_origin(test_sentence))


{'ai_prob': 0.6022042036056519, 'human_prob': 0.39779579639434814, 'label': 'AI'}


In [17]:
import pandas as pd

# 예시 문장들 (원하면 네가 맘대로 수정해도 됨)
example_texts = [
    # 사람 글 느낌
    "오늘은 학교에서 운영체제 수업을 듣고 프로세스 스케줄링 내용을 정리했다. 이론만 들을 때는 어려웠는데, 직접 예제를 풀어보니까 훨씬 이해가 잘 됐다.",
    "이번 학기에 캡스톤 디자인을 들으면서 실제 서비스처럼 기획부터 배포까지 해보는 경험을 하고 있다. 시간은 많이 들지만 확실히 공부가 되는 것 같다.",

    # AI 글 느낌
    "인공지능은 현대 사회 전반에 걸쳐 급속도로 확산되고 있으며, 특히 자연어 처리 기술의 발달은 인간과 컴퓨터 간의 상호작용 방식을 근본적으로 변화시키고 있다.",
    "대규모 언어 모델은 방대한 양의 텍스트 데이터를 기반으로 학습되며, 이를 통해 문장 생성, 요약, 번역 등 다양한 작업에서 인간에 가까운 성능을 보여준다."
]

df_example = pd.DataFrame({"text": example_texts})
df_example.to_csv("example_texts.csv", index=False)

df_example


Unnamed: 0,text
0,오늘은 학교에서 운영체제 수업을 듣고 프로세스 스케줄링 내용을 정리했다. 이론만 들...
1,이번 학기에 캡스톤 디자인을 들으면서 실제 서비스처럼 기획부터 배포까지 해보는 경험...
2,"인공지능은 현대 사회 전반에 걸쳐 급속도로 확산되고 있으며, 특히 자연어 처리 기술..."
3,"대규모 언어 모델은 방대한 양의 텍스트 데이터를 기반으로 학습되며, 이를 통해 문장..."


In [18]:
import torch
import joblib

# 1) Discriminator 가중치 저장
MODEL_PATH = "/content/D.pth"
torch.save(D.state_dict(), MODEL_PATH)

# 2) TF-IDF vectorizer 저장
VEC_PATH = "/content/vectorizer.pkl"
joblib.dump(vectorizer, VEC_PATH)

print("Saved:", MODEL_PATH)
print("Saved:", VEC_PATH)



Saved: /content/D.pth
Saved: /content/vectorizer.pkl


In [19]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
BASE_DIR = "/content/drive/MyDrive/ai_text "

!ls -lh "/content/drive/MyDrive"      # ai_text 폴더 확인
!cp /content/D.pth "$BASE_DIR/D.pth"
!cp /content/vectorizer.pkl "$BASE_DIR/vectorizer.pkl"

!ls -lh "$BASE_DIR"


total 11M
-rw------- 1 root root  71K Mar 22  2021  123.bmp
-rw------- 1 root root  175 Mar 22  2021  123.gdoc
-rw------- 1 root root 2.8M Jan 17  2021  20210117_133913.jpg
-rw------- 1 root root 2.2M Aug  5  2022  20220805_132906.jpg
-rw------- 1 root root 2.5M Aug  5  2022  20220805_134107.jpg
-rw------- 1 root root 2.2M Sep 13  2022  20220913_112450.jpg
drwx------ 2 root root 4.0K Nov 25 12:15 'ai_text '
drwx------ 2 root root 4.0K Nov 24 12:38 'Colab Notebooks'
-rw------- 1 root root 138K Jan 17  2024  KakaoTalk_20240117_163259479_01.jpg
-rw------- 1 root root 194K Jan 17  2024  KakaoTalk_20240117_163259479_02.jpg
-rw------- 1 root root 234K Jan 17  2024  KakaoTalk_20240117_163259479_03.jpg
-rw------- 1 root root 142K Jan 17  2024  KakaoTalk_20240117_163259479_04.jpg
-rw------- 1 root root 154K Jan 17  2024  KakaoTalk_20240117_163259479.jpg
-rw------- 1 root root  584 Jul 16 16:06 '스크린샷 2025-07-17 010635.png'
-rw------- 1 root root  175 Sep 11  2023 '연락처 정보.gform'
-rw