In [None]:
!pip install scikit-dimension

In [None]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaModel

from scipy.spatial.distance import cdist
from skdim.id import MLE

from tqdm import tqdm

In [None]:
from scipy.spatial.distance import cdist
from threading import Thread

# NOTE(review): MINIMAL_CLOUD is not referenced by any code visible in this notebook — confirm whether it is still needed.
MINIMAL_CLOUD = 47

def prim_tree(adj_matrix, alpha=1.0):
    """Total alpha-weighted edge length of a minimum spanning tree (Prim's algorithm).

    Parameters:
        adj_matrix --- square NumPy array of pairwise distances; row i holds the
                       distances from vertex i to every vertex.
        alpha      --- exponent applied to each tree edge length before summing.

    Returns:
        Python float, the sum of (edge length ** alpha) over the tree edges.
        A graph with fewer than two vertices has no edges, so the result is 0.0.
    """
    n_vertices = adj_matrix.shape[0]
    if n_vertices < 2:
        # Nothing to connect. Without this guard an empty matrix fails in np.max and
        # a single vertex fails on `.item()` of a plain Python float.
        return 0.0

    # Sentinel strictly larger than every real distance: marks "already in the tree".
    infty = np.max(adj_matrix) + 10

    dst = np.ones(n_vertices) * infty               # cheapest known edge from the tree to each vertex
    visited = np.zeros(n_vertices, dtype=bool)      # vertices already in the tree
    ancestor = -np.ones(n_vertices, dtype=int)      # tree endpoint of that cheapest edge

    v, s = 0, 0.0
    for _ in range(n_vertices - 1):
        visited[v] = True
        # Vertices that are closer to v than to the rest of the tree now hang off v.
        ancestor[dst > adj_matrix[v]] = v
        dst = np.minimum(dst, adj_matrix[v])
        dst[visited] = infty

        # Attach the closest outside vertex and account for the edge that brought it in.
        v = np.argmin(dst)
        s += adj_matrix[v][ancestor[v]] ** alpha

    return float(s)

def process_string(sss):
    """Turn every line break into a space, then replace each two-space pair with one space.

    The pair replacement is a single pass, so a run of three or more spaces is
    shortened but not fully collapsed.
    """
    single_line = sss.replace('\n', ' ')
    return single_line.replace('  ', ' ')

class PHD():
    """Persistent Homology Dimension (PH-dim) estimator for a point cloud.

    The estimate comes from the slope of log(minimum-spanning-tree length) against
    log(subsample size), measured over random subsamples of the cloud.
    """

    def __init__(self, alpha=1.0, metric='euclidean', n_reruns=3, n_points=7, n_points_min=3):
        """Initializes the instance of PH-dim computer.

        Parameters:
            1) alpha --- real-valued parameter Alpha for computing PH-dim (see the reference paper).
               Alpha should be chosen lower than the ground-truth Intrinsic Dimensionality;
               however, Alpha=1.0 works just fine for our kind of data.
            2) metric --- String or Callable, distance function for the metric space
               (see documentation for scipy.spatial.distance.cdist).
            3) n_reruns --- Number of restarts of the whole calculation (each restart runs in a separate thread).
            4) n_points --- Number of random subsamples drawn per subsample size while that size is
               below half of the point cloud.
            5) n_points_min --- Number of random subsamples drawn per subsample size once that size
               reaches half of the point cloud or more.
        """
        self.alpha = alpha
        self.n_reruns = n_reruns
        self.n_points = n_points
        self.n_points_min = n_points_min
        self.metric = metric
        self.is_fitted_ = False

    def _sample_W(self, W, nSamples):
        """Return `nSamples` distinct rows of W, chosen uniformly at random."""
        n = W.shape[0]
        random_indices = np.random.choice(n, size=nSamples, replace=False)
        return W[random_indices]

    def _calc_ph_dim_single(self, W, test_n, outp, thread_id):
        """Run one restart and store the fitted log-log slope in outp[thread_id].

        For every subsample size in `test_n` the median spanning-tree length over
        several random subsamples is taken; the slope is the ordinary least-squares
        slope of log(length) on log(size).
        """
        lengths = []
        for n in test_n:
            # Large subsamples overlap heavily, so fewer repetitions are drawn for them.
            restarts = self.n_points_min if W.shape[0] <= 2 * n else self.n_points

            reruns = np.ones(restarts)
            for i in range(restarts):
                tmp = self._sample_W(W, n)
                reruns[i] = prim_tree(cdist(tmp, tmp, metric=self.metric), self.alpha)

            lengths.append(np.median(reruns))
        lengths = np.array(lengths)

        x = np.log(np.array(list(test_n)))
        y = np.log(lengths)
        N = len(x)
        # Closed-form least-squares slope.
        outp[thread_id] = (N * (x * y).sum() - x.sum() * y.sum()) / (N * (x ** 2).sum() - x.sum() ** 2)

    def fit_transform(self, X, y=None, min_points=50, max_points=512, point_jump=40):
        """Computing the PH-dim.

        Parameters:
            1) X --- point cloud of shape (n_points, n_features),
            2) y --- fictional parameter to fit with Sklearn interface
            3) min_points --- size of minimal subsample to be drawn
            4) max_points --- size of maximal subsample to be drawn (exclusive, as in range())
            5) point_jump --- step between subsamples

        Returns:
            1 / (1 - m), where m is the mean slope over the restarts, or NaN when the
            estimate cannot be computed: fewer than two subsample sizes, a subsample
            size larger than the cloud, or a restart that failed.
        """
        test_n = range(min_points, max_points, point_jump)

        # A slope needs at least two sizes, and sampling without replacement cannot draw
        # more points than the cloud holds. Previously the second case crashed inside the
        # worker threads and the zero-initialised slopes produced a bogus estimate of 1.0.
        if len(test_n) < 2 or max(test_n) > X.shape[0]:
            return np.nan

        # NaN-initialised so that a restart which dies in its thread poisons the mean
        # instead of silently contributing a slope of 0.
        ms = np.full(self.n_reruns, np.nan)
        threads = []

        for i in range(self.n_reruns):
            threads.append(Thread(target=self._calc_ph_dim_single, args=[X, test_n, ms, i]))
            threads[-1].start()

        for worker in threads:
            worker.join()

        m = np.mean(ms)
        return 1 / (1 - m)



Load the tokenizer and language model.

In [None]:
from transformers import RobertaTokenizer, RobertaModel

# Bind the module-level `tokenizer` / `model` that get_phd_single and get_mle_single read.
# NOTE(review): the next cell rebinds both names to XLM-RoBERTa, so on a top-to-bottom
# run the objects loaded here are replaced before any estimate is computed.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
# Placeholder input for the commented-out smoke test below; not used by any live code in this cell.
text = "Replace me by any text you'd like."
# encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)


Load multilingual model

In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaModel

# Rebinds the module-level `tokenizer` / `model` (previously roberta-base) to the
# multilingual XLM-RoBERTa checkpoint; every later get_*_single call uses these.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = XLMRobertaModel.from_pretrained('xlm-roberta-base')

In [None]:
"""
Our method (PHD) is stochastic, here are some magic constants for it. They are chosen specifically for text data. If you plan to use this code for something different, consider testing other values.

MIN_SUBSAMPLE       --- the size of the minimal subsample to be drawn in the procedure. Smaller values yield less statistically stable predictions.
INTERMEDIATE_POINTS --- number of subsamples to be drawn. The larger this number is, the more stable the dimension estimate for a single text is; however, the computational time is higher, too. 7 is, empirically, the best trade-off.
"""
# Passed to the PHD solver as `min_points` by get_phd_single.
MIN_SUBSAMPLE = 40
# get_phd_single divides (token count - MIN_SUBSAMPLE) by this value to get the step between subsample sizes.
INTERMEDIATE_POINTS = 7

In [None]:
def preprocess_text(text):
    """Auxiliary function: clear text of line breaks and doubled spaces.

    Line breaks and odd whitespace seem to interfere with the language model quite
    a lot. Every newline becomes a space and each two-space pair becomes one space;
    the pair replacement is a single pass, so runs of three or more spaces are
    shortened but not fully collapsed. Replace with a more sophisticated cleaner
    if needed.
    """
    without_breaks = text.replace('\n', ' ')
    return without_breaks.replace('  ', ' ')

In [None]:
def get_phd_single(text, solver):
    """Get PHD for one text.

    Parameters:
        text   --- text
        solver --- PHD computer (an object exposing fit_transform(X, min_points=, max_points=, point_jump=))

    Returns:
        real number or NumPy.nan --- intrinsic dimension of the text estimated by the
        Persistence Homology Dimension method. NaN is returned when the text has no
        more than MIN_SUBSAMPLE content tokens, because no valid subsample schedule
        exists for it.
    """
    inputs = tokenizer(preprocess_text(text), truncation=True, max_length=512, return_tensors="pt")

    # The first and last tokens (<CLS> and <SEP>) are omitted because they do not
    # directly correspond to any part of the text.
    mx_points = inputs['input_ids'].shape[1] - 2
    mn_points = MIN_SUBSAMPLE

    # Too short to subsample: the step below would be negative, every requested
    # subsample would be larger than the cloud, and the solver could not produce a
    # meaningful estimate. Bail out before paying for the forward pass.
    if mx_points <= mn_points:
        return np.nan

    with torch.no_grad():
        outp = model(**inputs)

    # At least 1 so that range() inside the solver gets a valid step.
    step = max((mx_points - mn_points) // INTERMEDIATE_POINTS, 1)

    return solver.fit_transform(outp[0][0].numpy()[1:-1], min_points=mn_points,
                                max_points=mx_points - step, point_jump=step)

def get_phd(df, train_idx, split, key='text', is_list=False, alpha=1.0):
    """Get PHD estimates for the records of one split.

    Parameters:
        df        --- iterable of records; each record is indexed by 'split' and by `key`
        train_idx --- maximum number of matching texts to process
        split     --- only records whose 'split' field equals this value are processed
        key       --- name of the record field that holds the text
        is_list   --- if True, record[key] is a sequence and its first element is the text
                      (appears in some data)
        alpha     --- parameter alpha for the PHD computation

    Returns:
        numpy.array of shape (number_of_processed_texts, 1) --- one value per processed
        text, as returned by get_phd_single.
    """
    solver = PHD(alpha=alpha, metric='euclidean', n_points=9)
    estimates = []
    for record in tqdm(df):
        # The number of collected estimates doubles as the processed-texts counter.
        if len(estimates) >= train_idx:
            break
        if record['split'] != split:
            continue
        text = record[key][0] if is_list else record[key]
        estimates.append(get_phd_single(text, solver))

    return np.array(estimates).reshape(-1, 1)

In [None]:
def get_mle_single(text, solver):
    """Get MLE for one text.

    Parameters:
        text   --- text
        solver --- MLE computer (an object exposing fit_transform(X))

    Returns:
        whatever solver.fit_transform returns for the token embeddings of the text;
        with the skdim MLE solver used in this notebook that is the intrinsic dimension
        estimated by the Maximum Likelihood Estimation method.
    """
    encoded = tokenizer(preprocess_text(text), truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        model_output = model(**encoded)

    # First output, first (only) batch item, without the leading and trailing special tokens.
    token_cloud = model_output[0][0].numpy()[1:-1]
    return solver.fit_transform(token_cloud)

def get_mle(df, train_idx, split, key='text', is_list=False):
    """Get MLE estimates for the records of one split.

    Parameters:
        df        --- iterable of records; each record is indexed by 'split' and by `key`
        train_idx --- maximum number of matching texts to process
        split     --- only records whose 'split' field equals this value are processed
        key       --- name of the record field that holds the text
        is_list   --- if True, record[key] is a sequence and its first element is the text
                      (appears in some data)

    Returns:
        numpy.array of shape (number_of_processed_texts, 1) --- one value per processed
        text, as returned by get_mle_single.
    """
    solver = MLE()
    estimates = []
    for record in tqdm(df):
        # The number of collected estimates doubles as the processed-texts counter.
        if len(estimates) >= train_idx:
            break
        if record['split'] != split:
            continue
        text = record[key][0] if is_list else record[key]
        estimates.append(get_mle_single(text, solver))

    return np.array(estimates).reshape(-1, 1)

# An example


### PHD & MLE

In [None]:
sample_text = "Speaking of festivities, there is one day in China that stands unrivaled - the first day of the Lunar New Year, commonly referred to as the Spring Festival. Even if you're generally uninterested in celebratory events, it's hard to resist the allure of the family reunion dinner, a quintessential aspect of the Spring Festival. Throughout the meal, family members raise their glasses to toast one another, expressing wishes for happiness, peace, health, and prosperity in the upcoming year."

In [None]:
# Demo: PH-dim estimate for `sample_text`, using a fresh solver configured the same way get_phd configures its own.
print("PHD estimation of the Intrinsic dimension of sample text is ", get_phd_single(sample_text, PHD(alpha=1.0, metric='euclidean', n_points=9)))

In [None]:
#name of columns

#print(reddit_data.iloc[0][0]['paraphrase_outputs']['lex_40_order_40']['output']) # the generated text is printed here

# lex_40_order_40
# lex_40_order_100
# lex_60_order_40
# lex_60_order_100
# lex_80_order_40
# lex_80_order_100

# print(reddit_data.iloc[0][1900]['split'])

# for key in reddit_data.iloc[0][0]['split'].keys():
#     print(key)
    
    
# split = []
    

# for i in range(len(reddit_data.iloc[0])):
#     if reddit_data.iloc[0][i]['split'] not in par:
#         par.append(reddit_data.iloc[0][i]['split'])
# print(par)
    

In [None]:
# Training subset
import sys

# Line-delimited JSON; the commented-out calls in this notebook pass its first row
# (reddit_data.iloc[0]) to get_phd / get_mle as the collection of records.
reddit_data = pd.read_json("/kaggle/input/gptid-data/human_gpt3_davinci_003_reddit.json_pp", lines=True)
# Upper bound on the number of texts get_phd / get_mle process; sys.maxsize means "no practical limit".
train_idx = sys.maxsize

# column_names = ['prefix', 'text', 'split', 'gen', 'paraphrase', 'lex', 'order', 'source', 'method', 'dim']

# file_path = 'gptid.csv'

# with open(file_path, 'w', newline='') as csv_file:
#     writer = csv.DictWriter(csv_file, fieldnames=column_names)
#     writer.writeheader()
#     for i in range(len(reddit_data.iloc[0])):
#         new_row = [reddit_data.iloc[0][i]['prefix'], reddit_data.iloc[0][i]['gold_completion'], reddit_data.iloc[0][i]['split'], 0, 0, 0, 0, 'gpt3', 0]
#     writer.writerow(new_row)
    
#train_idx = 10

# human_phd_train_en = pd.DataFrame(get_phd(reddit_data.iloc[0], train_idx, 'train', 'gold_completion'))
# opt_phd_train_en = pd.DataFrame(get_phd(reddit_data.iloc[0], train_idx, 'train', 'gen_completion'))
# human_phd_train_en.to_csv('phd_reddit_human_train', index=False)
# opt_phd_train_en.to_csv("phd_reddit_gpt3_train", index=False)

# human_phd_test_en = pd.DataFrame(get_phd(reddit_data.iloc[0], train_idx, 'test', 'gold_completion'))
# opt_phd_test_en = pd.DataFrame(get_phd(reddit_data.iloc[0], train_idx, 'test', 'gen_completion'))
# human_phd_test_en.to_csv('phd_reddit_human_test', index=False)
# opt_phd_test_en.to_csv("phd_reddit_gpt3_test", index=False)


# needed for collecting the dimension data
# human_phd_val_en = pd.DataFrame(get_phd(reddit_data.iloc[0], train_idx, 'validation', 'gold_completion'))
# opt_phd_val_en = pd.DataFrame(get_phd(reddit_data.iloc[0], train_idx, 'validation', 'gen_completion'))
# human_phd_val_en.to_csv('phd_reddit_human_val', index=False)
# opt_phd_val_en.to_csv("phd_reddit_gpt3_val", index=False)




mle

In [None]:


# human_mle_train_en = pd.DataFrame(get_mle(reddit_data.iloc[0], train_idx, 'train', 'gold_completion'))
# opt_mle_train_en = pd.DataFrame(get_mle(reddit_data.iloc[0], train_idx, 'train', 'gen_completion'))
# human_mle_train_en.to_csv('mle_reddit_human_train', index=False)
# opt_mle_train_en.to_csv("mle_reddit_gpt3_train", index=False)

# human_mle_test_en = pd.DataFrame(get_mle(reddit_data.iloc[0], train_idx, 'test', 'gold_completion'))
# opt_mle_test_en = pd.DataFrame(get_mle(reddit_data.iloc[0], train_idx, 'test', 'gen_completion'))
# human_mle_test_en.to_csv('mle_reddit_human_test', index=False)
# opt_mle_test_en.to_csv("mle_reddit_gpt3_test", index=False)

# human_mle_val_en = pd.DataFrame(get_mle(reddit_data.iloc[0], train_idx, 'validation', 'gold_completion'))
# opt_mle_val_en = pd.DataFrame(get_mle(reddit_data.iloc[0], train_idx, 'validation', 'gen_completion'))
# human_mle_val_en.to_csv('mle_reddit_human_val', index=False)
# opt_mle_val_en.to_csv("mle_reddit_gpt3_val", index=False)


In [None]:
# reddit_data = pd.read_json("/kaggle/input/gptid-data/human_gpt2_wikip.json_pp", lines=True)

# human_phd_train_en = pd.DataFrame(get_phd(reddit_data.iloc[0], train_idx, 'train', 'gold_completion'))
# opt_phd_train_en = pd.DataFrame(get_phd(reddit_data.iloc[0], train_idx, 'train', 'gen_completion', is_list = True))
# human_phd_train_en.to_csv('phd_reddit_human2_train', index=False)
# opt_phd_train_en.to_csv("phd_reddit_gpt2_train", index=False)

# human_phd_test_en = pd.DataFrame(get_phd(reddit_data.iloc[0], train_idx, 'test', 'gold_completion'))
# opt_phd_test_en = pd.DataFrame(get_phd(reddit_data.iloc[0], train_idx, 'test', 'gen_completion', is_list = True))
# human_phd_test_en.to_csv('phd_reddit_human2_test', index=False)
# opt_phd_test_en.to_csv("phd_reddit_gpt2_test", index=False)

# human_phd_val_en = pd.DataFrame(get_phd(reddit_data.iloc[0], train_idx, 'validation', 'gold_completion'))
# opt_phd_val_en = pd.DataFrame(get_phd(reddit_data.iloc[0], train_idx, 'validation', 'gen_completion', is_list = True))
# human_phd_val_en.to_csv('phd_reddit_human2_val', index=False)
# opt_phd_val_en.to_csv("phd_reddit_gpt2_val", index=False)


mle

In [None]:
# human_mle_train_en = pd.DataFrame(get_mle(reddit_data.iloc[0], train_idx, 'train', 'gold_completion'))
# opt_mle_train_en = pd.DataFrame(get_mle(reddit_data.iloc[0], train_idx, 'train', 'gen_completion', is_list = True))
# human_mle_train_en.to_csv('mle_reddit_human2_train', index=False)
# opt_mle_train_en.to_csv("mle_reddit_gpt2_train", index=False)

# human_mle_test_en = pd.DataFrame(get_mle(reddit_data.iloc[0], train_idx, 'test', 'gold_completion'))
# opt_mle_test_en = pd.DataFrame(get_mle(reddit_data.iloc[0], train_idx, 'test', 'gen_completion', is_list = True))
# human_mle_test_en.to_csv('mle_reddit_human2_test', index=False)
# opt_mle_test_en.to_csv("mle_reddit_gpt2_test", index=False)

# human_mle_val_en = pd.DataFrame(get_mle(reddit_data.iloc[0], train_idx, 'validation', 'gold_completion'))
# opt_mle_val_en = pd.DataFrame(get_mle(reddit_data.iloc[0], train_idx, 'validation', 'gen_completion', is_list = True))
# human_mle_val_en.to_csv('mle_reddit_human2_val', index=False)
# opt_mle_val_en.to_csv("mle_reddit_gpt2_val", index=False)

In [None]:
# wiki_data = pd.read_json("/kaggle/input/gptid-data/human_gpt3_davinci_003_wikip.json_pp", lines=True)

# human_phd_train_en =  pd.DataFrame(get_phd(wiki_data.iloc[0], train_idx, 'train', 'gold_completion'))
# opt_phd_train_en =  pd.DataFrame(get_phd(wiki_data.iloc[0], train_idx, 'train', 'gen_completion'))
# human_phd_train_en.to_csv('phd_wiki_human_train', index=False)
# opt_phd_train_en.to_csv("phd_wiki_gpt3_train", index=False)

# human_phd_test_en =  pd.DataFrame(get_phd(wiki_data.iloc[0], train_idx, 'test', 'gold_completion'))
# opt_phd_test_en =  pd.DataFrame(get_phd(wiki_data.iloc[0], train_idx, 'test', 'gen_completion'))
# human_phd_test_en.to_csv('phd_wiki_human_test', index=False)
# opt_phd_test_en.to_csv("phd_wiki_gpt3_test", index=False)

# human_phd_val_en =  pd.DataFrame(get_phd(wiki_data.iloc[0], train_idx, 'validation', 'gold_completion'))
# opt_phd_val_en =  pd.DataFrame(get_phd(wiki_data.iloc[0], train_idx, 'validation', 'gen_completion'))
# human_phd_val_en.to_csv('phd_wiki_human2_val', index=False)
# opt_phd_val_en.to_csv("phd_wiki_gpt3_val", index=False)

mle

In [None]:
# human_mle_train_en =  pd.DataFrame(get_mle(wiki_data.iloc[0], train_idx, 'train', 'gold_completion'))
# opt_mle_train_en =  pd.DataFrame(get_mle(wiki_data.iloc[0], train_idx, 'train', 'gen_completion'))
# human_mle_train_en.to_csv('mle_wiki_human_train', index=False)
# opt_mle_train_en.to_csv("mle_wiki_gpt3_train", index=False)

# human_mle_test_en =  pd.DataFrame(get_mle(wiki_data.iloc[0], train_idx, 'test', 'gold_completion'))
# opt_mle_test_en =  pd.DataFrame(get_mle(wiki_data.iloc[0], train_idx, 'test', 'gen_completion'))
# human_mle_test_en.to_csv('mle_wiki_human_test', index=False)
# opt_mle_test_en.to_csv("mle_wiki_gpt3_test", index=False)

# human_mle_val_en =  pd.DataFrame(get_mle(wiki_data.iloc[0], train_idx, 'validation', 'gold_completion'))
# opt_mle_val_en =  pd.DataFrame(get_mle(wiki_data.iloc[0], train_idx, 'validation', 'gen_completion'))
# human_mle_val_en.to_csv('mle_wiki_human2_val', index=False)
# opt_mle_val_en.to_csv("mle_wiki_gpt3_val", index=False)

In [None]:
# wiki_data = pd.read_json("/kaggle/input/gptid-data/human_opt13_wikip.json_pp", lines=True)


# human_phd_train_en =  pd.DataFrame(get_phd(wiki_data.iloc[0], train_idx, 'train', 'gold_completion'))
# opt_phd_train_en =  pd.DataFrame(get_phd(wiki_data.iloc[0], train_idx, 'train', 'gen_completion', is_list = True))
# human_phd_train_en.to_csv('phd_wiki_human13_train', index=False)
# opt_phd_train_en.to_csv("phd_wiki_opt13_train", index=False)

# human_phd_test_en = pd.DataFrame( get_phd(wiki_data.iloc[0], train_idx, 'test', 'gold_completion'))
# opt_phd_test_en =  pd.DataFrame(get_phd(wiki_data.iloc[0], train_idx, 'test', 'gen_completion', is_list = True))
# human_phd_test_en.to_csv('phd_wiki_human13_test', index=False)
# opt_phd_test_en.to_csv("phd_wiki_opt13_test", index=False)

# human_phd_val_en =  pd.DataFrame(get_phd(wiki_data.iloc[0], train_idx, 'validation', 'gold_completion'))
# opt_phd_val_en =  pd.DataFrame(get_phd(wiki_data.iloc[0], train_idx, 'validation', 'gen_completion', is_list = True))
# human_phd_val_en.to_csv('phd_wiki_human13_val', index=False)
# opt_phd_val_en.to_csv("phd_wiki_opt13_val", index=False)

mle

In [None]:
# human_mle_train_en =  pd.DataFrame(get_mle(wiki_data.iloc[0], train_idx, 'train', 'gold_completion'))
# opt_mle_train_en =  pd.DataFrame(get_mle(wiki_data.iloc[0], train_idx, 'train', 'gen_completion', is_list = True))
# human_mle_train_en.to_csv('mle_wiki_human13_train', index=False)
# opt_mle_train_en.to_csv("mle_wiki_opt13_train", index=False)

# human_mle_test_en = pd.DataFrame( get_mle(wiki_data.iloc[0], train_idx, 'test', 'gold_completion'))
# opt_mle_test_en =  pd.DataFrame(get_mle(wiki_data.iloc[0], train_idx, 'test', 'gen_completion', is_list = True))
# human_mle_test_en.to_csv('mle_wiki_human13_test', index=False)
# opt_mle_test_en.to_csv("mle_wiki_opt13_test", index=False)

# human_mle_val_en =  pd.DataFrame(get_mle(wiki_data.iloc[0], train_idx, 'validation', 'gold_completion'))
# opt_mle_val_en =  pd.DataFrame(get_mle(wiki_data.iloc[0], train_idx, 'validation', 'gen_completion', is_list = True))
# human_mle_val_en.to_csv('mle_wiki_human13_val', index=False)
# opt_mle_val_en.to_csv("mle_wiki_opt13_val", index=False)


### MLE

In [None]:
sample_text = "Speaking of festivities, there is one day in China that stands unrivaled - the first day of the Lunar New Year, commonly referred to as the Spring Festival. Even if you're generally uninterested in celebratory events, it's hard to resist the allure of the family reunion dinner, a quintessential aspect of the Spring Festival. Throughout the meal, family members raise their glasses to toast one another, expressing wishes for happiness, peace, health, and prosperity in the upcoming year."

In [None]:
# Demo: MLE estimate for `sample_text`, using a default-configured MLE solver.
print("MLE estimation of the Intrinsic dimension of sample text is ", get_mle_single(sample_text, MLE()))

## Building the classifier.

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
import math
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

def check(arr):
    """Drop rows whose estimate in column '0' is negative or NaN.

    Parameters:
        arr --- DataFrame with a numeric column named '0' (the header pandas writes
                for an unnamed single-column frame).

    Returns:
        DataFrame with the offending rows removed; the surviving rows keep their
        original index labels. Works for any index, not only the default 0..n-1 one.
    """
    values = arr['0']
    # NaN never compares below zero, so it needs its own test.
    invalid = (values < 0) | values.isna()
    return arr.loc[~invalid]
            
# Load the PHD estimates (one CSV per source and split) and drop the rows that
# `check` rejects (negative or NaN estimates).
train_df_gpt = check(pd.read_csv('/kaggle/input/dimensions/phd_wiki_gpt3_train'))
train_df_human = check(pd.read_csv('/kaggle/input/dimensions/phd_wiki_human_train'))

# Generated rows are stacked above human rows, and the labels follow that same order.
train_df = pd.concat([train_df_gpt, train_df_human], ignore_index=True).rename(columns={'0': 'dim'})
train_df['label'] = ['generated'] * len(train_df_gpt) + ['human'] * len(train_df_human)

test_df_gpt = check(pd.read_csv('/kaggle/input/dimensions/phd_wiki_gpt3_test'))
test_df_human = check(pd.read_csv('/kaggle/input/dimensions/phd_wiki_human_test'))

test_df = pd.concat([test_df_gpt, test_df_human], ignore_index=True).rename(columns={'0': 'dim'})
test_df['label'] = ['generated'] * len(test_df_gpt) + ['human'] * len(test_df_human)

validation_df_gpt = check(pd.read_csv('/kaggle/input/dimensions/phd_wiki_gpt3_val'))
validation_df_human = check(pd.read_csv('/kaggle/input/dimensions/phd_wiki_human2_val'))

validation_df = pd.concat([validation_df_gpt, validation_df_human], ignore_index=True).rename(columns={'0': 'dim'})
validation_df['label'] = ['generated'] * len(validation_df_gpt) + ['human'] * len(validation_df_human)

# The train and test files are pooled and re-split 70/30 with a fixed seed; the
# validation files stay separate and are only used for scoring.
full = pd.concat([train_df, test_df], ignore_index=True)
X_train, X_test, y_train, y_test = train_test_split(
    full['dim'], full['label'], test_size=0.3, random_state=42
)

# The estimators expect 2-D feature arrays, hence the single-column reshape.
X_train = X_train.to_numpy().reshape(-1, 1)
X_test = X_test.to_numpy().reshape(-1, 1)
y_validation = validation_df['label']
X_validation = validation_df['dim'].to_numpy().reshape(-1, 1)


# One model per classifier family, all trained on the single `dim` feature.
# The random forest and gradient boosting models get a fixed seed (the same one the
# train/test split uses) so that re-running the cell reproduces the same scores.
classifier = LogisticRegression()
random_forest_classifier = RandomForestClassifier(random_state=42)
svm_classifier = SVC()
knn_classifier = KNeighborsClassifier()
gradient_boosting_classifier = GradientBoostingClassifier(random_state=42)

# Train every classifier on the same training split.
for estimator in (
    classifier,
    random_forest_classifier,
    svm_classifier,
    knn_classifier,
    gradient_boosting_classifier,
):
    estimator.fit(X_train, y_train)

# Predictions of every fitted model on the held-out 30% split.
y_pred, rf_predictions, svm_predictions, knn_predictions, gb_predictions = (
    fitted.predict(X_test)
    for fitted in (
        classifier,
        random_forest_classifier,
        svm_classifier,
        knn_classifier,
        gradient_boosting_classifier,
    )
)

# Accuracy and per-class report for each model.
accuracy, report = accuracy_score(y_test, y_pred), classification_report(y_test, y_pred)
accuracy_rf, report_rf = accuracy_score(y_test, rf_predictions), classification_report(y_test, rf_predictions)
accuracy_svm, report_svm = accuracy_score(y_test, svm_predictions), classification_report(y_test, svm_predictions)
accuracy_knn, report_knn = accuracy_score(y_test, knn_predictions), classification_report(y_test, knn_predictions)
accuracy_gb, report_gb = accuracy_score(y_test, gb_predictions), classification_report(y_test, gb_predictions)

# One summary per model: held-out accuracy, per-class report, then accuracy on the
# separate validation split. Summaries are separated by a dashed line.
model_summaries = [
    (accuracy, "Classification Report:\n", report, classifier),
    (accuracy_rf, "Classification RF Report:\n", report_rf, random_forest_classifier),
    (accuracy_svm, "Classification SVM Report:\n", report_svm, svm_classifier),
    (accuracy_knn, "Classification KNN Report:\n", report_knn, knn_classifier),
    (accuracy_gb, "Classification GB Report:\n", report_gb, gradient_boosting_classifier),
]

for position, (test_accuracy, report_title, report_text, fitted_model) in enumerate(model_summaries):
    if position > 0:
        print("-------------------------------------------------------------")
    print("Accuracy:", test_accuracy)
    print(report_title, report_text)
    validation_accuracy = fitted_model.score(X_validation, y_validation)
    print("Validation Accuracy:", validation_accuracy)


Accuracy: 0.7828200972447326
Classification Report:
               precision    recall  f1-score   support

   generated       0.80      0.76      0.78       633
       human       0.76      0.80      0.78       601

    accuracy                           0.78      1234
   macro avg       0.78      0.78      0.78      1234
weighted avg       0.78      0.78      0.78      1234

Validation Accuracy: 0.7653846153846153
-------------------------------------------------------------
Accuracy: 0.7090761750405187
Classification RF Report:
               precision    recall  f1-score   support

   generated       0.72      0.70      0.71       633
       human       0.69      0.72      0.71       601

    accuracy                           0.71      1234
   macro avg       0.71      0.71      0.71      1234
weighted avg       0.71      0.71      0.71      1234

Validation Accuracy: 0.7019230769230769
-------------------------------------------------------------
Accuracy: 0.7925445705024311
Clas

In [6]:
# Load the MLE estimates (one CSV per source and split) and drop the rows that
# `check` rejects (negative or NaN estimates).
train_df_gpt = check(pd.read_csv('/kaggle/input/dimensions/mle_wiki_gpt3_train'))
train_df_human = check(pd.read_csv('/kaggle/input/dimensions/mle_wiki_human_train'))

# Generated rows are stacked above human rows, and the labels follow that same order.
train_df = pd.concat([train_df_gpt, train_df_human], ignore_index=True).rename(columns={'0': 'dim'})
train_df['label'] = ['generated'] * len(train_df_gpt) + ['human'] * len(train_df_human)

test_df_gpt = check(pd.read_csv('/kaggle/input/dimensions/mle_wiki_gpt3_test'))
test_df_human = check(pd.read_csv('/kaggle/input/dimensions/mle_wiki_human_test'))

test_df = pd.concat([test_df_gpt, test_df_human], ignore_index=True).rename(columns={'0': 'dim'})
test_df['label'] = ['generated'] * len(test_df_gpt) + ['human'] * len(test_df_human)

validation_df_gpt = check(pd.read_csv('/kaggle/input/dimensions/mle_wiki_gpt3_val'))
validation_df_human = check(pd.read_csv('/kaggle/input/dimensions/mle_wiki_human2_val'))

validation_df = pd.concat([validation_df_gpt, validation_df_human], ignore_index=True).rename(columns={'0': 'dim'})
validation_df['label'] = ['generated'] * len(validation_df_gpt) + ['human'] * len(validation_df_human)

# The train and test files are pooled and re-split 70/30 with a fixed seed; the
# validation files stay separate and are only used for scoring.
full = pd.concat([train_df, test_df], ignore_index=True)
X_train, X_test, y_train, y_test = train_test_split(
    full['dim'], full['label'], test_size=0.3, random_state=42
)

# The estimators expect 2-D feature arrays, hence the single-column reshape.
X_train = X_train.to_numpy().reshape(-1, 1)
X_test = X_test.to_numpy().reshape(-1, 1)
y_validation = validation_df['label']
X_validation = validation_df['dim'].to_numpy().reshape(-1, 1)


# One model per classifier family, all trained on the single `dim` feature.
# The random forest and gradient boosting models get a fixed seed (the same one the
# train/test split uses) so that re-running the cell reproduces the same scores.
classifier = LogisticRegression()
random_forest_classifier = RandomForestClassifier(random_state=42)
svm_classifier = SVC()
knn_classifier = KNeighborsClassifier()
gradient_boosting_classifier = GradientBoostingClassifier(random_state=42)

# Train every classifier on the same training split.
for estimator in (
    classifier,
    random_forest_classifier,
    svm_classifier,
    knn_classifier,
    gradient_boosting_classifier,
):
    estimator.fit(X_train, y_train)

# Predictions of every fitted model on the held-out 30% split.
y_pred, rf_predictions, svm_predictions, knn_predictions, gb_predictions = (
    fitted.predict(X_test)
    for fitted in (
        classifier,
        random_forest_classifier,
        svm_classifier,
        knn_classifier,
        gradient_boosting_classifier,
    )
)

# Accuracy and per-class report for each model.
accuracy, report = accuracy_score(y_test, y_pred), classification_report(y_test, y_pred)
accuracy_rf, report_rf = accuracy_score(y_test, rf_predictions), classification_report(y_test, rf_predictions)
accuracy_svm, report_svm = accuracy_score(y_test, svm_predictions), classification_report(y_test, svm_predictions)
accuracy_knn, report_knn = accuracy_score(y_test, knn_predictions), classification_report(y_test, knn_predictions)
accuracy_gb, report_gb = accuracy_score(y_test, gb_predictions), classification_report(y_test, gb_predictions)

# One summary per model: held-out accuracy, per-class report, then accuracy on the
# separate validation split. Summaries are separated by a dashed line.
model_summaries = [
    (accuracy, "Classification Report:\n", report, classifier),
    (accuracy_rf, "Classification RF Report:\n", report_rf, random_forest_classifier),
    (accuracy_svm, "Classification SVM Report:\n", report_svm, svm_classifier),
    (accuracy_knn, "Classification KNN Report:\n", report_knn, knn_classifier),
    (accuracy_gb, "Classification GB Report:\n", report_gb, gradient_boosting_classifier),
]

for position, (test_accuracy, report_title, report_text, fitted_model) in enumerate(model_summaries):
    if position > 0:
        print("-------------------------------------------------------------")
    print("Accuracy:", test_accuracy)
    print(report_title, report_text)
    validation_accuracy = fitted_model.score(X_validation, y_validation)
    print("Validation Accuracy:", validation_accuracy)

# Validate the classifier on validation data.
# NOTE(review): this repeats the logistic-regression validation score already printed
# in the first summary, and it appears right after the GB summary.
validation_accuracy = classifier.score(X_validation, y_validation)
print("Validation Accuracy:", validation_accuracy)


Accuracy: 0.8170040485829959
Classification Report:
               precision    recall  f1-score   support

   generated       0.83      0.81      0.82       641
       human       0.80      0.82      0.81       594

    accuracy                           0.82      1235
   macro avg       0.82      0.82      0.82      1235
weighted avg       0.82      0.82      0.82      1235

Validation Accuracy: 0.8211538461538461
-------------------------------------------------------------
Accuracy: 0.7894736842105263
Classification RF Report:
               precision    recall  f1-score   support

   generated       0.81      0.78      0.79       641
       human       0.77      0.80      0.78       594

    accuracy                           0.79      1235
   macro avg       0.79      0.79      0.79      1235
weighted avg       0.79      0.79      0.79      1235

Validation Accuracy: 0.8
-------------------------------------------------------------
Accuracy: 0.8307692307692308
Classification SVM 