In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

## BFS - Feature set 1

In [None]:
obf_df = pd.read_csv("obf_bf.csv")
out_df = pd.read_csv("real_bf.csv")
obf_df['class']='obf'
out_df['class']='real'
df = pd.concat([obf_df,out_df])
df = df.iloc[: , 1:]
df.drop('filename', axis=1, inplace=True)
training_set, test_set = train_test_split(df, test_size = 0.2, random_state = 1)
X = df.iloc[:,0:188]
Y = df.iloc[:,188]
X_train = training_set.iloc[:,0:188]
Y_train = training_set.iloc[:,188]
X_test = test_set.iloc[:,0:188]
Y_test = test_set.iloc[:,188]

### Decision Tree

In [None]:
classifier = DecisionTreeClassifier(random_state = 1)
classifier.fit(X_train,Y_train)

DecisionTreeClassifier(random_state=1)

In [None]:
Y_pred = classifier.predict(X_test)

test_set["Tree"] = Y_pred

In [None]:
print(accuracy_score(Y_test,Y_pred))

0.9463087248322147


In [None]:
parameters = {
    'criterion': ('gini','entropy'), 
    'splitter': ('best','random'), 
    'max_depth': [3,4,5,8,10,13,15,20,28,35,42,50], 
    'min_samples_split': [2,3,5,8,10,15],
    'max_features':range(1,189),
    'min_samples_leaf' : [1, 2, 5, 10]
}
optimization = GridSearchCV(classifier, parameters,cv=7)
optimization.fit(X_train, Y_train)
params = optimization.best_params_

In [None]:
print(params)

{'criterion': 'gini', 'max_depth': 3, 'max_features': 33, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}


In [None]:
classifier = DecisionTreeClassifier(criterion='gini',max_depth=3,max_features=33,min_samples_leaf=1,min_samples_split=2,splitter='best')
classifier.fit(X_train,Y_train)
Y_pred = classifier.predict(X_test)

test_set["H-Tree"] = Y_pred

In [None]:
print(accuracy_score(Y_test,Y_pred))

0.9530201342281879


### Random Forest

In [None]:
classifier = RandomForestClassifier(random_state = 1)
classifier.fit(X_train,Y_train)

RandomForestClassifier(random_state=1)

In [None]:
Y_pred = classifier.predict(X_test)

test_set["Forest"] = Y_pred

In [None]:
print(accuracy_score(Y_test,Y_pred))

0.9664429530201343


In [None]:
parameters = {
    'n_estimators': range(10,300,5),
    'criterion': ('gini','entropy'), 
    'max_depth': [3,4,5,8,10,13,15,20,28,35,42,50], 
    'min_samples_split': [2,3,5,8,10,15],
    'max_features':range(1,189),
    'min_samples_leaf' : [1, 2, 5, 10],
    'bootstrap': [True, False]
}
optimization = RandomizedSearchCV(classifier, parameters,cv=7,n_iter=300)
optimization.fit(X_train, Y_train)
params = optimization.best_params_

In [None]:
print(params)

{'n_estimators': 55, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 32, 'max_depth': 13, 'criterion': 'entropy', 'bootstrap': False}


In [None]:
classifier = RandomForestClassifier(n_estimators=55,min_samples_split=5,min_samples_leaf=1,max_features=32,max_depth=13,criterion='entropy',bootstrap=False)
classifier.fit(X_train,Y_train)
Y_pred = classifier.predict(X_test)

test_set["H-Forest"] = Y_pred

In [None]:
print(accuracy_score(Y_test,Y_pred))

0.9664429530201343


### SVM

In [None]:
classifier = SVC()
classifier.fit(X_train,Y_train)

SVC()

In [None]:
Y_pred = classifier.predict(X_test)

test_set["SVM"] = Y_pred

In [None]:
print(accuracy_score(Y_test,Y_pred))

0.9395973154362416


In [None]:
parameters = {
    'kernel':('linear','rbf','poly', 'sigmoid'),
    'C': [0.1, 1, 10, 100, 1000],
    'gamma' : [0.001, 0.01, 0.1, 1, 10, 100],
    'degree' : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}
optimization = GridSearchCV(classifier, parameters,cv=7)
optimization.fit(X_train, Y_train)
params = optimization.best_params_
print(params)

In [None]:
classifier = SVC(C=0.1, degree=1, gamma=0.01, kernel='poly')
classifier.fit(X_train,Y_train)
Y_pred = classifier.predict(X_test)

test_set["H-SVM"] = Y_pred

In [None]:
print(accuracy_score(Y_test,Y_pred))

0.9731543624161074


## SFS - Feature set 2

In [None]:
obf_df = pd.read_csv("obf_sfs.csv")
out_df = pd.read_csv("real_sfs.csv")
obf_df['class']='obf'
out_df['class']='real'
df = pd.concat([obf_df,out_df])
df = df.iloc[: , 1:]
df.drop('filename', axis=1, inplace=True)
training_set, test_set = train_test_split(df, test_size = 0.2, random_state = 1)
X = df.iloc[:,0:160]
Y = df.iloc[:,160]
X_train = training_set.iloc[:,0:160]
Y_train = training_set.iloc[:,160]
X_test = test_set.iloc[:,0:160]
Y_test = test_set.iloc[:,160]

### Decision Tree

In [None]:
classifier = DecisionTreeClassifier(random_state = 1)
classifier.fit(X_train,Y_train)
Y_pred = classifier.predict(X_test)

test_set["Tree"] = Y_pred

In [None]:
print(accuracy_score(Y_test,Y_pred))

0.9328859060402684


In [None]:
parameters = {
    'criterion': ('gini','entropy'), 
    'splitter': ('best','random'), 
    'max_depth': [3,4,5,8,10,13], 
    'min_samples_split': [2,3,5,8,10,15],
    'max_features':range(3,35),
    'min_samples_leaf' : [1, 2, 5, 10]
}
optimization = GridSearchCV(classifier, parameters,cv=7)
optimization.fit(X_train, Y_train)
params = optimization.best_params_
print(params)

{'criterion': 'gini', 'max_depth': 5, 'max_features': 14, 'min_samples_leaf': 5, 'min_samples_split': 2, 'splitter': 'best'}


In [None]:
classifier = DecisionTreeClassifier(criterion='gini',max_depth=5,max_features=14,min_samples_leaf=5,min_samples_split=2,splitter='best')
classifier.fit(X_train,Y_train)
Y_pred = classifier.predict(X_test)

test_set["H-Tree"] = Y_pred

In [None]:
print(accuracy_score(Y_test,Y_pred))

0.9530201342281879


### Random Forest

In [None]:
classifier = RandomForestClassifier(random_state = 1)
classifier.fit(X_train,Y_train)
Y_pred = classifier.predict(X_test)

test_set["Forest"] = Y_pred

In [None]:
print(accuracy_score(Y_test,Y_pred))

0.9664429530201343


In [None]:
parameters = {
    'n_estimators': range(10,70,2),
    'criterion': ('gini','entropy'), 
    'max_depth': [3,4,5,8,10,13,15,20,28,35,42,50], 
    'min_samples_split': [2,3,5,8,10,15],
    'max_features':range(1,50),
    'min_samples_leaf' : [1, 2, 5],
    'bootstrap': [True, False]
}
optimization = RandomizedSearchCV(classifier, parameters,cv=7,n_iter=500)
optimization.fit(X_train, Y_train)
params = optimization.best_params_
print(params)

{'n_estimators': 26, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 44, 'max_depth': 35, 'criterion': 'entropy', 'bootstrap': False}


In [None]:
classifier = RandomForestClassifier(n_estimators=56,min_samples_split=2,min_samples_leaf=1,max_features=28,max_depth=8,criterion='entropy',bootstrap=False)
classifier.fit(X_train,Y_train)
Y_pred = classifier.predict(X_test)

test_set["H-Forest"] = Y_pred

In [None]:
print(accuracy_score(Y_test,Y_pred))

0.959731543624161


### SVM

In [None]:
classifier = SVC()
classifier.fit(X_train,Y_train)
Y_pred = classifier.predict(X_test)

test_set["SVM"] = Y_pred

In [None]:
print(accuracy_score(Y_test,Y_pred))

0.9395973154362416


In [None]:
parameters = {
    'kernel':('linear','rbf','poly', 'sigmoid'),
    'C': [0.1, 1, 10, 100, 1000],
    'gamma' : [0.001, 0.01, 0.1, 1, 10, 100],
    'degree' : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}
optimization = GridSearchCV(classifier, parameters,cv=7)
optimization.fit(X_train, Y_train)
params = optimization.best_params_
print(params)

{'C': 0.1, 'degree': 0, 'gamma': 0.001, 'kernel': 'linear'}


In [None]:
classifier = SVC(C=0.1, degree=0, gamma=0.001, kernel='linear')
classifier.fit(X_train,Y_train)
Y_pred = classifier.predict(X_test)

test_set["H-SVM"] = Y_pred

In [None]:
print(accuracy_score(Y_test,Y_pred))

0.9865771812080537


## QUILLBOT - BF

In [None]:
obf_df = pd.read_csv("obf_bf.csv")
out_df = pd.read_csv("real_bf.csv")
quill_df = pd.read_csv("quillbot_bf.csv")

obf_df['class']='obf'
out_df['class']='real'
quill_df['class']='obf'

df = pd.concat([obf_df,out_df])
df = df.iloc[: , 1:]
df.drop('filename', axis=1, inplace=True)
training_set, test_set = train_test_split(df, test_size = 0.2, random_state = 1)
X1 = df.iloc[:,0:188]
Y1 = df.iloc[:,188]
X1_train = training_set.iloc[:,0:188]
Y1_train = training_set.iloc[:,188]
X1_test = test_set.iloc[:,0:188]
Y1_test = test_set.iloc[:,188]

df = pd.concat([quill_df,out_df])
df = df.iloc[: , 1:]
df.drop('filename', axis=1, inplace=True)
training_set, test_set = train_test_split(df, test_size = 0.2, random_state = 1)
X2 = df.iloc[:,0:188]
Y2 = df.iloc[:,188]
X2_train = training_set.iloc[:,0:188]
Y2_train = training_set.iloc[:,188]
X2_test = test_set.iloc[:,0:188]
Y2_test = test_set.iloc[:,188]


In [None]:
classifier = SVC()
classifier.fit(X2_train,Y2_train)

SVC()

In [None]:
Y2_pred = classifier.predict(X2_test)

test_set["SVM"] = Y2_pred

In [None]:
print(accuracy_score(Y2_test,Y2_pred))

0.9395973154362416


In [None]:
classifier = SVC(C=0.1, degree=1, gamma=0.01, kernel='poly')
classifier.fit(X2_train,Y2_train)
Y2_pred = classifier.predict(X2_test)

test_set["H-SVM"] = Y2_pred

In [None]:
print(accuracy_score(Y2_test,Y2_pred))

0.9664429530201343


## QUILLBOT - SFS

In [None]:
obf_df = pd.read_csv("obf_sfs.csv")
out_df = pd.read_csv("real_sfs.csv")
quill_df = pd.read_csv("quillbot_sfs.csv")

obf_df['class']='obf'
out_df['class']='real'
quill_df['class']='obf'

df = pd.concat([obf_df,out_df])
df = df.iloc[: , 1:]
df.drop('filename', axis=1, inplace=True)
training_set, test_set = train_test_split(df, test_size = 0.2, random_state = 1)
X1 = df.iloc[:,0:160]
Y1 = df.iloc[:,160]
X1_train = training_set.iloc[:,0:160]
Y1_train = training_set.iloc[:,160]
X1_test = test_set.iloc[:,0:160]
Y1_test = test_set.iloc[:,160]

df = pd.concat([quill_df,out_df])
df = df.iloc[: , 1:]
df.drop('filename', axis=1, inplace=True)
training_set, test_set = train_test_split(df, test_size = 0.2, random_state = 1)
X2 = df.iloc[:,0:160]
Y2 = df.iloc[:,160]
X2_train = training_set.iloc[:,0:160]
Y2_train = training_set.iloc[:,160]
X2_test = test_set.iloc[:,0:160]
Y2_test = test_set.iloc[:,160]

In [None]:
classifier = SVC()
classifier.fit(X1_train,Y1_train)

SVC()

In [None]:
Y2_pred = classifier.predict(X2_test)

test_set["SVM"] = Y2_pred

In [None]:
print(accuracy_score(Y2_test,Y2_pred))

0.9395973154362416


In [None]:
classifier = SVC(C=0.1, degree=0, gamma=0.001, kernel='linear')
classifier.fit(X1_train,Y1_train)
Y2_pred = classifier.predict(X2_test)

test_set["H-SVM"] = Y2_pred

In [None]:
print(accuracy_score(Y2_test,Y2_pred))

0.9194630872483222


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=dd350cc0-81ed-480c-baaf-f203ee8ea981' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>