## Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import xgboost as xgb

## Read data and take a look

Let's start by looking at what columns we have, what their data types are and how many null-values there are.

In [2]:
df = pd.read_csv(
    './outputSURF-AI-testset.csv',
    sep=';'
)

df.drop_duplicates(inplace=True, ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1601 entries, 0 to 1600
Data columns (total 80 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   FacultyName                    1600 non-null   object 
 1   CourseName                     1600 non-null   object 
 2   PredictionSurf                 1195 non-null   object 
 3   PredictionInstitution          876 non-null    object 
 4   PredictionRemark               360 non-null    object 
 5   CorrectDoi                     69 non-null     object 
 6   CorrectISBN                    71 non-null     object 
 7   AnalyseError                   385 non-null    object 
 8   CorrectAnalyseSurf             1601 non-null   int64  
 9   CorrectAnalyseInstitution      1601 non-null   int64  
 10  AnalyseISBN                    1600 non-null   float64
 11  AnalyseDOI                     1600 non-null   float64
 12  id                             1600 non-null   f

#### The following columns contain only null values and will not be used:

<ul>
    <li> 'issn'
    <li> 'usedpages'
    <li> 'filepath'
    <li> 'runidentifier'
    <li> '_10_pics_page'
    <li> 'Kolom1'
</ul>


In [3]:
df.dropna(axis=1, how="all", inplace=True)

#### The following columns concern the predictions and will not be used as input:

<ul>
    <li> 'PredictionSurf'
    <li> 'PredictionInstitution' (ground truth)
    <li> 'PredictionRemark'
    <li> 'prediction'
</ul>

The column 'prediction' will be used as label, therefore all rows where this column has a null-value will be dropped. The other of these columns will be dropped completely.

In [4]:
df.drop(['PredictionSurf', 'PredictionInstitution', 'PredictionRemark'], axis="columns", inplace=True)
df.dropna(subset=['prediction'], inplace=True)

#### The following columns seem to contain identifiers and the like:

<ul>
    <li> 'FacultyName'
    <li> 'CourseName'
    <li> 'CourseName'
    <li> 'CorrectDoi'
    <li> 'CorrectISBN'
    <li> 'AnalyseError'
    <li> 'CorrectAnalyseSurf'
    <li> 'CorrectAnalyseInstitution'
    <li> 'AnalyseISBN'
    <li> 'CorrectAnalyseInstitution'
    <li> 'AnalyseDOI'
    <li> 'id'
    <li> 'uuid'
    <li> 'url'
    <li> 'filesource'
    <li> 'filestatus'
    <li> 'filemimetype'
    <li> 'filename'
    <li> 'filehash'
    <li> 'filedate'
    <li> 'lastmodifieddate'
    <li> 'creator'
    <li> 'isfilepublished'
    <li> 'filescanresults'
    <li> 'doi'
    <li> 'isbn'
    <li> 'author'
    <li> 'title'
    <li> 'publicationyear'
    <li> 'filetype'
    <li> 'oclcnumber'
</ul>

These columns will not be used.

In [5]:
identifier_cols = ['FacultyName', 'CourseName', 'CorrectDoi', 'CorrectISBN',
'AnalyseError', 'CorrectAnalyseSurf', 'CorrectAnalyseInstitution',
'AnalyseISBN', 'AnalyseDOI', 'id', 'uuid', 'url', 'filesource',
'filestatus', 'filemimetype', 'filename', 'filehash', 'filedate',
'lastmodifieddate', 'creator', 'isfilepublished', 'filescanresults',
'doi', 'isbn', 'author', 'title', 'publicationyear',
'filetype', 'oclcnumber']

df.drop(identifier_cols, axis="columns", inplace=True)

#### Some columns only have one possible value

These columns can not be used to distinguish between datapoints and will therefore not be used.

In [6]:
for col in df.columns:
    if len(df[col].unique()) == 1:
        print(f"{col}\t\t\t{df[col].unique()}")
        df.drop(col, axis="columns", inplace=True)

sourcepagecount			[0.]
sourcewordcount			[0.]
userexcludedforscan			[False]
usedmultiplesources			[False]
always			[True]
file_ext_mp3_wav			[False]
file_ext_mp4_mov			[False]
IsJournalWords8000			[False]
Publisher_from_crossref			[False]


## Let's take another look at our columns

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1280 entries, 0 to 1599
Data columns (total 33 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   wordcount                      1280 non-null   float64
 1   pagecount                      1280 non-null   float64
 2   publisher                      71 non-null     object 
 3   incollection                   1280 non-null   object 
 4   isopenaccesstitle              1280 non-null   object 
 5   openaccesslink                 62 non-null     object 
 6   picturecount                   1280 non-null   float64
 7   prediction                     1280 non-null   object 
 8   reliability                    1280 non-null   float64
 9   jstor                          1280 non-null   object 
 10  DOI_in_OA                      1280 non-null   object 
 11  DOI_no_PPT                     1280 non-null   object 
 12  PPT_in_name                    1280 non-null   o

#### Let's take a separate look at columns with dtype object

In [8]:
object_cols = [col for col in df.columns if df[col].dtype == 'object']
len(object_cols)

29

Most columns with dtype object contain boolean values. We will change these to 0s and 1s and change their dtype to int.

In [10]:
for col in object_cols:
    if set(df[col].dropna().unique()) == {False, True}:
        df.loc[df[col] == True, col] = 1
        df.loc[df[col] == False, col] = 0
        df[col] = df[col].fillna(0)
        df[col] = df[col].astype("int")
    elif len(set(df[col].dropna().unique())) == 1:
        df.drop(col, axis=1, inplace=True)

## Make a profiling report

Uncomment and run this cell to get a pandas profiling report. This will show nicely which features are correlated.

In [11]:
# import sys
# !{sys.executable} -m pip install -U pandas-profiling[notebook]
# !jupyter nbextension enable --py widgetsnbextension
# from pandas_profiling import ProfileReport

# profile = ProfileReport(df.reset_index(drop=True), title="Pandas Profiling Report")
# profile.to_file("pandas_report1.html")

#### Last columns of dtype object

The last three columns of dtype object consist of 2 non-boolean features and 1 column with the predictions. 

In [12]:
object_cols = [col for col in df.columns if df[col].dtype == 'object']
len(object_cols)

3

In [13]:
for col in object_cols:
    print(f"column name: {col}")
    print(f"number of unique values: {len(df[col].unique())}")
    print(f"unique values: {df[col].unique()}")
    print(f"number of null values: {df[col].isna().sum()}")
    print("----------------------")

column name: publisher
number of unique values: 51
unique values: ['Trans Tech' 'SAGE Publications, Inc' nan 'SAGE' 'John Wiley & Sons'
 'Rockport Publishers' 'Laurence King Publishing' 'Wiley'
 'Routledge, an imprint of the Taylor & Francis Group'
 'Bloomsbury Academic' 'Taylor & Francis' 'Island Press' 'Routledge'
 'BirkhÃ¤user' 'Earthscan co-published with RIBA Publishing'
 'Routledge/Taylor& Francis Group' 'United Nations' 'the MIT Press'
 'Edition Detail' 'Getty Conservation Institute'
 'The Getty Conservation Institute' 'Cultural Heritage Agency'
 'TU Delft - Heritage & Architecture' 'Univ.-Bibl.'
 'Technische Universiteit Eindhoven, Faculteit Bouwkunde]' 'ICOMOS'
 'Taylor and Francis' 'Princeton Architectural Press' 'BirkhaÌˆuser'
 'Wiley-Blackwell' 'John Wiley & Sons, Incorporated.' 'Birkhauser'
 'Metropolitan Books, Henry Holt and Co.'
 'TU Delft, Heritage & Architecture' 'Penguin Books' 'TU Delft'
 'De Gruyter' 'Ios Press' 'VSSD' 'Routledge Taylor & Francis Group'
 'AEI Press

#### Drop openaccesslink

The 'openaccesslink' feature has 60 unique values, which are uniformly distributed and only once or twice each. This feature has 95% missing values; whether this value is missing highly correlates with the value of 'isopenaccesstitle'.

In [14]:
oa = df.loc[df["isopenaccesstitle"] == 1]
print(f"Number of datapoints that are open access titles:\t\t{len(oa)}.\nNumber of open access titles that have an open access link:\t{oa['openaccesslink'].notna().sum()}.")

print("==========================================================================")

not_oa = df.loc[df["isopenaccesstitle"] == 0]
print(f"Number of datapoints that are NOT open access titles:\t\t{len(not_oa)}.\nNumber of non open access titles that have an open access link:\t{not_oa['openaccesslink'].notna().sum()}.")

Number of datapoints that are open access titles:		68.
Number of open access titles that have an open access link:	62.
Number of datapoints that are NOT open access titles:		1212.
Number of non open access titles that have an open access link:	0.


In [15]:
df.drop("openaccesslink", axis="columns", inplace=True)

#### Drop publisher

The 'publisher' feature is highly correlated with many features, such as wordcount, contains_researchgate, Words_more_300pp and several others. It also has 95% missing values and 38 unique values.

In [16]:
df.drop("publisher", axis="columns", inplace=True)

In [17]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True, ignore_index=True)

## XGBoost model

### High correlation

There are many features with high correlation. Since tree-based models are not so sensitive to this, let's train XGBoost on our data. This model achieves both accuracy and F1 score of 0.98.

In [19]:
x = df.drop("prediction", axis="columns").to_numpy()

print(x.shape)
print(x[:3])

(1158, 30)
[[1.30000e+01 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00
  0.00000e+00 1.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00]
 [1.35151e+05 4.14000e+02 0.00000e+00 0.00000e+00 1.34000e+02 1.00000e+01
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00]
 [7.43790e+04 1.94000e+02 0.00000e+00 0.00000e+00 1.94000e+02 1.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 0.00000e+00

In [20]:
label_encoder = LabelEncoder()

y = label_encoder.fit_transform(df["prediction"])

print(y.shape)
print(y[:3])

(1158,)
[2 4 4]


In [21]:
df["prediction"].unique().shape

(8,)

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

print(len(y_train))
print(len(y_test))


810
348


In [23]:
xgb_model = xgb.XGBClassifier()
xgb_model.fit(x_train, y_train)

predictions = xgb_model.predict(x_test)
report = classification_report(y_test, predictions, target_names=label_encoder.classes_, labels=np.unique(y_train))
print(report)

                                  precision    recall  f1-score   support

        eigen materiaal - overig       0.86      1.00      0.92        18
    eigen materiaal - powerpoint       1.00      1.00      1.00        26
eigen materiaal - titelindicatie       1.00      1.00      1.00        52
                  korte overname       1.00      0.93      0.97        60
                  lange overname       1.00      0.97      0.99        40
            middellange overname       1.00      0.97      0.99       113
                        onbekend       0.83      1.00      0.91        24
                     open access       1.00      1.00      1.00        15

                        accuracy                           0.98       348
                       macro avg       0.96      0.99      0.97       348
                    weighted avg       0.98      0.98      0.98       348



#### With K-fold

Use K-fold to confirm that the accuracy and F1 is actually that high.

In [24]:
skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)

accuracy_scores = 0.0
f1_scores = 0.0

for train_index, test_index in skf.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    xgb_model = xgb.XGBClassifier()
    xgb_model.fit(x_train, y_train)
    predictions = xgb_model.predict(x_test)
    accuracy_scores += accuracy_score(y_test, predictions)
    f1_scores += f1_score(y_test, predictions, average="macro")

print(accuracy_scores/6)
print(f1_scores/6)

0.9827288428324698
0.9800396079006833


## Alternative classifiers

### Train another model, an SVC, to compare the results.

In [25]:
scaler = StandardScaler()
scaled_x = scaler.fit_transform(x)

accuracy_scores = 0.0
f1_scores = 0.0

for train_index, test_index in skf.split(scaled_x, y):
    x_train, x_test = scaled_x[train_index], scaled_x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    svc_model = SVC()
    svc_model.fit(x_train, y_train)
    predictions = svc_model.predict(x_test)
    accuracy_scores += accuracy_score(y_test, predictions)
    f1_scores += f1_score(y_test, predictions, average="macro")

print(accuracy_scores/6)
print(f1_scores/6)

0.9689119170984455
0.966884247502869


### Random Forest

In [26]:
accuracy_scores = 0.0
f1_scores = 0.0

for train_index, test_index in skf.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    rf_model = RandomForestClassifier()
    rf_model.fit(x_train, y_train)
    predictions = rf_model.predict(x_test)
    accuracy_scores += accuracy_score(y_test, predictions)
    f1_scores += f1_score(y_test, predictions, average="macro")

print(accuracy_scores/6)
print(f1_scores/6)

0.971502590673575
0.9641153080320543


### Logistic Regression

In [27]:
scaler = StandardScaler()
scaled_x = scaler.fit_transform(x)

accuracy_scores = 0.0
f1_scores = 0.0

for train_index, test_index in skf.split(scaled_x, y):
    x_train, x_test = scaled_x[train_index], scaled_x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    lr_model = LogisticRegression()
    lr_model.fit(x_train, y_train)
    predictions = lr_model.predict(x_test)
    accuracy_scores += accuracy_score(y_test, predictions)
    f1_scores += f1_score(y_test, predictions, average="macro")

print(accuracy_scores/6)
print(f1_scores/6)

0.9835924006908462
0.9785058462360072


## Feature importance

XGBoost has another benefit: it is straightforward to retrieve feature importance scores. Let's take a look at those.

In [28]:
feature_names = df.drop("prediction", axis="columns").columns.to_numpy()
feature_importance_scores = xgb_model.feature_importances_


#### Top 5 highest

In [29]:
top_five_indices = np.argpartition(feature_importance_scores, (-5, -1))[-5:]
top_five_scores = feature_importance_scores[top_five_indices][::-1]
top_five_names = feature_names[top_five_indices][::-1]

for name, score in zip(top_five_names, top_five_scores):
    print(f"{name},   {score}")

_10Pagecount50,   0.3867165446281433
PPT_in_name,   0.08829636126756668
isopenaccesstitle,   0.05646290257573128
DOI_in_OA,   0.12333004176616669
keyword_creator,   0.05074840039014816


#### Score is zero

Some of the features have importance scores of 0. When we leave them out, we get the same results.

In [30]:
zero_score_indices = np.argwhere(feature_importance_scores == 0).flatten()
zero_score_features = feature_names[zero_score_indices]

for feature in zero_score_features:
    print(feature)

incollection
jstor
wordcount_o
Contains_DOI
Kleiner_10_paginas
Contains_sciencemag
Pagecount_bigger_50
Contains_to_appear_in
Contains_recommended_citation


In [31]:
features_to_remove = list(zero_score_features)
features_to_remove.append("prediction")

x = df.drop(features_to_remove, axis="columns").to_numpy()

print(x.shape)
print(x[:3])

(1158, 21)
[[1.30000e+01 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00]
 [1.35151e+05 4.14000e+02 0.00000e+00 1.34000e+02 1.00000e+01 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00]
 [7.43790e+04 1.94000e+02 0.00000e+00 1.94000e+02 1.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.00000e+00]]


In [32]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

print(len(y_train))
print(len(y_test))


810
348


In [33]:
xgb_model = xgb.XGBClassifier()
xgb_model.fit(x_train, y_train)

In [34]:
predictions = xgb_model.predict(x_test)

report = classification_report(y_test, predictions, target_names=label_encoder.classes_)
print(report)

                                  precision    recall  f1-score   support

        eigen materiaal - overig       0.86      1.00      0.92        18
    eigen materiaal - powerpoint       1.00      1.00      1.00        26
eigen materiaal - titelindicatie       1.00      1.00      1.00        52
                  korte overname       1.00      0.93      0.97        60
                  lange overname       1.00      0.97      0.99        40
            middellange overname       1.00      0.97      0.99       113
                        onbekend       0.83      1.00      0.91        24
                     open access       1.00      1.00      1.00        15

                        accuracy                           0.98       348
                       macro avg       0.96      0.99      0.97       348
                    weighted avg       0.98      0.98      0.98       348

