## Imports

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import xgboost as xgb

## Read data and take a look

Let's start by looking at what columns we have, what their data types are and how many null-values there are.

In [11]:
# Load the semicolon-delimited export into a DataFrame.
df = pd.read_csv('./outputSURF-AI-testset.csv', sep=';')

# Discard exact duplicate rows and renumber the index from 0, then print the
# per-column dtype / non-null summary.
df = df.drop_duplicates(ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1601 entries, 0 to 1600
Data columns (total 80 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   FacultyName                    1600 non-null   object 
 1   CourseName                     1600 non-null   object 
 2   PredictionSurf                 1195 non-null   object 
 3   PredictionInstitution          876 non-null    object 
 4   PredictionRemark               360 non-null    object 
 5   CorrectDoi                     69 non-null     object 
 6   CorrectISBN                    71 non-null     object 
 7   AnalyseError                   385 non-null    object 
 8   CorrectAnalyseSurf             1601 non-null   int64  
 9   CorrectAnalyseInstitution      1601 non-null   int64  
 10  AnalyseISBN                    1600 non-null   float64
 11  AnalyseDOI                     1600 non-null   float64
 12  id                             1600 non-null   f

In [4]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,PredictionSurfNormalized,PredictionToolNormalized,FacultyName,CourseName,PredictionSurf,PredictionInstitution,PredictionRemark,CorrectDoi,CorrectISBN,Level2,...,Contains_sciencemag,Pagecount_bigger_50,BookAndWords10000,Contains_published_in,Contains_researchgate,Contains_to_appear_in,IsJournalWords8000,images_same_pagecount,Publisher_from_crossref,Contains_recommended_citation
0,,,,Metabolic Consequences of Chronic Diseases wit...,,,,,,Human Nutrition and Health,...,,,,,,,,,,
1,,Eigen Materiaal,,Disease Ecology,,,,,,Wildlife Ecology and Conservation Group,...,False,False,False,False,False,False,False,False,False,False
2,,,,Agrobiodiversity,,,,,,Soil Biology,...,,,,,,,,,,
3,,,,Management Skills in Theory & Practice,,,,,,Education and Learning Sciences,...,,,,,,,,,,
4,,,,Thesis Skills,,,,,,Rural Sociology,...,,,,,,,,,,


#### The following columns contain only null values and will not be used:

<ul>
    <li> 'issn'
    <li> 'usedpages'
    <li> 'filepath'
    <li> 'runidentifier'
    <li> '_10_pics_page'
    <li> 'Kolom1'
</ul>


In [None]:
# Print the name of every column that contains nothing but nulls ...
all_null_columns = df.columns[df.isna().all().to_numpy()]
for column_name in all_null_columns:
    print(column_name)

# ... and then remove those columns from the frame.
df.dropna(axis="columns", how="all", inplace=True)

#### The following columns concern the predictions and will not be used as input:

<ul>
    <li> 'PredictionSurf'
    <li> 'PredictionInstitution' (ground truth)
    <li> 'PredictionRemark'
    <li> 'prediction'
</ul>

The column 'PredictionInstitution' will be used as label, therefore all rows where this column has a null-value will be dropped. The others will be dropped completely.

In [None]:
# Prediction-related columns are removed so they cannot be used as features.
prediction_cols = ['PredictionSurf', 'prediction', 'PredictionRemark']
df.drop(columns=prediction_cols, inplace=True)

# 'PredictionInstitution' is the label: rows without it are discarded.
df.dropna(subset=['PredictionInstitution'], inplace=True)

#### The following columns seem to contain identifiers and the like:

<ul>
    <li> 'FacultyName'
    <li> 'CourseName'
    <li> 'CorrectDoi'
    <li> 'CorrectISBN'
    <li> 'AnalyseError'
    <li> 'CorrectAnalyseSurf'
    <li> 'CorrectAnalyseInstitution'
    <li> 'AnalyseISBN'
    <li> 'AnalyseDOI'
    <li> 'id'
    <li> 'uuid'
    <li> 'url'
    <li> 'filesource'
    <li> 'filestatus'
    <li> 'filemimetype'
    <li> 'filename'
    <li> 'filehash'
    <li> 'filedate'
    <li> 'lastmodifieddate'
    <li> 'creator'
    <li> 'isfilepublished'
    <li> 'filescanresults'
    <li> 'doi'
    <li> 'isbn'
    <li> 'author'
    <li> 'title'
    <li> 'publicationyear'
    <li> 'oclcnumber'
</ul>

These columns will not be used.

#### TODO: Take a closer look at some of these columns, e.g. 'title', which might become a useful feature once NLP is applied, and the 'CorrectAnalyse...' columns, which may tell us something about the tool's performance.

In [None]:
# Identifier-like and free-text columns that are excluded from the features.
identifier_cols = [
    'FacultyName', 'CourseName', 'CorrectDoi', 'CorrectISBN',
    'AnalyseError', 'CorrectAnalyseSurf', 'CorrectAnalyseInstitution',
    'AnalyseISBN', 'AnalyseDOI',
    'id', 'uuid', 'url',
    'filesource', 'filestatus', 'filemimetype', 'filename', 'filehash',
    'filedate', 'lastmodifieddate', 'creator', 'isfilepublished',
    'filescanresults',
    'doi', 'isbn', 'author', 'title', 'publicationyear', 'oclcnumber',
]

df.drop(columns=identifier_cols, inplace=True)

## Let's take another look at our columns

In [None]:
# Print the dtype / non-null summary of the columns that remain.
df.info()

#### Let's take a separate look at columns with dtype object

In [None]:
# Names of all columns whose dtype is 'object'; the cell displays their count.
object_cols = [name for name, dtype in df.dtypes.items() if dtype == 'object']
len(object_cols)

Most columns with dtype object contain boolean values. We will change these to 0s and 1s and change their dtype to int.

In [None]:
# Boolean-valued object columns become 0/1 integers (missing values -> 0);
# object columns holding exactly one distinct non-null value are removed.
for name in object_cols:
    distinct_values = set(df[name].dropna().unique())
    if distinct_values == {False, True}:
        # A null never compares equal to True, so nulls map to 0 like False.
        df[name] = df[name].eq(True).astype("int")
    elif len(distinct_values) == 1:
        df.drop(columns=name, inplace=True)

## Make a profiling report

Uncomment and run this cell to get a pandas profiling report. This will show nicely which features are correlated.

In [None]:
# import sys
# !{sys.executable} -m pip install -U pandas-profiling[notebook]
# !jupyter nbextension enable --py widgetsnbextension
# from pandas_profiling import ProfileReport

# profile = ProfileReport(df.reset_index(drop=True), title="Pandas Profiling Report")
# profile.to_file("pandas_report1.html")

### Constant values

There are 2 columns with a constant value: 'sourcepagecount' and 'sourcewordcount'. These columns will be dropped.

#### QUESTION: What are these columns? How does it relate to 'pagecount' and 'wordcount'?

In [None]:
# Remove the two columns reported as constant in the text above.
constant_cols = ["sourcepagecount", "sourcewordcount"]
df.drop(columns=constant_cols, inplace=True)

#### Last columns of dtype object

The last three columns of dtype object consist of 2 non-boolean features and 1 column with the predictions. 

In [None]:
# Re-collect the object-dtype columns that are still present and show how many
# there are.
object_cols = list(df.columns[(df.dtypes == 'object').to_numpy()])
len(object_cols)

In [None]:
# For each remaining object column print its name, its distinct values
# (nulls count as a value here) and its number of nulls.
for name in object_cols:
    column = df[name]
    distinct_values = column.unique()
    print(f"column name: {name}")
    print(f"number of unique values: {len(distinct_values)}")
    print(f"unique values: {distinct_values}")
    print(f"number of null values: {column.isna().sum()}")
    print("----------------------")

#### Drop openaccesslink

The 'openaccesslink' feature has 15 unique values, each of which occurs only once or twice. 98% of its values are missing, and whether the value is missing correlates strongly with the value of 'isopenaccesstitle'.

In [None]:
# Split the rows on the 'isopenaccesstitle' flag and, for each half, report how
# many rows there are and how many of them have a non-null 'openaccesslink'.
is_open_access = df["isopenaccesstitle"].eq(1)
oa = df[is_open_access]
oa_with_link = oa['openaccesslink'].notna().sum()
print(f"Number of datapoints that are open access titles:\t\t{len(oa)}.\nNumber of open access titles that have an open access link:\t{oa_with_link}.")

print("=" * 74)

is_not_open_access = df["isopenaccesstitle"].eq(0)
not_oa = df[is_not_open_access]
not_oa_with_link = not_oa['openaccesslink'].notna().sum()
print(f"Number of datapoints that are NOT open access titles:\t\t{len(not_oa)}.\nNumber of non open access titles that have an open access link:\t{not_oa_with_link}.")

In [None]:
# Remove the 'openaccesslink' column from the frame.
df.drop(columns=["openaccesslink"], inplace=True)

#### Drop publisher

The 'publisher' feature is highly correlated with many features, such as wordcount, contains_researchgate, Words_more_300pp and several others. It also has 93% missing values and 33 unique values.

In [None]:
# Remove the 'publisher' column from the frame.
df.drop(columns=["publisher"], inplace=True)

In [None]:
# Keep only complete rows, then remove duplicate rows and renumber the index.
df = df.dropna().drop_duplicates(ignore_index=True)

## Labels' support

We lost many entries, mostly because they had no value for 'PredictionInstitution', which is our ground truth. Let's take a look at the support of each label, i.e., how often does each label occur in the dataset.

In [None]:
# Number of rows left after cleaning.
len(df)

In [None]:
# Print each distinct label next to the number of rows that carry it.
labels = df["PredictionInstitution"]
unique_labels = labels.unique()

for label in unique_labels:
    support = labels.eq(label).sum()
    print(f'{label:<32}  {support:>3}')

#### QUESTION: Is there a difference between 'overname open access' and 'open access'? Answer: no, they are the same! The 'eigen materiaal' labels should also be merged into one class (except PowerPoint).

We don't have enough examples of each class. Let's see what happens if we drop the labels that have only 1 or 2 samples and train only on the labels we have more examples of. We also change the label 'overname middellang' to 'middellange overname', since this was probably a labeling error.

In [None]:
# Labels that are excluded from this experiment.
excluded_labels = [
    "overname open access",
    "mogelijk licentie",
    "overname met licentie",
]

# .copy() makes the subset an independent frame, so the relabelling below
# writes to the subset itself rather than to a slice of `df` (which would
# trigger pandas' SettingWithCopyWarning).
df_label_subset = df[~df["PredictionInstitution"].isin(excluded_labels)].copy()

# Merge the variant spelling into the label used everywhere else.
df_label_subset["PredictionInstitution"] = df_label_subset["PredictionInstitution"].replace(
    "overname middellang", "middellange overname"
)

labels = df_label_subset["PredictionInstitution"]
unique_labels = labels.unique()

# Print each remaining label with the number of rows that carry it.
for label in unique_labels:
    print('{:<32}  {:>3}'.format(label, (labels == label).sum()))

In [None]:
# Everything except the label column becomes the feature matrix.
feature_frame = df_label_subset.drop(columns="PredictionInstitution")
x = feature_frame.to_numpy()

# Show the matrix shape and its first three rows.
print(x.shape)
print(x[:3])

In [None]:
# Fit an encoder on the label column and turn the labels into integer codes.
label_names = df_label_subset["PredictionInstitution"]
label_encoder = LabelEncoder()
label_encoder.fit(label_names)
y = label_encoder.transform(label_names)

# Show the shape of the encoded target and its first three codes.
print(y.shape, y[:3], sep="\n")

## XGBoost model

### High correlation

There are many features with high correlation. Since tree-based models are not so sensitive to this, let's train XGBoost on our data. This model achieves both accuracy and F1 score of 0.98.

In [None]:
# Hold out 20% of the rows for testing; the seed fixes the shuffle.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

# Sizes of the training and test targets, one per line.
print(len(y_train), len(y_test), sep="\n")

In [None]:
# Train XGBoost on the training split and predict the held-out rows.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(x_train, y_train)

predictions = xgb_model.predict(x_test)

# `target_names` lists every class known to the encoder, so `labels` must list
# every encoded class as well. Passing only the classes present in y_train
# would make the two sequences differ in length whenever a class is missing
# from the training split, and the report rows would get the wrong names.
all_class_ids = np.arange(len(label_encoder.classes_))
report = classification_report(
    y_test,
    predictions,
    labels=all_class_ids,
    target_names=label_encoder.classes_,
)
print(report)

#### Labels' support

We don't have enough examples of each class. In an attempt to combat this, we will add some datapoints to our training data that do not have a value for 'PredictionInstitution', but DO have a value for 'prediction'. We also process it the same way we processed the rest of the data above, by removing the same columns etc.

In [None]:
# Build a second frame from the same export: rows WITHOUT a value for
# 'PredictionInstitution' but WITH a value for 'prediction', cleaned with the
# same steps as the main frame.
df2 = pd.read_csv('./outputSURF-AI-testset.csv', sep=';')

df2.drop_duplicates(inplace=True, ignore_index=True)
df2.dropna(axis=1, how="all", inplace=True)

# .copy() detaches the filtered rows from the full frame. Without it every
# in-place operation below acts on a slice and pandas emits
# SettingWithCopyWarning.
df2 = df2[df2['PredictionInstitution'].isna()].copy()

# Here 'prediction' is the label, so it is kept and must be present.
df2.drop(columns=['PredictionSurf', 'PredictionInstitution', 'PredictionRemark'], inplace=True)
df2.dropna(subset=['prediction'], inplace=True)

drop_cols = [
    'FacultyName', 'CourseName', 'CorrectDoi', 'CorrectISBN',
    'AnalyseError', 'CorrectAnalyseSurf', 'CorrectAnalyseInstitution',
    'AnalyseISBN', 'AnalyseDOI', 'id', 'uuid', 'url', 'filesource',
    'filestatus', 'filemimetype', 'filename', 'filehash', 'filedate',
    'lastmodifieddate', 'creator', 'isfilepublished', 'filescanresults',
    'doi', 'isbn', 'author', 'title', 'publicationyear',
    'filetype', 'oclcnumber', 'sourcepagecount', 'sourcewordcount',
    'publisher', 'openaccesslink', 'Contains_sciencemag', 'creator_abbyy',
]
df2.drop(columns=drop_cols, inplace=True)

object_cols = [col for col in df2.columns if df2[col].dtype == 'object']

# Boolean-valued object columns become 0/1 integers (nulls -> 0); object
# columns with a single distinct non-null value are removed.
# NOTE(review): the single-value rule depends on the rows in this subset, so
# it may remove different columns than it did for the labelled data - confirm
# the resulting feature columns match before stacking the two matrices.
for col in object_cols:
    distinct_values = set(df2[col].dropna().unique())
    if distinct_values == {False, True}:
        df2[col] = df2[col].eq(True).astype("int")
    elif len(distinct_values) == 1:
        df2.drop(columns=col, inplace=True)

df2.dropna(inplace=True)
df2.drop_duplicates(inplace=True, ignore_index=True)

df2.info()

In [None]:
# Features: every df2 column except the 'prediction' label.
x_train = df2.drop(columns="prediction").to_numpy()

# A fresh encoder is fitted on df2's 'prediction' values; it replaces the
# encoder that was previously bound to `label_encoder`.
label_encoder = LabelEncoder().fit(df2["prediction"])
y_train = label_encoder.transform(df2["prediction"])

print(x_train.shape, y_train.shape, sep="\n")

In [None]:
# Use the hand-labelled rows as the test set.
#
# `y` was produced by an earlier encoder fitted on 'PredictionInstitution',
# while `label_encoder` has since been re-fitted on df2['prediction']. Integer
# codes from two separate fits are only comparable if both saw exactly the same
# set of labels, so the ground truth is re-encoded with the current encoder.
# Rows whose label the current encoder has never seen cannot be scored and are
# left out.
test_label_names = df_label_subset["PredictionInstitution"]
known_label_mask = test_label_names.isin(label_encoder.classes_).to_numpy()

x_test = x[known_label_mask]
y_test = label_encoder.transform(test_label_names[known_label_mask])

print(len(y_train))
print(len(y_test))

In [None]:
# Fit a fresh XGBoost classifier on the current training arrays.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(x_train, y_train)

# Score the current test arrays; the report is limited to the class ids that
# occur in y_train and is named via the encoder's class list.
predictions = xgb_model.predict(x_test)
train_class_ids = np.unique(y_train)
report = classification_report(
    y_test,
    predictions,
    target_names=label_encoder.classes_,
    labels=train_class_ids,
)
print(report)

#### Add to train set

Instead of treating the data with 'prediction' as its label as the whole training set, let's split the data with 'PredictionInstitution' and add 'prediction' data to the train split.

In [None]:
# Third pass over the export: the hand-labelled rows again (label column
# 'PredictionInstitution'), cleaned with an explicit list of columns to drop.
df3 = pd.read_csv('./outputSURF-AI-testset.csv', sep=';')

df3.drop_duplicates(inplace=True, ignore_index=True)
df3.dropna(axis="columns", how="all", inplace=True)

# Remove the other prediction columns and keep only labelled rows.
df3.drop(columns=['PredictionSurf', 'prediction', 'PredictionRemark'], inplace=True)
df3.dropna(subset=['PredictionInstitution'], inplace=True)

drop_cols = [
    'FacultyName', 'CourseName', 'CorrectDoi', 'CorrectISBN',
    'AnalyseError', 'CorrectAnalyseSurf', 'CorrectAnalyseInstitution',
    'AnalyseISBN', 'AnalyseDOI', 'id', 'uuid', 'url', 'filesource',
    'filestatus', 'filemimetype', 'filename', 'filehash', 'filedate',
    'lastmodifieddate', 'creator', 'isfilepublished', 'filescanresults',
    'doi', 'isbn', 'author', 'title', 'publicationyear',
    'filetype', 'oclcnumber', 'sourcepagecount', 'sourcewordcount',
    'publisher', 'openaccesslink', 'Contains_sciencemag', 'creator_abbyy',
]
df3.drop(columns=drop_cols, inplace=True)

object_cols = [name for name, dtype in df3.dtypes.items() if dtype == 'object']

# Boolean-valued object columns -> 0/1 integers (nulls become 0); object
# columns with exactly one distinct non-null value are removed.
for name in object_cols:
    distinct_values = set(df3[name].dropna().unique())
    if distinct_values == {False, True}:
        df3[name] = df3[name].eq(True).astype("int")
    elif len(distinct_values) == 1:
        df3.drop(columns=name, inplace=True)

df3.dropna(inplace=True)
df3.drop_duplicates(inplace=True, ignore_index=True)

df3.info()

In [None]:
# Number of distinct labels in df3.
len(df3["PredictionInstitution"].unique())

In [None]:
# Feature matrix: all df3 columns except the label.
x = df3.drop(columns="PredictionInstitution").to_numpy()

# Re-fit the encoder on df3's labels and encode them.
label_encoder = LabelEncoder().fit(df3["PredictionInstitution"])
y = label_encoder.transform(df3["PredictionInstitution"])

# 70/30 split with a fixed seed.
split_parts = train_test_split(x, y, test_size=0.3, random_state=2)
x_train, x_test, y_train, y_test = split_parts



In [None]:
# Features and encoded labels of the rows labelled only by 'prediction'.
# NOTE(review): transform() raises for any 'prediction' value the encoder was
# not fitted on, and the row-wise stacking below needs df2 to yield the same
# number of feature columns as x_train - confirm both hold for this data.
x2 = df2.drop(columns="prediction").to_numpy()
y2 = label_encoder.transform(df2["prediction"])

# Append those rows below the training split.
x_train_augmented = np.concatenate((x_train, x2), axis=0)
y_train_augmented = np.hstack((y_train, y2))

There are still underrepresented labels, though fewer than before.

In [None]:
# For every class id that occurs in the test split, print the id and how many
# augmented training rows carry it.
for class_id in np.unique(y_test):
    train_support = np.count_nonzero(y_train_augmented == class_id)
    print(class_id, train_support)

In [None]:
# Look up the label names behind the encoded class ids 8, 9 and 10.
label_encoder.inverse_transform([8,9,10])

In [None]:
# Train on the augmented training set.
# fit() is called with the data only: XGBClassifier.fit has no `labels`
# parameter, so passing one raises a TypeError.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(x_train_augmented, y_train_augmented)

predictions = xgb_model.predict(x_test)

# `target_names` covers every class of the encoder, so `labels` lists every
# encoded class id too. That keeps each report row paired with the right name
# even if a class is absent from the training split.
all_class_ids = np.arange(len(label_encoder.classes_))
report = classification_report(
    y_test,
    predictions,
    labels=all_class_ids,
    target_names=label_encoder.classes_,
)
print(report)

#### With K-fold

Use stratified K-fold cross-validation to confirm that the accuracy and F1 score are actually that high.

In [None]:
# Stratified 6-fold cross-validation of XGBoost on (x, y).
skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)
# Take the fold count from the splitter so the averages below stay correct if
# n_splits is ever changed.
n_folds = skf.get_n_splits()

# Running sums of the per-fold metrics.
accuracy_scores = 0.0
f1_scores = 0.0

for train_index, test_index in skf.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    xgb_model = xgb.XGBClassifier()
    xgb_model.fit(x_train, y_train)
    predictions = xgb_model.predict(x_test)
    accuracy_scores += accuracy_score(y_test, predictions)
    f1_scores += f1_score(y_test, predictions, average="macro")

# Mean accuracy and mean macro-F1 over the folds.
print(accuracy_scores / n_folds)
print(f1_scores / n_folds)

## Alternative classifiers

### Train another model, an SVC, to compare the results.

In [None]:
# Cross-validate an SVC with the folds of the existing `skf` splitter.
n_folds = skf.get_n_splits()

# Running sums of the per-fold metrics.
accuracy_scores = 0.0
f1_scores = 0.0

for train_index, test_index in skf.split(x, y):
    # Fit the scaler on the training fold only and reuse its statistics for
    # the test fold. Scaling the full matrix before splitting would leak
    # test-fold information into training.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x[train_index])
    x_test = scaler.transform(x[test_index])
    y_train, y_test = y[train_index], y[test_index]
    svc_model = SVC()
    svc_model.fit(x_train, y_train)
    predictions = svc_model.predict(x_test)
    accuracy_scores += accuracy_score(y_test, predictions)
    f1_scores += f1_score(y_test, predictions, average="macro")

# Mean accuracy and mean macro-F1 over the folds.
print(accuracy_scores / n_folds)
print(f1_scores / n_folds)

### Random Forest

In [None]:
# Cross-validate a random forest with the folds of the existing `skf` splitter.
n_folds = skf.get_n_splits()

# Running sums of the per-fold metrics.
accuracy_scores = 0.0
f1_scores = 0.0

for train_index, test_index in skf.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # A fixed seed makes the forest, and therefore the reported scores,
    # reproducible between runs.
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(x_train, y_train)
    predictions = rf_model.predict(x_test)
    accuracy_scores += accuracy_score(y_test, predictions)
    f1_scores += f1_score(y_test, predictions, average="macro")

# Mean accuracy and mean macro-F1 over the folds.
print(accuracy_scores / n_folds)
print(f1_scores / n_folds)

### Logistic Regression

In [None]:
# Cross-validate logistic regression with the folds of the existing `skf`
# splitter.
n_folds = skf.get_n_splits()

# Running sums of the per-fold metrics.
accuracy_scores = 0.0
f1_scores = 0.0

for train_index, test_index in skf.split(x, y):
    # Fit the scaler on the training fold only and reuse its statistics for
    # the test fold, so no test-fold information reaches the model.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x[train_index])
    x_test = scaler.transform(x[test_index])
    y_train, y_test = y[train_index], y[test_index]
    lr_model = LogisticRegression()
    lr_model.fit(x_train, y_train)
    predictions = lr_model.predict(x_test)
    accuracy_scores += accuracy_score(y_test, predictions)
    f1_scores += f1_score(y_test, predictions, average="macro")

# Mean accuracy and mean macro-F1 over the folds.
print(accuracy_scores / n_folds)
print(f1_scores / n_folds)

## Feature importance

XGBoost has another benefit: it is straightforward to retrieve feature importance scores. Let's take a look at those.

In [None]:
# Feature names must come from the frame the current `x` was built from: df3
# without its label column. `df` no longer has a 'prediction' column (it was
# dropped during cleaning), so dropping it there raises a KeyError.
feature_names = df3.drop(columns="PredictionInstitution").columns.to_numpy()

# Importances of the most recently fitted XGBoost model. In a top-to-bottom
# run that is the model from the last cross-validation fold.
feature_importance_scores = xgb_model.feature_importances_



#### Top 5 highest

In [None]:
# Indices of the five largest importance scores, highest first.
# A full argsort is used because argpartition with kth=(-5, -1) only puts
# positions -5 and -1 in their sorted place; the three in between may come out
# in any order. Slicing also copes with fewer than five features.
top_five_indices = np.argsort(feature_importance_scores)[::-1][:5]
top_five_scores = feature_importance_scores[top_five_indices]
top_five_names = feature_names[top_five_indices]

for name, score in zip(top_five_names, top_five_scores):
    print(f"{name},   {score}")

#### Score is zero

Some of the features have importance scores of 0. When we leave them out, we get the same results.

In [None]:
# Positions and names of the features whose importance score is exactly 0.
zero_score_indices = np.flatnonzero(feature_importance_scores == 0)
zero_score_features = feature_names[zero_score_indices]

# List them one per line.
for feature_name in zero_score_features:
    print(feature_name)

In [None]:
# Drop the zero-importance features together with the label column.
# The matrix is rebuilt from df3 because `y` was encoded from df3, so rows and
# targets stay aligned. `df` has no 'prediction' column any more, so the
# previous drop raised a KeyError.
features_to_remove = list(zero_score_features)
features_to_remove.append("PredictionInstitution")

x = df3.drop(columns=features_to_remove).to_numpy()

# Show the reduced matrix shape and its first three rows.
print(x.shape)
print(x[:3])

In [None]:
# Hold out 30% of the rows for testing; the seed fixes the shuffle.
split_parts = train_test_split(x, y, test_size=0.3, random_state=42)
x_train, x_test, y_train, y_test = split_parts

# Sizes of the training and test targets, one per line.
print(len(y_train), len(y_test), sep="\n")



In [None]:
# Fit a fresh XGBoost classifier on the current training split.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(x_train, y_train)

In [None]:
predictions = xgb_model.predict(x_test)

# Pass every encoded class id as `labels`. With `target_names` alone,
# classification_report raises a ValueError whenever a class is missing from
# both y_test and the predictions, because the inferred label list is then
# shorter than the name list.
all_class_ids = np.arange(len(label_encoder.classes_))
report = classification_report(
    y_test,
    predictions,
    labels=all_class_ids,
    target_names=label_encoder.classes_,
)
print(report)