In [122]:
%matplotlib inline

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn import svm
import xgboost as xgb
import pandas as pd
import numpy as np
import os

In [124]:
# taken from: https://www.kaggle.com/code/nnikolay/diabetes-classification-recall-78
def summary(df):
    """Build a per-column overview of ``df``.

    Returns a new DataFrame with one row per column of ``df``: the column
    label (as emitted by ``reset_index``), its dtype under ``dtypes``, the
    number of null entries under ``Missing`` and the number of distinct
    non-null values under ``Uniques``.
    """
    overview = df.dtypes.to_frame(name='dtypes').reset_index()
    # Counts are attached positionally; they are computed in df's column order,
    # which is also the row order of `overview`.
    overview['Missing'] = df.isna().sum().to_numpy()
    overview['Uniques'] = df.nunique().to_numpy()
    return overview

In [131]:
### Load the dataset
## Read the text-comprehension CSV file.
# Replace NaN values with zeros in all columns: words that were skipped
# (not looked at) are annotated NA, and since they were skipped their gaze
# duration is equal or very close to 0.
df = pd.read_csv("merged_comp_data.csv").fillna(0)

## Columns that are not going to be investigated (drop currently disabled)
# df = df.drop(columns=['nrun',
#     'blink',
#     'reg.out', 
#     'firstrun.skip',
#     'firstrun.nfix',
#     'firstrun.refix',
#     'firstrun.reg.in',
#     'firstrun.reg.out',
#     'firstrun.gopast',
#     'firstrun.gopast.sel',
#     'firstfix.sac.in',
#     'firstfix.sac.out',
#     'firstfix.launch',
#     'firstfix.land',
#     'firstfix.cland',
#     'singlefix',
#     'singlefix.sac.in',
#     'singlefix.sac.out',
#     'singlefix.launch',
#     'singlefix.land',
#     'singlefix.cland',
#     'singlefix.dur'   
#  ])

# Turn the categorical PoS column into a number: each value is replaced by
# how often it occurs in the data. PoS_Dict is the value -> count lookup
# (a pandas Series, despite the name).
PoS_Dict = df['PoS'].value_counts()
df = df.assign(PoS_Dict=df['PoS'].map(PoS_Dict))

## Print and view the main info about the dataframe
# print(df.info())
# df.head()

In [135]:
# Keep only the PoS, lang and dur columns.
# NOTE(review): `s` is not referenced by any other visible cell.
s = df.loc[:, ["PoS", "lang", "dur"]]



In [62]:
# Show dtype, missing-value count and unique-value count for every column of df
summary(df)


Unnamed: 0,index,dtypes,Missing,Uniques
0,index.data,int64,0,549872
1,uniform_id,object,0,450
2,itemid,int64,0,12
3,sentnum,int64,0,11
4,ianum,int64,0,187
5,ia,object,0,793
6,PoS,object,0,31
7,skip,int64,0,2
8,reread,float64,0,2
9,nfix,float64,0,33


In [82]:
# Shuffle all rows (frac=1 samples the whole frame); the fixed seed makes the
# shuffled order reproducible. The original index labels are kept, not reset.
randomized_df = df.sample(frac=1, random_state=42)

In [105]:
## Encode the languages from strings to integer class ids
le = preprocessing.LabelEncoder()
randomized_df["lang_num"] = le.fit_transform(randomized_df["lang"])

feature_names = ['PoS_Dict', 'firstfix.dur', 'firstrun.dur', 'dur']
target_name = ['lang_num']
taget_name = target_name  # misspelled original name, kept as an alias in case other cells use it

# Split the data into features (X) and target variable (y)
X = randomized_df[feature_names]
y = randomized_df['lang_num']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the XGBoost model
model = xgb.XGBClassifier()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the training and testing data.
# These must come from `model` (the XGBoost classifier fitted above), not from
# `clf`: `clf` is not defined until a later cell, so using it here either fails
# on a fresh kernel or scores a different, stale estimator.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate the accuracy of the model on training and testing data
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)


In [75]:
# Create the gradient boosting model. It is seeded like the other stochastic
# steps in this notebook so that repeated runs report the same scores.
# Note: this rebinds `model`, replacing whatever estimator it referred to before.
model = GradientBoostingClassifier(random_state=42)

# Define the cross-validation strategy: 5 stratified folds, shuffled with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation on the full feature matrix / target
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')

# Print the average accuracy and standard deviation across the folds
print("Accuracy: {:.3f} +/- {:.3f}".format(scores.mean(), scores.std()))

Accuracy: 0.158 +/- 0.001


In [119]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Create a decision tree classifier limited to depth 9. The seed makes the
# fitted tree (and therefore the accuracies below) reproducible between runs.
clf = DecisionTreeClassifier(max_depth=9, random_state=0)

# Fit the classifier to the training data
clf.fit(X_train, y_train)

# Make predictions on the training and testing data
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# Calculate the accuracy of the model on training and testing data
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

Training Accuracy: 0.1613082153322255
Testing Accuracy: 0.15700841100250057


In [115]:
# Fit one seeded decision tree per candidate depth and report the depth it
# actually reached together with its accuracy on the training and test sets.
max_depths = [3, 4, 5, 6]
for depth in max_depths:
    print("At depth of", depth)
    tree = DecisionTreeClassifier(max_depth=depth, random_state=0).fit(X_train, y_train)
    print("Depth:", tree.get_depth())
    print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
    print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

At depth of 3
Depth: 3
Accuracy on training set: 0.152
Accuracy on test set: 0.151
At depth of 4
Depth: 4
Accuracy on training set: 0.153
Accuracy on test set: 0.153
At depth of 5
Depth: 5
Accuracy on training set: 0.154
Accuracy on test set: 0.154
At depth of 6
Depth: 6
Accuracy on training set: 0.156
Accuracy on test set: 0.154


In [121]:
# Define the range of maximum depths to try (1 through 9)
max_depths = range(1, 10)

# Mean cross-validation accuracy for each depth, in the same order as max_depths
cv_scores = []

# Iterate over each maximum depth value
for depth in max_depths:
    # Seeded tree so the scores, and hence the selected depth, are reproducible
    tree = DecisionTreeClassifier(max_depth=depth, random_state=0)

    # 5-fold cross-validation on the training split only; keep the mean score
    scores = cross_val_score(tree, X_train, y_train, cv=5)
    mean_score = np.mean(scores)

    # Append the mean score to the list of scores
    cv_scores.append(mean_score)

# Find the depth with the highest mean cross-validation score; the best score
# is read from the same position so the two values always belong together.
best_index = int(np.argmax(cv_scores))
optimal_depth = max_depths[best_index]
best_score = cv_scores[best_index]

# Print the results.
# NOTE(review): if the optimum is the largest depth tried, the search range is
# probably too narrow and should be widened before trusting the result.
print("Optimal Maximum Depth:", optimal_depth)
print("Best Cross-Validation Score:", best_score)


Optimal Maximum Depth: 9
Best Cross-Validation Score: 0.15657756093456562
