In [74]:
%matplotlib inline

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn import svm
import xgboost as xgb
import pandas as pd
import numpy as np
import os

In [6]:
# taken from: https://www.kaggle.com/code/nnikolay/diabetes-classification-recall-78
def summary(df):
    """Build a per-column overview of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``. The first column (named ``index`` by
        ``reset_index`` by default) holds the column name, followed by
        ``dtypes``, ``Missing`` (number of null values) and ``Uniques``
        (number of distinct non-null values).
    """
    missing_counts = df.isnull().sum().values
    unique_counts = df.nunique().values
    return (
        df.dtypes.to_frame(name='dtypes')
        .reset_index()
        .assign(Missing=missing_counts, Uniques=unique_counts)
    )

In [78]:
### import the dataset
# The texts-comprehension csv file is read, the columns that are not going to
# be investigated are dropped, and the remaining NaN values are replaced.
#
# Replace NaN values with zeros on all columns:
# the words that have been skipped (not looked at) have been annotated NA.
# Since they were skipped, their gaze duration is equal or very close to 0.
# Note: fillna(0) is applied to every remaining column, not only to the
# duration measures.
df = (
    pd.read_csv("merged_comp_data.csv")
    .drop(columns=[
        'nrun',
        'blink',
        'reg.out',
        # first-run measures
        'firstrun.skip',
        'firstrun.nfix',
        'firstrun.refix',
        'firstrun.reg.in',
        'firstrun.reg.out',
        'firstrun.gopast',
        'firstrun.gopast.sel',
        # first-fixation measures
        'firstfix.sac.in',
        'firstfix.sac.out',
        'firstfix.launch',
        'firstfix.land',
        'firstfix.cland',
        # single-fixation measures
        'singlefix',
        'singlefix.sac.in',
        'singlefix.sac.out',
        'singlefix.launch',
        'singlefix.land',
        'singlefix.cland',
        'singlefix.dur',
    ])
    .fillna(0)
)


In [79]:
# Frequency-encode the part-of-speech tags: each PoS value is replaced by the
# number of rows in which that tag occurs in the current frame.
PoS_num = df['PoS'].value_counts()
df['PoS_num'] = df['PoS'].map(PoS_num)

# Encode the language strings as integers. The encoder is kept in `le` so the
# integer codes can be mapped back to language strings.
# Note: this runs before the English rows are removed from the frame, so 'en'
# still receives a code of its own.
le = preprocessing.LabelEncoder()
df["lang_num"] = le.fit_transform(df["lang"])

In [80]:
# Keep only the rows of non-English speakers and renumber the index from 0
df = df[df['lang'] != 'en'].reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 502873 entries, 0 to 502872
Data columns (total 18 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   index.data    502873 non-null  int64  
 1   uniform_id    502873 non-null  object 
 2   itemid        502873 non-null  int64  
 3   sentnum       502873 non-null  int64  
 4   ianum         502873 non-null  int64  
 5   ia            502873 non-null  object 
 6   PoS           502873 non-null  object 
 7   skip          502873 non-null  int64  
 8   reread        502873 non-null  float64
 9   nfix          502873 non-null  float64
 10  refix         502873 non-null  float64
 11  reg.in        502873 non-null  float64
 12  dur           502873 non-null  float64
 13  firstrun.dur  502873 non-null  float64
 14  firstfix.dur  502873 non-null  float64
 15  lang          502873 non-null  object 
 16  PoS_num       502873 non-null  int64  
 17  lang_num      502873 non-null  int32  
dtypes: f

In [36]:
# Per-column overview (dtype, missing count, unique count) of the prepared frame
summary(df)


Unnamed: 0,index,dtypes,Missing,Uniques
0,index.data,int64,0,502873
1,uniform_id,object,0,410
2,itemid,int64,0,12
3,sentnum,int64,0,11
4,ianum,int64,0,187
5,ia,object,0,793
6,PoS,object,0,31
7,skip,int64,0,2
8,reread,float64,0,2
9,nfix,float64,0,32


### Create the features and target variables
Note: the following code has been adapted from the Jupyter notebooks of the Introduction to Machine Learning course.

In [68]:
## Define the features and the target of the model
feature_names = ['PoS_num', 'firstfix.dur', 'firstrun.dur', 'dur']
target_name = ['lang_num']


# Split the data into features (X) and target variable (y).
# The column lists declared above are reused here so the feature set is
# defined in exactly one place.
X = df[feature_names]  # feature columns
y = df[target_name[0]]  # target variable column (1-D Series, as the classifiers expect)

# Split the data into training (80%) and testing (20%) sets;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [72]:
### DECISION TREE CLASSIFIER at different depths
# For every depth limit from 4 to 10, fit a tree on the training split and
# report the depth actually reached together with train and test accuracy.
max_depths = list(range(4, 11))
for depth in max_depths:
    print(f"At depth of {depth}")
    tree = DecisionTreeClassifier(max_depth=depth, random_state=0).fit(X_train, y_train)
    print(f"Depth: {tree.get_depth()}")
    print(f"Accuracy on training set: {tree.score(X_train, y_train):.3f}")
    print(f"Accuracy on test set: {tree.score(X_test, y_test):.3f}")

At depth of 4
Depth: 4
Accuracy on training set: 0.168
Accuracy on test set: 0.165
At depth of 5
Depth: 5
Accuracy on training set: 0.169
Accuracy on test set: 0.167
At depth of 6
Depth: 6
Accuracy on training set: 0.171
Accuracy on test set: 0.166
At depth of 7
Depth: 7
Accuracy on training set: 0.171
Accuracy on test set: 0.168
At depth of 8
Depth: 8
Accuracy on training set: 0.172
Accuracy on test set: 0.167
At depth of 9
Depth: 9
Accuracy on training set: 0.173
Accuracy on test set: 0.167
At depth of 10
Depth: 10
Accuracy on training set: 0.175
Accuracy on test set: 0.167


In [73]:
### CROSS VALIDATION DECISION TREE CLASSIFIER

# Maximum depths to try (1 through 9)
max_depths = range(1, 10)

# Mean cross-validation score for each candidate depth, in the same order
# as max_depths
cv_scores = []

for depth in max_depths:
    # A fixed random_state (the same value as in the depth sweep above) keeps
    # the search reproducible from run to run.
    tree = DecisionTreeClassifier(max_depth=depth, random_state=0)

    # 5-fold cross-validation on the training split only; the test split is
    # not touched while selecting the depth.
    scores = cross_val_score(tree, X_train, y_train, cv=5)
    mean_score = np.mean(scores)

    cv_scores.append(mean_score)

# The depth with the highest mean cross-validation score wins
# (np.argmax returns the first maximum, i.e. the shallowest depth on ties).
optimal_depth = max_depths[np.argmax(cv_scores)]
best_score = max(cv_scores)

# Print the results
print("Optimal Maximum Depth DecisionTreeClassifier:", optimal_depth)
print("Best Cross-Validation Score:", best_score)

Optimal Maximum Depth DecisionTreeClassifier: 7
Best Cross-Validation Score: 0.16984423912915236


In [40]:
# Multinomial logistic regression on all languages.
# LogisticRegression is already imported in the import cell at the top of the
# notebook, so it is not imported again here.
# NOTE(review): the features are passed unscaled; consider standardising them.
# NOTE(review): newer scikit-learn releases deprecate the `multi_class`
# argument - confirm against the installed version.
lgr = LogisticRegression(solver='lbfgs', multi_class='multinomial',  max_iter=1000)

# Train the model
lgr.fit(X_train, y_train)

# Make predictions on the test data
y_pred = lgr.predict(X_test)

# Calculate accuracy on training data
accuracy_train = lgr.score(X_train, y_train)
print("Accuracy on training data:", accuracy_train)

# Calculate accuracy on the held-out test data
accuracy = lgr.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy on training data: 0.16226528593231881
Accuracy: 0.160795426298782


### Use only a subset of the languages (here: `du`, `ge`, `no`)

In [81]:
# Language codes to keep: 'du', 'ge' and 'no'
# (presumably Dutch, German and Norwegian - confirm against the dataset's
# language codes)
split_values = ['du', 'ge', 'no']

# Boolean-mask selection of the rows whose language is in the list above
celer_similar = df[df['lang'].isin(split_values)]
celer_similar

Unnamed: 0,index.data,uniform_id,itemid,sentnum,ianum,ia,PoS,skip,reread,nfix,refix,reg.in,dur,firstrun.dur,firstfix.dur,lang,PoS_num,lang_num
0,34,du_4,1,1,1,Samuel,NNP,0,1.0,2.0,0.0,1.0,312.0,48.0,48.0,du,10021,0
1,35,du_4,1,1,2,"Morse,",NNP,0,0.0,3.0,1.0,1.0,408.0,408.0,117.0,du,10021,0
2,36,du_4,1,1,3,best,JJS,0,0.0,1.0,0.0,0.0,96.0,96.0,96.0,du,1654,0
3,37,du_4,1,1,4,known,VBN,0,0.0,1.0,0.0,0.0,216.0,216.0,216.0,du,18155,0
4,38,du_4,1,1,5,today,NN,0,0.0,1.0,0.0,0.0,164.0,164.0,164.0,du,76382,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400802,261814,ge_55,9,6,112,while,IN,0,1.0,3.0,1.0,0.0,466.0,288.0,207.0,ge,64601,4
400803,261815,ge_55,9,6,113,it,PRP,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ge,13857,4
400804,261816,ge_55,9,6,114,is,VBZ,0,0.0,1.0,0.0,0.0,202.0,202.0,202.0,ge,14178,4
400805,261817,ge_55,9,6,115,being,VBG,0,0.0,1.0,0.0,0.0,164.0,164.0,164.0,ge,12202,4


In [84]:
# Features (X) and target (y) are taken from the language subset only
X = celer_similar.loc[:, ['PoS_num', 'firstfix.dur', 'firstrun.dur', 'dur']]  # feature columns
y = celer_similar.loc[:, 'lang_num']  # target variable column

# 80% training / 20% testing split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [85]:
# Maximum depths to try (1 through 9)
max_depths = range(1, 10)

# Mean cross-validation score for each candidate depth, in the same order
# as max_depths
cv_scores = []

for depth in max_depths:
    # A fixed random_state keeps the depth search reproducible from run to run.
    tree = DecisionTreeClassifier(max_depth=depth, random_state=0)

    # 5-fold cross-validation on the training split only; the test split is
    # not touched while selecting the depth.
    scores = cross_val_score(tree, X_train, y_train, cv=5)
    mean_score = np.mean(scores)

    cv_scores.append(mean_score)

# The depth with the highest mean cross-validation score wins
# (np.argmax returns the first maximum, i.e. the shallowest depth on ties).
optimal_depth = max_depths[np.argmax(cv_scores)]
best_score = max(cv_scores)

# Print the results
print("Optimal Maximum Depth DecisionTreeClassifier:", optimal_depth)
print("Best Cross-Validation Score:", best_score)

Optimal Maximum Depth DecisionTreeClassifier: 7
Best Cross-Validation Score: 0.4351739607788222


In [83]:
# Fit a multinomial logistic regression (L-BFGS solver) on the training split
lgr = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=1000,
).fit(X_train, y_train)

# Predicted labels for the test split
y_pred = lgr.predict(X_test)

# Mean accuracy on the training split
accuracy_train = lgr.score(X_train, y_train)
print("Accuracy on training data:", accuracy_train)

# Mean accuracy on the test split
accuracy = lgr.score(X_test, y_test)
print("Accuracy:", accuracy)


# Observation: there seems to be an improvement when the number of languages
# is reduced and they are grouped into pairs of same-family languages, with
# the pairs unrelated to each other -> it-sp vs. est-fin, du-ge vs. tr-fi.
# For 11 languages -> 0.16 accuracy score.

Accuracy on training data: 0.40791358977185055
Accuracy: 0.4034900808229243
