# Contains Neural Network and IID Cross-Validation

In [1]:
import numpy as np
import pandas as pd

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

In [2]:
#reading dataframe
df = pd.read_csv("dataset/phase_3_TRAIN_7d499bff69ca69b6_6372c3e_MLPC2021_generic.csv")

In [3]:
# drop target value, student annotations and string ID from input features:
X = df.drop(columns=['quadrant','mean_A','mean_V','id','score_mode','score_key_strength'])

# we want to predict the quadrant:
y = df['quadrant'].values

In [4]:
#splitting by pianist and piecw!

#create tags_dataframe:
X_tags=pd.DataFrame()
X_tags['id']=df['id']


#extract piece_id and pianist to later allow by piece/pianist/both cross validation
def extractPianist(x):
    return x[0:2]
def extract_piece_id(x):
    return x[3:5]
def extract_snippet_number(x):
    return x[6:9]

X_tags['Pianist']=X_tags['id'].apply(extractPianist)
X_tags['Piece_id']=X_tags['id'].apply(extract_piece_id)
X_tags['Snippet_number']=X_tags['id'].apply(extract_snippet_number)

X_tags.head()

Unnamed: 0,id,Pianist,Piece_id,Snippet_number
0,GG-01-000,GG,1,0
1,GG-01-001,GG,1,1
2,GG-01-002,GG,1,2
3,GG-01-003,GG,1,3
4,GG-01-004,GG,1,4


In [5]:
#get list of pianists and pieces!
pianist_list=list(set(X_tags['Pianist']))
piece_list=list(set(X_tags['Piece_id']))
pianist_list[0]

'SR'

In [6]:
#by pianist cross validation
score=0
for pianist in pianist_list:
    mask1=X_tags['Pianist']!=pianist
    mask2=X_tags['Pianist']==pianist
    X_train, y_train=X[mask1],y[mask1]
    X_test, y_test=X[mask2],y[mask2]
    #print(X_train)
    clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)
    score+=clf.score(X_test, y_test)
    #print(clf.score(X_test, y_test))

In [7]:
pianist_cross_validation_score=score/len(pianist_list)
print(pianist_cross_validation_score)

0.5113969192460408


In [8]:
#by piece cross validation
#split dataset into train and test data
score=0
for piece in piece_list:
    mask1=X_tags['Piece_id']!=piece
    mask2=X_tags['Piece_id']==piece
    #print(len(mask1))
    X_train, y_train=X[mask1],y[mask1]
    X_test, y_test=X[mask2],y[mask2]
    clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)
    score+=clf.score(X_test, y_test)
    #print(clf.score(X_test, y_test))

In [9]:
piece_cross_validation_score=score/len(piece_list)
print(piece_cross_validation_score)

0.40084637497575004


In [14]:
#by piece and pianist cross validation
#split dataset into train and test data
score=0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
c=0
for i, piece in enumerate(piece_list):
    for j, pianist in enumerate(pianist_list):
        c+=1
        # we can not do ((X_tags['Piece_id'] ==piece) & (X_tags['Pianist']==pianist)) and !((X_tags['Piece_id'] ==piece) & (X_tags['Pianist']==pianist))
        # because the we would have unwanted correaltions between 
        #((X_tags['Piece_id'] ==piece) & (X_tags['Pianist']==pianist))
        #and ((X_tags['Piece_id'] !=piece) & (X_tags['Pianist']==pianist))
        mask2 = ((X_tags['Piece_id'] ==piece) & (X_tags['Pianist']==pianist))
        mask1 = ((X_tags['Piece_id'] !=piece) & (X_tags['Pianist']!=pianist))
                         
        X_train, y_train=X[mask1],y[mask1]
        X_test, y_test=X[mask2],y[mask2]
        #print(X_train)
        clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)
        score+=clf.score(X_test, y_test)
        if c%10==0:
            print(f'percent done: {(c)/(len(pianist_list)*len(piece_list))*100}')
    


percent done: 4.62962962962963
percent done: 9.25925925925926
percent done: 13.88888888888889
percent done: 18.51851851851852
percent done: 23.14814814814815
percent done: 27.77777777777778
percent done: 32.407407407407405
percent done: 37.03703703703704
percent done: 41.66666666666667
percent done: 46.2962962962963
percent done: 50.92592592592593
percent done: 55.55555555555556
percent done: 60.18518518518518
percent done: 64.81481481481481
percent done: 69.44444444444444
percent done: 74.07407407407408
percent done: 78.70370370370371
percent done: 83.33333333333334
percent done: 87.96296296296296
percent done: 92.5925925925926
percent done: 97.22222222222221


In [15]:
both_cross_validation_score=score/(len(pianist_list)*len(piece_list))
print(both_cross_validation_score)

0.3824423747226514


In [None]:
# the score is slightly lower, but one must keep in mind,
# that in order to eliminate all correlations between test and training set
# we dropped (!pianist and piece) and (pianist and !piece) 
# in other words our training set size was quite a bit smaller!