# Imports and Data Loading and Analysis

In [76]:
#Interactive 3d plots?
interactive=True


# Imports
import numpy as np
import pandas as pd
import os
os.environ["OMP_NUM_THREADS"] = "2"
import glob
from sklearn.metrics.pairwise import cosine_similarity
import scipy.stats as stats
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from mpl_toolkits import mplot3d
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix


if interactive:
    %matplotlib qt

# Set paths
baseDir = os.path.join(os.getcwd(),'..')
codeDir = os.path.join(baseDir, 'code')
dataDir = os.path.join(baseDir, 'data')
stimDir = os.path.join(baseDir, 'BOLD5000_Stimuli_Shared')

N_TUNING_CURVES = 8

qualtricsData = pd.read_csv(os.path.join(dataDir, "qualtricsDataClean.csv"),index_col='participant')
masterData = pd.read_csv(os.path.join(dataDir, "masterDataClean.csv"),low_memory=False,index_col='participant')
lingDirectionsAverage = pd.read_csv(os.path.join(dataDir, "lingDirectionsAverage.csv"),index_col='presentedImage')
lingDirectionsParticipant = pd.read_csv(os.path.join(dataDir, "lingDirectionsParticipant.csv"),index_col=['presentedImage','participant'])
analogDirectionsParticipant = pd.read_csv(os.path.join(dataDir, f'analogData_{N_TUNING_CURVES}_bins_Participant.csv'),index_col=['presentedImage','participant'])
analogDirectionsAverage = pd.read_csv(os.path.join(dataDir, f'analogData_{N_TUNING_CURVES}_bins_Average.csv'),index_col='presentedImage')

# Test that lists of images match each other
image_list_analog = analogDirectionsAverage.index.to_list()
image_list_ling = lingDirectionsAverage.index.to_list()

assert(image_list_analog==image_list_ling)

Unnamed: 0_level_0,ahead,right,left,sharp_right,slight_right,slight_left,sharp_left
presentedImage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ATM1.jpg,0.0,0.25,0.5,0.25,0.0,0.0,0.5
ATM4.jpg,0.714286,0.0,0.142857,0.0,0.142857,0.285714,0.142857
HorseRaceTrack.jpg,0.0,0.0,0.25,0.5,0.0,0.25,0.5
RVinside2.jpg,1.0,0.142857,0.0,0.0,0.285714,0.142857,0.0
ShowJumping7.jpg,0.75,0.5,0.5,0.0,0.25,0.5,0.0


# Spearman Correlation between the Analog and Linguisitic Similarity Matrix

In [47]:
# Calculate the Similarity between Linguistic and Analog Data
def upper(df):
    try:
        assert(type(df)==np.ndarray)
    except:
        if type(df)==pd.DataFrame:
            df = df.values
        else:
            raise TypeError('Must be np.ndarray or pd.DataFrame')
    mask = np.triu_indices(df.shape[0], k=1)
    return df[mask]

# Spearman Similarity Between Two Cosine Similarity Matrix
def calculateSimilarity(N_TUNING_CURVES):
    df_analog = pd.read_csv(os.path.join(dataDir, f'analogData_{N_TUNING_CURVES}_bins_Average.csv'),index_col=0)
    df_ling = pd.read_csv(os.path.join(dataDir, f'lingDirectionsAverage.csv'),index_col=0)
    
    df_analog_similarity = pd.DataFrame(cosine_similarity(df_analog),index=df_analog.index,columns=df_analog.index)
    df_ling_similarity = pd.DataFrame(cosine_similarity(df_ling),index=df_ling.index,columns=df_ling.index)
    
    analog_upper = upper(df_analog_similarity) 
    ling_upper = upper(df_ling_similarity)
    
    # Now lets measure the Spearman similarity
    result = stats.spearmanr(analog_upper,ling_upper)
    corr = result.correlation
    p_val = result.pvalue 
    print(f'Spearman Correlation: Analog {N_TUNING_CURVES} tuning curves and linguistic averages, r = {corr:.4f}, p = {p_val:.4f}')
    return result,analog_upper,ling_upper

In [77]:
# Printing Similarity Between the different Analog and Linguistic Cosine Similarity Matrix

result_16,analog_upper,ling_upper = calculateSimilarity(16)

list_tuning_curves=[8,16,37]
for i in list_tuning_curves:
    calculateSimilarity(i)
    


Spearman Correlation: Analog 16 tuning curves and linguistic averages, r = 0.4029, p = 0.0000
Spearman Correlation: Analog 8 tuning curves and linguistic averages, r = 0.4033, p = 0.0000
Spearman Correlation: Analog 16 tuning curves and linguistic averages, r = 0.4029, p = 0.0000
Spearman Correlation: Analog 37 tuning curves and linguistic averages, r = 0.3941, p = 0.0000


In [78]:
import random 
index = np.arange(len(analog_upper))
corrs = []
for i in np.arange(1,100):
    random.shuffle(index)
    result = stats.spearmanr(analog_upper[index],ling_upper)
    corrs.append(result.correlation)


In [79]:
print(result_16.correlation)
plt.hist(corrs)
plt.axvline(x=result_16.correlation,color='red',linestyle='--')
plt.show()

0.4029312207963288


In [81]:
seed = 1

# Optimal number Of clusters
def OptimalClusterElbowMethod(df,max_clusters=15):
    cs = []
    plt.figure(figsize=(10,6))
    for i in range(1, max_clusters):
        kmeans = KMeans(n_clusters = i, init = 'k-means++',random_state=seed)
        kmeans.fit(df)
        cs.append(kmeans.inertia_)
    
    # Plot fits by cluster number
    plt.plot(range(1, max_clusters), cs)
    plt.title('The Elbow Method')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Inertia')
    plt.show()
    
# Cluster the Data into 3 groups.
def clustering(df, clusters, show_elbow=True):
    
    if show_elbow:
        OptimalClusterElbowMethod(df)

    # Number of Optimized clusters here would be 3.
    # defining the kmeans function with initialization as k-means++
    kmeans = KMeans(n_clusters=clusters, init='k-means++',random_state=seed)

    # fitting the k means algorithm on scaled data
    kmeans.fit(df)

    pred = kmeans.predict(df)
    df['cluster'] = pred
    df['cluster'].value_counts()
    

    
    #Getting unique labels
    label = np.unique(pred)

    return df


# Clustering Linguistic and analog Data and Plot the average Clusters

In [82]:

# Plot the Clustering by taking the average and merge into one 
# right, slight_right and sharp_right merge into right_average 
# left, slight_left and sharp_left merge into left_average 
# Including the Ahead in the the Right Side

def plotClustersData(df,clusters):
    
    df = clustering(df, clusters)
    
    print(df.groupby(['cluster']).mean())
    
    colors = ['blue','orange','purple','red','green','yellow','orange','black','magenta','pink','grey']

    assert(clusters < len(colors))
    
    # Creating figure
    fig = plt.figure(figsize = (16, 9))
    ax = plt.axes(projection ="3d")

    # Add x, y gridlines
    ax.grid(visible = True, color ='grey',
            linestyle ='-.', linewidth = 0.3,
            alpha = 0.2)
    
    df['right_average'] = df.loc[:, ["right","sharp_right","slight_right"]].mean(axis = 1)
    df['left_average'] = df.loc[:, ["left","sharp_left","slight_left",]].mean(axis = 1)

    # Creating plot
    df0 = df[df['cluster'] == 0]
    sctt = ax.scatter3D(df0['right_average'],
                        df0['left_average'], 
                        df0['ahead'],
                        alpha = 0.8,
                        c = colors[0],
                        label=f'1 Cluster')

    #Print the Clusters 
    for i in np.arange(1,clusters):
        df_i = df[df['cluster'] == i]

    
        sctt=ax.scatter3D(df_i['right_average'],
                  df_i['left_average'], 
                  df_i['ahead'],
                  alpha = 0.8,
                  c = colors[i],
                  label=f'{i} Cluster')
    
    plt.title("Average direction by cluster")
    ax.set_xlabel('Right Average', fontweight ='bold')
    ax.set_ylabel('Left Average', fontweight ='bold')
    ax.set_zlabel('Ahead Average', fontweight ='bold')
    ax.view_init(0,70)
    
    # show plot
    plt.show()
    
    

In [84]:
plotClustersData(lingDirectionsAverage,3)

# SVM Classification for Prediction on Analog Data and Linguistic Data

In [89]:
def ClassificationData(df1,df2,clusters,show_plot=False):
    df1_cluster = clustering(df1,clusters,show_elbow=False)
    df2_cluster = clustering(df2,clusters,show_elbow=False)
    
    
    # Declare feature vector and target variable
    X = df1_cluster.drop(['cluster'], axis=1)
    y = df2_cluster['cluster']

    # split X and y into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify=y )

    # check the shape of X_train and X_test
    print("Checking Shape of Data")
    print(X_train.shape)
    print(X_test.shape)

    # instantiate classifier with default hyperparameters
    svc=SVC(kernel='rbf', C=10000.0) 

    # fit classifier to training set
    svc.fit(X_train,y_train)

    # make predictions on test set
    y_pred=svc.predict(X_test)

    # compute and print accuracy score
    print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

    # Print the Confusion Matrix and slice it into four pieces
    cm = confusion_matrix(y_test, y_pred)
    #print('Confusion matrix\n\n', cm)
    
    if show_plot:
        plot_confusion_matrix(svc,X_test,y_test)

    #Print the Classification Report
    print ("Classification Report is")
    print(classification_report(y_test, y_pred))
    
    return accuracy_score(y_test,y_pred)
    
scores = []
for i in np.arange(3,12):
    scores.append(ClassificationData(analogDirectionsAverage,lingDirectionsAverage,i,False))
    
fig = plt.figure(figsize = (16, 9))
plt.plot(scores)
fig.show()

Checking Shape of Data
(400, 7)
(100, 7)
Model accuracy score: 0.4600
Classification Report is
              precision    recall  f1-score   support

           0       0.52      0.53      0.53        43
           1       0.52      0.50      0.51        24
           2       0.33      0.33      0.33        33

    accuracy                           0.46       100
   macro avg       0.46      0.46      0.46       100
weighted avg       0.46      0.46      0.46       100

Checking Shape of Data
(400, 7)
(100, 7)
Model accuracy score: 0.4200
Classification Report is
              precision    recall  f1-score   support

           0       0.39      0.42      0.41        33
           1       0.48      0.58      0.53        24
           2       0.36      0.36      0.36        25
           3       0.50      0.28      0.36        18

    accuracy                           0.42       100
   macro avg       0.43      0.41      0.41       100
weighted avg       0.42      0.42      0.41      

# Classifier on Analog Data

In [91]:
scores = []
for i in np.arange(3,12):
    scores.append(ClassificationData(analogDirectionsAverage, analogDirectionsAverage, i))
    
fig = plt.figure(figsize = (16, 9))
plt.plot(scores)
fig.show()


Checking Shape of Data
(400, 9)
(100, 9)
Model accuracy score: 0.9200
Classification Report is
              precision    recall  f1-score   support

           0       0.89      0.89      0.89        36
           1       0.91      1.00      0.95        31
           2       0.97      0.88      0.92        33

    accuracy                           0.92       100
   macro avg       0.92      0.92      0.92       100
weighted avg       0.92      0.92      0.92       100

Checking Shape of Data
(400, 9)
(100, 9)
Model accuracy score: 0.8700
Classification Report is
              precision    recall  f1-score   support

           0       0.83      0.81      0.82        36
           1       0.93      0.93      0.93        15
           2       0.85      0.88      0.87        33
           3       0.94      0.94      0.94        16

    accuracy                           0.87       100
   macro avg       0.89      0.89      0.89       100
weighted avg       0.87      0.87      0.87      

# Classifier on Linguistic Data

In [27]:
scores = []
for i in np.arange(3,12):
    scores.append(ClassificationData(lingDirectionsAverage, lingDirectionsAverage, i))
    
fig = plt.figure(figsize = (16, 9))
plt.plot(np.arange(3,12),scores)
fig.show()


Checking Shape of Data
(400, 9)
(100, 9)
Model accuracy score: 0.8900
Classification Report is
              precision    recall  f1-score   support

           0       0.85      0.85      0.85        33
           1       0.97      0.84      0.90        38
           2       0.85      1.00      0.92        29

    accuracy                           0.89       100
   macro avg       0.89      0.90      0.89       100
weighted avg       0.90      0.89      0.89       100

Checking Shape of Data
(400, 9)
(100, 9)
Model accuracy score: 0.8800
Classification Report is
              precision    recall  f1-score   support

           0       1.00      0.78      0.88         9
           1       0.86      0.83      0.84        29
           2       0.96      0.93      0.95        29
           3       0.81      0.91      0.86        33

    accuracy                           0.88       100
   macro avg       0.91      0.86      0.88       100
weighted avg       0.89      0.88      0.88      

# Cross-classifying Analog and Linguistic Data

In [88]:
ClassificationData(lingDirectionsAverage, analogDirectionsAverage, 3,True)

Checking Shape of Data
(400, 9)
(100, 9)
Model accuracy score: 0.5700
Classification Report is
              precision    recall  f1-score   support

           0       0.61      0.65      0.63        43
           1       0.50      0.35      0.41        20
           2       0.55      0.59      0.57        37

    accuracy                           0.57       100
   macro avg       0.55      0.53      0.54       100
weighted avg       0.57      0.57      0.56       100





0.57