In [None]:
from sklearn.metrics.pairwise import cosine_similarity


"""

paper 2 - ALTERNATIVE SIGNATURE-BASED ('VECTOR SPACE') APPROACH

Clustering GP practices according to similarities in signature paths


Given that data are available up to mid-December 2017, models are trained to predict feedback for September
and October 2017. Model testing takes place on feedback from November and December 2017. Models are trained and
tested only for GP practices which:
    - have at least 2 reviews from the period preceding September 2017
    - received some feedback in November and December 2017

"""

def _summary_rows(depvar_values, signatures):
    """Inner-join dependent-variable values with signatures and return plain row lists.

    Column labels of both inputs are converted to strings so that the shared first
    column (label '0', the GP practice id) can serve as the join key.

    Each returned row is [gp_id, depvar value, signature term, signature term, ...].
    This layout assumes `depvar_values` yields exactly two columns (id, value) --
    TODO confirm against make_depvar.
    """
    y_frame = pd.DataFrame(depvar_values).rename(columns=str)
    x_frame = signatures.rename(columns=str)
    merged = pd.merge(left=y_frame, right=x_frame, left_on='0', right_on='0', how='inner')
    return merged.values.tolist()


def _class_probabilities(similarities, neighbour_labels, cutoff, skip_index=None):
    """Convert one row of similarity scores into a probability for each class 1..5.

    For every class the `cutoff` highest similarities among neighbours carrying that
    label are kept, non-positive scores are discarded, and the per-class sums are
    normalised by the sum over all classes.

    Parameters:
        similarities: similarity of one query vector to every training vector.
        neighbour_labels: integer label of every training vector (same order).
        cutoff: maximum number of neighbours retained per class.
        skip_index: position to ignore (the query itself when predicting on the
            training set), or None to use every neighbour.

    Returns:
        A list of five probabilities, for classes 1 to 5 in that order.
    """
    scores_by_class = {label: [] for label in range(1, 6)}
    for position, score in enumerate(similarities):
        if position == skip_index:
            continue
        bucket = scores_by_class.get(neighbour_labels[position])
        if bucket is None:
            # label outside 1..5 (e.g. a value below 1 truncated to 0)
            print('something went wrong with prediction calculations')
        else:
            bucket.append(score)

    retained = []
    for label in range(1, 6):
        strongest = sorted(scores_by_class[label], reverse=True)[:cutoff]
        retained.append([score for score in strongest if score > 0])

    total = sum(score for group in retained for score in group)
    if not total > 0:
        # No neighbour with positive similarity: there is nothing to weigh the classes
        # by, so fall back to a uniform distribution instead of dividing by zero.
        # The arg-max downstream then resolves the tie to class 1.
        return [1.0 / 5] * 5
    return [sum(group) / total for group in retained]


def _prediction_rows(similarity_matrix, summary_rows, neighbour_labels, cutoff, exclude_self):
    """Build one output row per query: [gp_id, prob1..prob5, predicted class, actual class].

    Rows are produced in the same order as `summary_rows`, so the actual value always
    belongs to the practice the probabilities were computed for.
    """
    rows = []
    for position, similarities in enumerate(similarity_matrix):
        probabilities = _class_probabilities(
            similarities,
            neighbour_labels,
            cutoff,
            skip_index=position if exclude_self else None,
        )
        gp_id = summary_rows[position][0]
        actual = summary_rows[position][1]
        predicted = probabilities.index(max(probabilities)) + 1
        rows.append([str(int(gp_id))] + probabilities + [predicted, int(actual)])
    return rows


def cluster_pred(csv_file, depvar='q9', ignore_nan=True, train = None, test = None, day_min=735008,log=False,degs=2, sigs_with_depvar=True, cutoff = 10):
    """
    Predict the dependent variable per GP practice from cosine similarity between signatures.

    Signatures are built for a "train" and a "test" period. Every practice is then
    assigned a probability for each of the classes 1..5, derived from the `cutoff`
    most similar training signatures per class, and the most probable class is the
    prediction.

    Parameters:
        csv_file: data file handed to make_paths and make_depvar.
        depvar: name of the dependent variable (default 'q9').
        ignore_nan: forwarded to make_paths.
        train, test: non-empty lists of day ID numbers. All days in `train` must
            pre-date all days in `test`. When both are None, `test` defaults to days
            736621..736680 and `train` to day_min..736620.
        day_min: earliest day considered; must not post-date the train/test days.
        log, degs, sigs_with_depvar: forwarded to make_sigs (as log, its second
            positional argument, and include_depvar respectively).
        cutoff: number of nearest training signatures retained per class.

    Returns:
        [train_frame, test_frame]: two DataFrames with columns gp_id, prob1..prob5,
        the predicted class (yhat_train / yhat_test) and the observed class
        (y_train / y_test). When argument validation fails a message is printed
        and an empty tuple is returned instead.

    Notes:
        - Labels are derived with int(), which truncates (4.9 becomes 4) rather than
          rounding to the nearest integer. NOTE(review): confirm this is intended.
        - For the training period a practice's own signature is excluded from its
          neighbours; for the test period all training signatures are used.

    To do:
        - can add options for different distance measures between pairs of vectors

    """

    # preliminary tests
    if test is None and train is None:
        test = list(range(736621,736681))
        train = list(range(day_min,736621))
        if not train:
            # day_min at or beyond the first default test day leaves no training days
            print("AttributeError: select ranges of days for test and train which post-date day_min")
            return()
    elif isinstance(test, list) and isinstance(train, list):
        if len(test) == 0 or len(train) == 0:
            print('TypeError: test and train should be non-empty lists of day ID numbers')
            return()
    else:
        print("TypeError: define test and train attributes or leave default values, i.e. None")
        return()

    first_test_day, last_test_day = min(test), max(test)
    first_train_day, last_train_day = min(train), max(train)

    # test for overlap of train_range and test_range
    if first_test_day <= last_train_day:
        print("AttributeError: select non-overlapping ranges of days for test and train. Also, days included in train should pre-date days included in test")
        return()

    # test whether day_min is prior to day ranges defined in train and test
    if day_min >= first_test_day or day_min > first_train_day:
        print("AttributeError: select ranges of days for test and train which post-date day_min")
        return()

    # length of the prediction window; the last `window` days of the train range
    # play the role of the "future" when fitting
    window = last_test_day - first_test_day + 1

    # "train" period: signatures from data before the final window, targets from the final window
    x_train = make_paths(csv_file, depvar=depvar, min_day=day_min, max_day=last_train_day-window, ignore_nan=ignore_nan)
    x_train = make_sigs(x_train, degs, 'sigs_xtrain.csv', log=log, include_depvar=sigs_with_depvar)
    y_train = make_depvar(csv_file, depvar, list(range(last_train_day-window+1, last_train_day+1)))
    train_rows = _summary_rows(y_train, x_train)

    # "test" period: same construction shifted forward by one window
    x_test = make_paths(csv_file, depvar=depvar, min_day=day_min+window, max_day=first_test_day-1, ignore_nan=ignore_nan)
    x_test = make_sigs(x_test, degs, 'sigs_xtest.csv', log=log, include_depvar=sigs_with_depvar)
    y_test = make_depvar(csv_file, depvar, list(range(first_test_day, last_test_day+1)))
    test_rows = _summary_rows(y_test, x_test)

    train_vectors = np.array([row[2:] for row in train_rows])
    train_labels = [int(row[1]) for row in train_rows]  # int() truncates, see Notes

    # training predictions: every training signature against all the others
    train_similarity = cosine_similarity(train_vectors, train_vectors)
    train_frame = pd.DataFrame(
        _prediction_rows(train_similarity, train_rows, train_labels, cutoff, exclude_self=True),
        columns=['gp_id', "prob1", "prob2", "prob3", "prob4", "prob5", "yhat_train", "y_train"],
    )

    # test predictions: every test signature against all training signatures
    test_vectors = np.array([row[2:] for row in test_rows])
    test_similarity = cosine_similarity(test_vectors, train_vectors)
    test_frame = pd.DataFrame(
        _prediction_rows(test_similarity, test_rows, train_labels, cutoff, exclude_self=False),
        columns=['gp_id', "prob1", "prob2", "prob3", "prob4", "prob5", "yhat_test", "y_test"],
    )

    return [train_frame, test_frame]


# Example invocation, kept disabled so that evaluating this cell only defines the function:
# clus_res = cluster_pred('r_output.csv',depvar='q9', cutoff = 2)

# Printed when the cell has finished evaluating.
print('ok')

In [None]:

def clus_mse_calculator (chosen_cutoffs, csv_file='r_output.csv', depvar='q9'):
    """

    Returns list of train and test MSE errors for predictions for a list of chosen 'cutoff' values.
    The 'cutoff' parameter is used in cluster_pred() function

    Parameters:
        chosen_cutoffs: iterable of 'cutoff' values; cluster_pred() is run once per value.
        csv_file: data file passed to cluster_pred() (default 'r_output.csv').
        depvar: dependent variable passed to cluster_pred() (default 'q9').

    Returns:
        A list with one [cutoff, train_mse, test_mse] entry per chosen cutoff, in the
        order given. An empty iterable yields an empty list.

    """
    clus_res_combo = []
    for c in chosen_cutoffs:
        print('compute results for cutoff=' +str(c))
        train_pred, test_pred = cluster_pred(csv_file, depvar=depvar, cutoff=c)
        train_mse = mean_squared_error(np.array(train_pred['y_train']), np.array(train_pred['yhat_train']))
        test_mse = mean_squared_error(np.array(test_pred['y_test']), np.array(test_pred['yhat_test']))
        clus_res_combo.append([c, train_mse, test_mse])
        print([c, train_mse, test_mse])
    # hand the collected results back to the caller, as the summary line above states
    return clus_res_combo

# NOTE(review): this print is at cell level, outside clus_mse_calculator, so it runs
# whenever the cell is evaluated -- not after any candidate model has been computed.
print('all candidate models completed')

# Sweeps over different ranges of 'cutoff' values, kept commented out:
# clus_res_combo = clus_mse_calculator(list(range(1,31)))
# clus_res_combo1001to1030 = clus_mse_calculator(list(range(1001,1031)))
# clus_res_combo101to130 = clus_mse_calculator(list(range(101,131)))

# NOTE(review): clus_res_combo is not assigned in this cell while the calls above stay
# commented out; on a fresh kernel this loop raises NameError unless the name is
# defined elsewhere -- TODO confirm.
for x in clus_res_combo:
    print(x)

In [None]:
# NOTE(review): t_results / t_candidates are not defined in this cell and `x` is not
# used by the plots below -- possibly left over from another analysis; TODO confirm.
x = pd.DataFrame(t_results, columns=['test_err', 'train_err'])
x['t_val'] = t_candidates

# Split the [cutoff, train_mse, test_mse] entries into one series per quantity.
ns = [entry[0] for entry in clus_res_combo]
tr = [entry[1] for entry in clus_res_combo]
te = [entry[2] for entry in clus_res_combo]

# One scatter plot per phase: training errors first, then testing errors.
for phase_heading, phase_errors in (
    ("Prediction errors in model training", tr),
    ("Prediction errors in model testing", te),
):
    print(phase_heading)
    plt.plot(ns, phase_errors, 'ro')
    plt.xlabel('Number of top "n" models used to identify the most probable Likert-scale response')
    plt.ylabel('average mean squared error')
    plt.show()

In [None]:
# best clustering-based model
# Runs cluster_pred with cutoff=22 (presumably the value chosen from the cutoff
# sweep -- TODO confirm) and keeps its result for the evaluation in the next cell.

clus_res = cluster_pred('r_output.csv',depvar='q9', cutoff = 22)

In [None]:
# Training error of the model above: compares the predicted class (yhat_train)
# with the observed class (y_train) from the first element of clus_res.
mean_squared_error(np.array(clus_res[0]['yhat_train']),np.array(clus_res[0]['y_train']))