In [127]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from datetime import datetime, timedelta
from sklearn.model_selection import KFold
from factor_analyzer import factor_analyzer, FactorAnalyzer
from sklearn.decomposition import PCA
import import_ipynb
import SplitData as SD
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import pdfkit as pdf
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import validation_curve
from sklearn.linear_model import Ridge
import pickle
import torch
from torch import nn
from torch.autograd import Variable
from torch.utils.data.dataset import Dataset, TensorDataset
from sklearn.model_selection import train_test_split

# Load Data

In [128]:
# Read every non-hidden file in ./New_Data/ into a dict keyed by the file name without '.csv'.
inputs = {}
visible_files = [entry for entry in os.listdir('./New_Data/') if not entry.startswith('.')]
for file_name in visible_files:
    inputs[file_name.replace('.csv', '')] = pd.read_csv('./New_Data/' + file_name, index_col = 0)

## Processing Data

In [129]:
# Apply the four SplitData helpers to the raw frames, in this fixed order.
for split_step in (SD.splitActivities, SD.splitAudio, SD.splitDark, SD.splitConversation):
    split_step(inputs)

### Load Processed Data

In [130]:
# Reload the processed tables from ./Final_Data/, treating zeros as missing values.
inputs = {}
for file_name in os.listdir('./Final_Data/'):
    if not file_name.startswith('.'):
        frame = pd.read_csv('./Final_Data/' + file_name, index_col = 0)
        inputs[file_name.replace('.csv', '')] = frame.replace(0, np.nan)

# Give the two Spark exports their short names.
for old_key, new_key in (('sms_spark', 'sms'), ('call_log_spark2', 'call_log')):
    inputs[new_key] = inputs.pop(old_key)

In [131]:
def fillMissing(df):
    """Fill gaps along each row by linear interpolation.

    The frame is transposed so that interpolation runs across the original
    columns. A first pass uses the default direction; a second pass with
    ``limit_direction='backward'`` fills the values left at the start of a row.
    The result is transposed back to the original orientation.
    """
    across_columns = df.transpose()
    forward_filled = across_columns.interpolate(method='linear')
    fully_filled = forward_filled.interpolate(method = 'linear', limit_direction='backward')
    return fully_filled.transpose()

In [132]:
# Single-column tables are left untouched; every other table has its gaps filled.
for key, frame in inputs.items():
    if frame.shape[1] == 1:
        continue
    inputs[key] = fillMissing(frame)

### Split into positive, negative, and flourishing score dataframes

In [137]:
flourishing = pd.read_csv('./StudentLife_Dataset/Outputs/FlourishingScale.csv')
panas = pd.read_csv('./StudentLife_Dataset/Outputs/panas.csv')
positive_score=['uid', 'Interested', 'Strong', 'Enthusiastic', 'Proud', 'Alert', 'Inspired', 'Determined ', 'Attentive', 'Active ']
negative_score=['uid', 'Distressed', 'Upset', 'Guilty', 'Scared', 'Hostile ', 'Irritable','Nervous', 'Jittery', 'Afraid ']

# Gather the 'post' rows participant by participant (u00 .. u59) so the final
# frames keep the same uid ordering as a per-uid append would produce.
flour_parts, pos_parts, neg_parts = [], [], []
for i in range(60):
    uid = f"u{i:02d}"
    flour_rows = flourishing[(flourishing['uid'] == uid) & (flourishing['type'] == 'post')]
    flour_parts.append(flour_rows.drop(columns='type'))

    panas_rows = panas[(panas['uid'] == uid) & (panas['type'] == 'post')]
    pos_parts.append(panas_rows[positive_score])
    neg_parts.append(panas_rows[negative_score])

# Index by participant and discard any questionnaire with a missing answer.
df_flour_post = pd.concat(flour_parts, axis = 0).set_index(keys='uid').dropna()
df_pos_post = pd.concat(pos_parts, axis=0).set_index(keys='uid').dropna()
df_neg_post = pd.concat(neg_parts, axis=0).set_index(keys = 'uid').dropna()

### Input data for the flourishing score

In [8]:
## Participants with no complete post flourishing questionnaire ##
full_ids = ['u' + str(f"{i:02d}") for i in range(60)]
ids_flour_post = df_flour_post.index.to_numpy()
quit_ids = list(set(full_ids) - set(ids_flour_post))

### Remove those participants from every sensor table ###
input_keys = inputs.keys()
flour_input = {key: inputs[key].drop(quit_ids, errors='ignore') for key in input_keys}

In [126]:
# ### 3 dimensional Data with (number of participants * number of weeks * number of features) ###
# nWeeks = 10
# input_keys = ['walk','run', 'noise', 'conversation_freq', 'conversation_time', 'dark_freq', 'dark_time']
# n_features = len(input_keys)
# data_3d = np.zeros((len(ids_flour_post), nWeeks, n_features))
# for nWeek in range(10):
#     n = 0
#     for key in input_keys:
#         data_3d[:, nWeek , n] = flour_input[key].iloc[:, nWeek]
#         n += 1
        
        
# file = open('data_y.pickle', 'wb')
# pickle.dump(np_df, file)
# file.close()

# file2 = open('data_x.pickle', 'wb')
# pickle.dump(data_3d, file2)
# file2.close()


83

# METHODS #

### BINARIZATION ###

In [12]:
### Convert score to binary data ###
def binarize(df, threshold):
    """Overwrite ``df`` in place with 0/1 labels and return it.

    Rows whose first column is strictly greater than ``threshold`` are set to 1
    and rows whose first column is less than or equal to it are set to 0. Rows
    that satisfy neither comparison (e.g. NaN) are left as they are. The
    assignment covers every column of the selected rows.
    """
    first_column = df.iloc[:, 0]
    # Both masks are taken before any write, so the order of the two
    # assignments cannot reclassify a row that was just overwritten.
    above = first_column > threshold
    at_or_below = first_column <= threshold
    df[above] = 1
    df[at_or_below] = 0
    return df

## Method 1

### LSTM 

In [None]:
# All LSTM artefacts live directly in the current working directory.
basic = os.getcwd()
save_path, data_path, label_path = (
    f"{basic}/{artefact}"
    for artefact in ("bilstm_params.pkl", "data_x.pickle", "data_y.pickle")
)

In [None]:
# NOTE(review): unpickling can execute arbitrary code; only load files this notebook wrote itself.
# Context managers close the file handles, which the bare open() calls never did.
with open(data_path, "rb") as data_file:
    data = pickle.load(data_file)
with open(label_path, "rb") as label_file:
    # Flatten to 1-D without hard-coding the participant count (previously reshape(37)).
    label = pickle.load(label_file).reshape(-1)

In [None]:
# Hyper Parameters
EPOCH = 10               # number of passes over the training set
BATCH_SIZE = 8           # mini-batch size handed to the training DataLoader
HIDDEN_SIZE = 16         # LSTM hidden units per direction (default for BiRNN)
TIME_STEP = data.shape[1]         # sequence length: axis 1 of `data` (the LSTM is built with batch_first=True)
INPUT_SIZE = data.shape[2]         # features per time step: axis 2 of `data`, passed to nn.LSTM as its input size
LR = 0.01               # learning rate

In [None]:
# Keep 80% of the samples for training and hold out the rest; the fixed random_state makes the split repeatable.
train_x, valid_x, train_y, valid_y = train_test_split(data, label, train_size=0.8, random_state=100)

In [None]:
# Seed PyTorch's random number generator before the model and the shuffling DataLoader are created.
torch.manual_seed(999)    # reproducible

In [None]:
# Convert the NumPy splits to tensors, then serve the training set in shuffled mini-batches.
train_x = torch.from_numpy(train_x)
train_y = torch.from_numpy(train_y)
valid_x = torch.from_numpy(valid_x)
valid_y = torch.from_numpy(valid_y)
train_dataset = TensorDataset(train_x, train_y)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class BiRNN(nn.Module):
    """Bidirectional LSTM classifier.

    An ``nn.LSTM`` with ``batch_first=True`` and ``bidirectional=True`` reads the
    sequence; a linear layer maps the output at the last time step
    (``hidden_size * 2`` values, one half per direction) to ``num_classes`` scores.

    Args:
        hidden_size: hidden units per direction.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output scores.

    After each forward pass ``self.hidden`` holds the ``(h_n, c_n)`` tuple
    returned by the LSTM.
    """

    def __init__(self, hidden_size=HIDDEN_SIZE, num_layers=1, num_classes=2):
        super(BiRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(INPUT_SIZE, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_classes)  # 2 for bidirection
        self.hidden = None

    def forward(self, x):
        """Return class scores of shape (batch, num_classes) for input (batch, seq, features)."""
        # Zero initial states, created on the same device and dtype as the input.
        # The module-level `device` is not used here: neither the model nor the
        # batches are ever moved to it, so states placed on it would not match on
        # a CUDA machine.
        state_shape = (self.num_layers*2, x.size(0), self.hidden_size)  # 2 for bidirection
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

        # Forward propagate LSTM
        out, self.hidden = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_size*2)

        # Decode the output of the last time step
        return self.fc(out[:, -1, :])

In [None]:
rnn = BiRNN()   # default arguments: HIDDEN_SIZE hidden units, one layer, two classes
print(rnn)      # show the module layout

In [None]:
optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all rnn parameters
loss_func = nn.CrossEntropyLoss()   # takes the raw class scores from the model and integer class labels

In [None]:
train_losses = []
valid_losses = []
train_accuracy = []
valid_accuracy = []
f = plt.figure(figsize=(12, 5))
ax1 = f.add_subplot(121)
ax2 = f.add_subplot(122)

# Cast the full splits once instead of on every evaluation.
train_inputs, train_targets = train_x.float(), train_y.long()
valid_inputs, valid_targets = valid_x.float(), valid_y.long()

# training and testing
for epoch in range(EPOCH):
    for step, (x, y) in enumerate(train_loader):        # gives batch data
        b_x = x.float()                                 # batch x
        b_y = y.long()                                  # batch y
        output = rnn(b_x)                               # rnn output
        loss = loss_func(output, b_y)                   # cross entropy loss
        optimizer.zero_grad()                           # clear gradients for this training step
        loss.backward()                                 # backpropagation, compute gradients
        optimizer.step()                                # apply gradients
        if step % 5 == 0:
            # Evaluation only: no autograd graph is needed for these passes.
            with torch.no_grad():
                valid_output = rnn(valid_inputs)
                train_output = rnn(train_inputs)
                train_loss = loss_func(train_output, train_targets)
                valid_loss = loss_func(valid_output, valid_targets)
            # Store plain floats so the history does not hold on to tensors.
            train_losses.append(train_loss.item())
            valid_losses.append(valid_loss.item())

            pred_train = torch.max(train_output, 1)[1]
            pred_valid = torch.max(valid_output, 1)[1]
            train_accu = (pred_train == train_targets).sum().item() / float(train_targets.numel())
            valid_accu = (pred_valid == valid_targets).sum().item() / float(valid_targets.numel())
            train_accuracy.append(train_accu)
            valid_accuracy.append(valid_accu)

            # The 'train loss' printed here is the loss of the current mini-batch.
            print('Epoch: ', epoch, '| train loss: %.4f' % loss.item(), '| valid loss: %.4f' % valid_loss.item(),
                  '| train accuracy: %.4f' % train_accu, '| valid accuracy: %.4f' % valid_accu)

ax1.plot(train_losses, label="train loss")
ax1.plot(valid_losses, label="valid loss")
ax1.legend()
ax2.plot(train_accuracy, label="train acc")
ax2.plot(valid_accuracy, label="valid acc")
ax2.legend()

## Method 2


### Get the Final LSTM State

In [None]:
def getHidden():
    """Train a fresh BiRNN on the full data set and return one of its final LSTM states.

    The model is fitted for ``EPOCH`` full-batch steps on the module-level
    ``data`` / ``label`` arrays. ``rnn_knn.hidden`` is the ``(h_n, c_n)`` tuple
    stored by the last forward pass, so ``hidden[1][0]`` is the first slice of
    ``c_n`` (per the nn.LSTM layout: layer 0, forward direction).

    Returns:
        numpy.ndarray: the selected state, one row per sample.
    """
    rnn_knn = BiRNN()
    optimizer = torch.optim.Adam(rnn_knn.parameters(), lr=LR)   # optimize all rnn parameters
    loss_func = nn.CrossEntropyLoss()

    # The tensors do not change between epochs, so build them once.
    b_x = torch.tensor(data).float()                    # full-batch x
    b_y = torch.tensor(label).long()                    # full-batch y

    for epoch in range(EPOCH):
        output = rnn_knn(b_x)                           # rnn output
        loss = loss_func(output, b_y)                   # cross entropy loss
        optimizer.zero_grad()                           # clear gradients for this training step
        loss.backward()                                 # backpropagation, compute gradients
        optimizer.step()                                # apply gradients (missing before, so the weights never changed)

    data_week10 = rnn_knn.hidden[1][0]
    return data_week10.detach().numpy()

### Input and Output

In [13]:
## Participants with no complete post flourishing questionnaire ##
full_ids = ['u' + str(f"{i:02d}") for i in range(60)]
ids_flour_post = df_flour_post.index.to_numpy()
quit_ids = list(set(full_ids) - set(ids_flour_post))

### Remove them, then collapse each sensor table to one total per participant ###
input_keys = inputs.keys()
flour_input = {
    key: inputs[key].drop(quit_ids, errors='ignore').sum(axis=1).to_frame()
    for key in input_keys
}

## input and label for flourishing score
X_flour = pd.DataFrame(columns=None)
for key, totals in flour_input.items():
    totals.columns = [key]
    X_flour = pd.concat([X_flour, totals], axis = 1)

# Label is 1 when the summed questionnaire score is above 44, else 0.
flourishing_total = df_flour_post.sum(axis = 1).to_frame(0)
label_flour = binarize(flourishing_total, 44)

In [14]:
## Participants with no complete post PANAS positive answers ##
full_ids = ['u' + str(f"{i:02d}") for i in range(60)]
ids_pos_post = df_pos_post.index.to_numpy()
quit_ids = list(set(full_ids) - set(ids_pos_post))

### Remove them, then collapse each sensor table to one total per participant ###
input_keys = inputs.keys()
pos_input = {
    key: inputs[key].drop(quit_ids, errors='ignore').sum(axis=1).to_frame()
    for key in input_keys
}

## input and label for positive score
X_pos = pd.DataFrame(columns=None)
for key, totals in pos_input.items():
    totals.columns = [key]
    X_pos = pd.concat([X_pos, totals], axis = 1)

# Label is 1 when the summed positive score is above 29, else 0.
positive_total = df_pos_post.sum(axis = 1).to_frame(0)
label_pos = binarize(positive_total, 29)

In [15]:
## Participants with no complete post PANAS negative answers ##
full_ids = ['u' + str(f"{i:02d}") for i in range(60)]
ids_neg_post = df_neg_post.index.to_numpy()
quit_ids = list(set(full_ids) - set(ids_neg_post))

### Remove them, then collapse each sensor table to one total per participant ###
input_keys = inputs.keys()
neg_input = {}
for key in input_keys:
    neg_input[key] = inputs[key].drop(quit_ids, errors='ignore')
# Iterate over neg_input itself; this loop previously read pos_input.keys(), which
# made the cell depend on the positive-score cell having been run first.
for key in neg_input.keys():
    neg_input[key] = neg_input[key].sum(axis=1).to_frame()
X_neg = pd.DataFrame(columns=None)

## input and label for negative score
for key in neg_input:
    neg_input[key].columns = [key]
    X_neg = pd.concat([X_neg, neg_input[key]], axis = 1)
# Label is 1 when the summed negative score is above 16, else 0.
label_neg = binarize(df_neg_post.sum(axis = 1).to_frame(0), 16)

### KNN

In [16]:
def KNN(features, y, n_neighbour):
    """Score a k-nearest-neighbours classifier with shuffled 5-fold cross-validation.

    Args:
        features: 2-D array, indexed as ``features[rows, :]``.
        y: array of labels, indexed as ``y[rows]``.
        n_neighbour: value passed to ``KNeighborsClassifier(n_neighbors=...)``.

    Returns:
        (train_score, valid_score): two DataFrames with one row per scored fold
        and the columns accuracy, precision, recall, fscore, roc_auc
        (precision/recall/fscore use weighted averaging). A fold whose metrics
        raise ``ValueError`` prints 'error' and contributes no row; if every
        fold is skipped both frames are empty.
    """
    metric_names = ['accuracy', 'precision', 'recall', 'fscore', 'roc_auc']
    train_rows = []
    valid_rows = []
    kf = KFold(n_splits=5, shuffle=True)
    for train, test in kf.split(features):
        X_train, X_test = features[train, :], features[test, :]
        y_train, y_test = y[train], y[test]
        neighbour = KNeighborsClassifier(n_neighbors=n_neighbour).fit(X_train, y_train)
        pred_test = neighbour.predict(X_test)
        pred_train = neighbour.predict(X_train)
        try:
            auc_test = roc_auc_score(y_test, pred_test)
            acc_test = accuracy_score(y_test, pred_test)
            precision_test, recall_test, fscore_test, _ = precision_recall_fscore_support(y_test, pred_test, average='weighted')
            auc_train = roc_auc_score(y_train,  pred_train)
            acc_train = accuracy_score(y_train, pred_train)
            precision_train, recall_train, fscore_train, _ = precision_recall_fscore_support(y_train, pred_train, average='weighted')
        except ValueError:
            print('error')
            continue
        train_rows.append(pd.DataFrame(
            [[acc_train, precision_train, recall_train, fscore_train, auc_train]], columns=metric_names))
        valid_rows.append(pd.DataFrame(
            [[acc_test, precision_test, recall_test, fscore_test, auc_test]], columns=metric_names))
    # Every per-fold frame keeps its own index (0), exactly as repeated appends would.
    train_score = pd.concat(train_rows, axis=0) if train_rows else pd.DataFrame()
    valid_score = pd.concat(valid_rows, axis=0) if valid_rows else pd.DataFrame()
    return train_score, valid_score

### Optimisation

In [None]:
### ROC and AUC to find the optimal k ###
def roc_auc_comparison(features, y):
    """Pick the number of neighbours (2..9) with the best cross-validated ROC AUC.

    For every candidate k a KNN classifier is scored with shuffled 4-fold
    cross-validation; the mean AUC per k is drawn on the current matplotlib
    axes. Folds for which the AUC cannot be computed (``ValueError``) are left
    out of the mean.

    Args:
        features: 2-D array, indexed as ``features[rows, :]``.
        y: array of labels, indexed as ``y[rows]``.

    Returns:
        The k whose mean AUC is highest.

    Raises:
        ValueError: if no fold produced an AUC for any k.
    """
    candidate_ks = range(2, 10)
    kf = KFold(n_splits=4, shuffle=True)
    scores = []
    for k in candidate_ks:
        fold_aucs = []
        for train, test in kf.split(features):
            X_train, X_test, y_train, y_test = features[train, :], features[test, :], y[train], y[test]
            model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
            pred = model.predict_proba(X_test)[:,1]
            try:
                fold_aucs.append(roc_auc_score(y_test, pred))
            except ValueError:
                # This fold could not be scored; skip it instead of counting it as 0.
                continue
        # A k without any scored fold gets NaN rather than a division by zero.
        scores.append(sum(fold_aucs) / len(fold_aucs) if fold_aucs else np.nan)

    if np.all(np.isnan(scores)):
        raise ValueError('ROC AUC could not be computed for any fold or any number of neighbours')
    n_neighbour = np.nanargmax(scores) + candidate_ks[0]
    plt.plot(candidate_ks,scores,label='AUC_score',color='grey')
    plt.xlabel('number of neighbours')
    plt.ylabel('AUC score')
    return n_neighbour

### Feature Selection

In [None]:
def KNN_feature_selection(X, label, name):
    """Count how often each feature survives RFE and leads to a useful KNN model.

    The procedure is repeated 100 times: recursive feature elimination with a
    random forest picks a feature subset, the best k is searched on that
    subset, and KNN is cross-validated. When the mean validation accuracy is
    above 0.5, every selected feature (RFE rank 1) gets one vote. The vote
    counts are drawn as a bar chart and saved under ./Images/.

    Args:
        X: DataFrame of features, one column per feature.
        label: DataFrame (or Series) holding the binary labels.
        name: score name used in the plot title and the output file name.

    Returns:
        numpy.ndarray: positions of the five most frequently kept features,
        least frequent of the five first.
    """
    count = [0] *len(X.columns)
    y = label.to_numpy().ravel()
    for i in range(100):
        rfe = RFE(RandomForestClassifier()).fit(X, y)

        # roc_auc_comparison and KNN index with features[rows, :], so pass plain arrays.
        selected_features = np.asarray(rfe.transform(X))
        n = roc_auc_comparison(features=selected_features, y=y)
        # KNN returns (train_score, valid_score) frames, not five scalars.
        _, valid_score = KNN(selected_features, y=y, n_neighbour=n)
        if valid_score.empty or valid_score['accuracy'].mean() <= 0.5:
            continue
        for j, rank in enumerate(rfe.ranking_):
            if rank == 1:
                count[j] += 1
    plt.show()
    temp = pd.DataFrame(count, index = X.columns, columns=['freq']).sort_values(by = ['freq'], ascending=False)
    plt.barh(temp.index, temp.iloc[:,0])
    title = 'Feature Importance for ' + name + ' score in KNN'
    plt.title(title)
    path = './Images/knn_importance_' + name + '.png'
    plt.xlabel('Frequency')
    plt.savefig(path, bbox_inches='tight')
    plt.show()
    return np.array(count).argsort()[-5:]

#### Flourishing score

In [123]:
# Columns of X_flour used as predictors for the flourishing label.
chosen = ['conversation_time', 'noise', 'run', 'walk', 'social']
# Smaller alternative set:
#chosen = ['conversation_time', 'noise', 'run']
X_flour_chosen = X_flour.loc[:, chosen]

In [124]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
# The arrays are the same for every split, so convert them once.
X = X_flour_chosen.to_numpy()
y = label_flour.to_numpy()
# 20 repetitions of shuffled 4-fold cross-validation; accuracy is printed per fold.
for i in range(20):
    kf = KFold(n_splits=4, shuffle = True)
    print('iteration', i)
    for train, test in kf.split(X_flour_chosen):
        X_train, y_train = X[train, :], y[train]
        X_test, y_test = X[test, :], y[test]
        gnb = gnb.fit(X_train, y_train)
        y_train_pred = gnb.predict(X_train)
        y_pred = gnb.predict(X_test)
        for tag, truth, guess in (('train', y_train, y_train_pred), ('test', y_test, y_pred)):
            print(tag)
            print(accuracy_score(truth, guess))

iteration 0
train
0.8148148148148148
test
0.2
train
0.6428571428571429
test
0.7777777777777778
train
0.6785714285714286
test
0.5555555555555556
train
0.6428571428571429
test
0.2222222222222222
iteration 1
train
0.7407407407407407
test
0.5
train
0.5714285714285714
test
0.4444444444444444
train
0.6428571428571429
test
0.5555555555555556
train
0.6071428571428571
test
0.4444444444444444
iteration 2
train
0.7037037037037037
test
0.3
train
0.6071428571428571
test
0.4444444444444444
train
0.5357142857142857
test
0.6666666666666666
train
0.6071428571428571
test
0.6666666666666666
iteration 3
train
0.7037037037037037
test
0.5
train
0.6428571428571429
test
0.5555555555555556
train
0.6785714285714286
test
0.5555555555555556
train
0.7857142857142857
test
0.2222222222222222
iteration 4
train
0.7407407407407407
test
0.5
train
0.6785714285714286
test
0.4444444444444444
train
0.6785714285714286
test
0.4444444444444444
train
0.6785714285714286
test
0.3333333333333333
iteration 5
train
0.629629629629629

#### Positive Score

In [109]:
# Columns of X_pos used as predictors for the positive-score label.
chosen = ['conversation_time', 'conversation_freq', 'noise']
# Smaller alternative set:
#chosen = ['conversation_time', 'conversation_freq']
X_pos_chosen = X_pos.loc[:, chosen]

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
# The arrays are the same for every split, so convert them once.
X = X_pos_chosen.to_numpy()
y = label_pos.to_numpy()
# 20 repetitions of shuffled 4-fold cross-validation; accuracy is printed per fold.
for i in range(20):
    kf = KFold(n_splits=4, shuffle = True)
    print('iteration', i)
    for train, test in kf.split(X_pos_chosen):
        X_train, y_train = X[train, :], y[train]
        X_test, y_test = X[test, :], y[test]
        gnb = gnb.fit(X_train, y_train)
        y_train_pred = gnb.predict(X_train)
        y_pred = gnb.predict(X_test)
        for tag, truth, guess in (('train', y_train, y_train_pred), ('test', y_test, y_pred)):
            print(tag)
            print(accuracy_score(truth, guess))

In [121]:
# Columns of X_neg used as predictors for the negative-score label.
chosen = ['dark_time', 'noise', 'dark_freq', 'run', 'sms']
X_neg_chosen = X_neg.loc[:, chosen]

In [122]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
# The arrays are the same for every split, so convert them once.
X = X_neg_chosen.to_numpy()
y = label_neg.to_numpy()
# 20 repetitions of shuffled 4-fold cross-validation; accuracy is printed per fold.
for i in range(20):
    kf = KFold(n_splits=4, shuffle = True)
    print('iteration', i)
    for train, test in kf.split(X_neg_chosen):
        X_train, y_train = X[train, :], y[train]
        X_test, y_test = X[test, :], y[test]
        gnb = gnb.fit(X_train, y_train)
        y_train_pred = gnb.predict(X_train)
        y_pred = gnb.predict(X_test)
        for tag, truth, guess in (('train', y_train, y_train_pred), ('test', y_test, y_pred)):
            print(tag)
            print(accuracy_score(truth, guess))

iteration 0
train
0.6896551724137931
test
0.5
train
0.7586206896551724
test
0.8
train
0.7586206896551724
test
0.7
train
0.8666666666666667
test
0.4444444444444444
iteration 1
train
0.7586206896551724
test
0.8
train
0.7931034482758621
test
0.8
train
0.8620689655172413
test
0.6
train
0.7
test
0.5555555555555556
iteration 2
train
0.7586206896551724
test
0.7
train
0.7241379310344828
test
0.8
train
0.7931034482758621
test
0.6
train
0.8
test
0.5555555555555556
iteration 3
train
0.7931034482758621
test
0.7
train
0.7931034482758621
test
0.7
train
0.7241379310344828
test
0.5
train
0.7666666666666667
test
0.7777777777777778
iteration 4
train
0.7931034482758621
test
0.6
train
0.7931034482758621
test
0.6
train
0.7931034482758621
test
0.7
train
0.7666666666666667
test
0.7777777777777778
iteration 5
train
0.7241379310344828
test
0.9
train
0.7586206896551724
test
0.3
train
0.7931034482758621
test
0.7
train
0.7666666666666667
test
0.7777777777777778
iteration 6
train
0.7586206896551724
test
0.8
train


### Evaluation

#### Flourishing Score

In [None]:
# 100 repetitions: search the best k, then cross-validate KNN with it.
# KNN returns (train_score, valid_score) frames; the validation frame is averaged
# over its folds so each repetition contributes one column of five metrics.
features_flour = X_flour_chosen.to_numpy()
target_flour = label_flour.to_numpy()
run_means = []
optimalK = []
for i in range(100):
    n = roc_auc_comparison(features_flour, target_flour)
    optimalK.append(n)
    _, valid_score = KNN(features_flour, target_flour, n)
    if not valid_score.empty:          # every fold may have been skipped
        run_means.append(valid_score.mean(axis=0).to_numpy())
plt.title('Find Optimal K for KNN for predicting Flourishing Score')
plt.savefig('./Images/k_flour.png', bbox_inches='tight')
plt.show()

temp = np.unique(np.array(optimalK),return_counts=True)
index = temp[0]
freq = temp[1]
plt.barh(index , freq)
plt.title('Frequency of each K selected to be an optimal for Flourishing Score in KNN')
plt.ylabel('K')
plt.xlabel('frequency')
path = './Images/knn_optimalK_four.png'
plt.savefig(path, bbox_inches='tight')
plt.show()
# Rows are metrics, columns are repetitions.
score = pd.DataFrame(np.array(run_means).reshape(-1, 5).T,
                     index=['acc', 'precision', 'recall', 'fscore', 'roc_auc'])

In [None]:
# Average each metric over all repetitions and show the table.
temp = score.mean(axis = 1).to_frame(name='Score for Flourishing')
temp

#### Positive Score

In [None]:
# 100 repetitions: search the best k, then cross-validate KNN with it.
# KNN returns (train_score, valid_score) frames; the validation frame is averaged
# over its folds so each repetition contributes one column of five metrics.
features_pos = X_pos_chosen.to_numpy()
target_pos = label_pos.to_numpy()
run_means = []
optimalK = []
for i in range(100):
    n = roc_auc_comparison(features_pos, target_pos)
    optimalK.append(n)
    _, valid_score = KNN(features_pos, target_pos, n)
    if not valid_score.empty:          # every fold may have been skipped
        run_means.append(valid_score.mean(axis=0).to_numpy())
plt.title('Find Optimal K for KNN for predicting Positive Score')
plt.savefig('./Images/k_positive.png', bbox_inches='tight')
plt.show()

temp = np.unique(np.array(optimalK),return_counts=True)
index = temp[0]
freq = temp[1]
plt.barh(index , freq)
plt.title('Frequency of each K selected to be an optimal for Positive Score in KNN')
plt.ylabel('K')
plt.xlabel('frequency')
path = './Images/knn_optimalK_pos.png'
plt.savefig(path, bbox_inches='tight')
plt.show()
# Rows are metrics, columns are repetitions.
score = pd.DataFrame(np.array(run_means).reshape(-1, 5).T,
                     index=['acc', 'precision', 'recall', 'fscore', 'roc_auc'])

In [None]:
# Average each metric over all repetitions and show the table.
temp = score.mean(axis = 1).to_frame(name='Score for PANAS Positive')
temp

#### Negative Score

In [None]:
# 100 repetitions: search the best k, then cross-validate KNN with it.
# KNN returns (train_score, valid_score) frames; the validation frame is averaged
# over its folds so each repetition contributes one column of five metrics.
features_neg = X_neg_chosen.to_numpy()
target_neg = label_neg.to_numpy()
run_means = []
optimalK = []
for i in range(100):
    n = roc_auc_comparison(features_neg, target_neg)
    optimalK.append(n)
    _, valid_score = KNN(features_neg, target_neg, n)
    if not valid_score.empty:          # every fold may have been skipped
        run_means.append(valid_score.mean(axis=0).to_numpy())


plt.title('Find Optimal K for KNN for predicting Negative Score')
plt.savefig('./Images/k_negative.png', bbox_inches='tight')
plt.show()


temp = np.unique(np.array(optimalK),return_counts=True)
index = temp[0]
freq = temp[1]
plt.barh(index , freq)
plt.title('Frequency of each K selected to be an optimal for Negative Score in KNN')
plt.ylabel('K')
plt.xlabel('frequency')
path = './Images/knn_optimalK_neg.png'
plt.savefig(path, bbox_inches='tight')
plt.show()
# Rows are metrics, columns are repetitions.
score = pd.DataFrame(np.array(run_means).reshape(-1, 5).T,
                     index=['acc', 'precision', 'recall', 'fscore', 'roc_auc'])

In [None]:
# Average each metric over all repetitions and show the table.
temp = score.mean(axis = 1).to_frame(name='Score for PANAS Negative')
temp

## Method 3

### Random Forest 

### Optimisation

In [None]:
def random_forest_validation(X, y, name, rang, score_type):
    """Plot a validation curve for one random-forest hyperparameter and return its best value.

    Args:
        X: feature table.
        y: label DataFrame; flattened with ``to_numpy().ravel()``.
        name: hyperparameter to vary (``param_name`` of ``validation_curve``).
        rang: candidate values for that hyperparameter.
        score_type: tag used in the plot title and the saved file name.

    Returns:
        The entry of ``rang`` whose accuracy, summed over the 3 CV folds, is
        highest. That summed accuracy is printed, and the figure is saved to
        ./Images/<name>_<score_type>.png.
    """
    train_score, validation_score = validation_curve(
        RandomForestClassifier(), X, y.to_numpy().ravel(),
        param_name=name, param_range=rang, scoring="accuracy", cv=3)

    # (legend label, line colour, band colour, per-fold scores)
    curves = (
        ("Training score", "red", "pink", train_score),
        ("Cross-validation score", "blue", "lightblue", validation_score),
    )
    summaries = [(np.mean(fold_scores, axis=1), np.std(fold_scores, axis=1))
                 for _, _, _, fold_scores in curves]

    # Mean accuracy lines first, then the +/- one standard deviation bands.
    for (legend_label, line_colour, _, _), (mean, _) in zip(curves, summaries):
        plt.plot(rang, mean, label=legend_label, color=line_colour)
    for (_, _, band_colour, _), (mean, std) in zip(curves, summaries):
        plt.fill_between(rang, mean - std, mean + std, color=band_colour)

    sum_score = validation_score.sum(axis = 1)
    best = sum_score.argmax()
    print(sum_score[best])

    # Create plot
    plt.title("Validation Curve With Random Forest for " + score_type)
    plt.xlabel(name)
    plt.ylabel("Accuracy Score")
    plt.tight_layout()
    plt.legend(loc="best")
    plt.savefig('./Images/' + name + '_' + score_type + '.png', bbox_inches='tight')
    plt.show()
    return rang[best]

In [None]:
def optimiseRandomForest(X, label, score_type):
    """Tune four random-forest hyperparameters one at a time.

    Each hyperparameter is searched on its own with ``random_forest_validation``
    (the others stay at their defaults), in the order listed below.

    Returns:
        tuple: (n_estimators, max_depth, min_samples_split, min_samples_leaf).
    """
    search_space = (
        ("n_estimators", np.arange(1, 100, 2)),
        ("max_depth", np.arange(2, 10, 1)),
        ("min_samples_split", np.arange(2, 10, 1)),
        ("min_samples_leaf", np.arange(2, 10, 1)),
    )
    best_values = [
        random_forest_validation(X, label, name, rang, score_type)
        for name, rang in search_space
    ]
    return tuple(best_values)

#### Flourishing Score

In [None]:
# Split first and tune on the training rows only, so the held-out rows used for
# the final scores play no part in choosing the hyperparameters.
X_flour_train, X_flour_valid, label_flour_train, label_flour_valid =  train_test_split(X_flour, label_flour, train_size=0.8)
n_estimators, max_depth, min_samples_split, min_samples_leaf = optimiseRandomForest(X_flour_train, label_flour_train, 'flour')
print(n_estimators, max_depth, min_samples_split, min_samples_leaf)
# random_state=0 matches the positive/negative score cells.
clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf = min_samples_leaf, random_state=0)
# The labels are a one-column frame; pass them as a flat array.
clf.fit(X_flour_train, label_flour_train.to_numpy().ravel())
importance = clf.feature_importances_
importance = pd.DataFrame(importance, index = X_flour.columns.to_numpy())
importance.columns = ['importance']
importance = importance.sort_values(by = ['importance'], ascending=False)
plt.barh(importance.index, importance.iloc[:, 0])
plt.title('Feature Importance for Flourishing Score')
plt.xlabel('importance rate')
plt.savefig('./Images/random_importance_flour.png', bbox_inches='tight')
plt.show()

In [None]:
# Score the fitted forest on the held-out flourishing rows.
pred = clf.predict(X_flour_valid)
acc = accuracy_score(label_flour_valid, pred)
# Precision, recall and F-score are averaged over classes, weighted by class support.
precision, recall, fscore, _ = precision_recall_fscore_support(label_flour_valid, pred, average='weighted')
# AUC is computed from the hard class predictions, not from predicted probabilities.
auc = roc_auc_score(label_flour_valid, pred)

In [None]:
pd.DataFrame({'Scores for Flourishing': [acc, precision, recall, fscore, auc]}, index = ['accuracy', 'precision', 'recall', 'fscore', 'auc'])

#### Positive Score

In [None]:
### Optimisation ###
X_pos_train, X_pos_valid, label_pos_train, label_pos_valid =  train_test_split(X_pos, label_pos, train_size=0.8)
# Tune on the training rows only, so the held-out rows used for the final
# scores play no part in choosing the hyperparameters.
n_estimators, max_depth, min_samples_split, min_samples_leaf = optimiseRandomForest(X_pos_train, label_pos_train, 'positive')

print(n_estimators, max_depth, min_samples_split, min_samples_leaf)
clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf = min_samples_leaf, random_state=0)
# The labels are a one-column frame; pass them as a flat array.
clf.fit(X_pos_train, label_pos_train.to_numpy().ravel())
importance = clf.feature_importances_
importance = pd.DataFrame(importance, index = X_pos_train.columns.to_numpy())
importance.columns = ['importance']
importance = importance.sort_values(by = ['importance'], ascending=False)
plt.barh(importance.index, importance.iloc[:, 0])
plt.title('Feature Importance for Positive Score')
plt.xlabel('importance rate')
plt.savefig('./Images/random_importance_positive.png', bbox_inches='tight')
plt.show()

In [None]:
# Score the fitted forest on the held-out positive-score rows.
pred = clf.predict(X_pos_valid)
acc = accuracy_score(label_pos_valid, pred)
# Precision, recall and F-score are averaged over classes, weighted by class support.
precision, recall, fscore, _ = precision_recall_fscore_support(label_pos_valid, pred, average='weighted')
# AUC is computed from the hard class predictions, not from predicted probabilities.
auc = roc_auc_score(label_pos_valid, pred)

In [None]:
pd.DataFrame({'Scores for Positive': [acc, precision, recall, fscore, auc]}, index = ['accuracy', 'precision', 'recall', 'fscore', 'auc'])

#### Negative Scores

In [None]:
### Optimisation ###
X_neg_train, X_neg_valid, label_neg_train, label_neg_valid =  train_test_split(X_neg, label_neg, train_size=0.8)
# Tune on the training rows only, so the held-out rows used for the final
# scores play no part in choosing the hyperparameters.
n_estimators, max_depth, min_samples_split, min_samples_leaf = optimiseRandomForest(X_neg_train, label_neg_train, 'negative')

print(n_estimators, max_depth, min_samples_split, min_samples_leaf)
clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf = min_samples_leaf, random_state=0)
# Oversample the training rows only. fit_resample replaces fit_sample, which
# newer imbalanced-learn releases no longer provide.
smt = SMOTE()
X_neg_train, label_neg_train = smt.fit_resample(X_neg_train, label_neg_train.to_numpy().ravel())

clf.fit(X_neg_train, label_neg_train)
importance = clf.feature_importances_
importance = pd.DataFrame(importance, index = X_neg.columns.to_numpy())
importance.columns = ['importance']
importance = importance.sort_values(by = ['importance'], ascending=False)
plt.barh(importance.index, importance.iloc[:, 0])
plt.title('Feature Importance for Negative Score')
plt.xlabel('importance rate')
plt.savefig('./Images/random_importance_negative.png', bbox_inches='tight')
plt.show()

In [None]:
# Score the fitted forest on the held-out negative-score rows.
pred = clf.predict(X_neg_valid)
acc = accuracy_score(label_neg_valid, pred)
# Precision, recall and F-score are averaged over classes, weighted by class support.
precision, recall, fscore, _ = precision_recall_fscore_support(label_neg_valid, pred, average='weighted')
# AUC is computed from the hard class predictions, not from predicted probabilities.
auc = roc_auc_score(label_neg_valid, pred)

In [None]:
pd.DataFrame({'Scores for Negative': [acc, precision, recall, fscore, auc]}, index = ['accuracy', 'precision', 'recall', 'fscore', 'auc'])

## Correlation 

### Flourishing Score

In [None]:
pd.concat([X_flour['dark_time'].to_frame(), label_flour], axis = 1).corr()

In [None]:
pd.concat([X_flour['conversation_time'].to_frame(), label_flour], axis = 1).corr()

In [None]:
pd.concat([X_flour['call_log'].to_frame(), label_flour], axis = 1).corr()

In [None]:
pd.concat([X_flour['noise'].to_frame(), label_flour], axis = 1).corr()

### Negative Score

In [None]:
pd.concat([X_neg['dark_freq'].to_frame(), label_neg], axis = 1).corr()

In [None]:
pd.concat([X_neg['dark_time'].to_frame(), label_neg], axis = 1).corr()

In [None]:
pd.concat([X_neg['conversation_time'].to_frame(), label_neg], axis = 1).corr()

In [None]:
pd.concat([X_neg['walk'].to_frame(), label_neg], axis = 1).corr()

### positive score

In [None]:
# Write one correlation table (feature vs. positive label) per feature as HTML.
for key in X_pos.keys():
    corr_table = pd.concat([X_pos[key].to_frame(), label_pos], axis = 1).corr()
    corr_table.to_html('./Images/' + key+'.html')