In [1]:
import os
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
import xgboost as xgb
from sklearn.metrics import matthews_corrcoef as mcc
from scipy.stats.stats import pearsonr
from sklearn.decomposition import PCA
import sklearn.metrics as metrics
from sklearn.metrics import *
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,AdaBoostRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC, SVR
from sklearn.feature_selection import *
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold
from sklearn.neighbors import LocalOutlierFactor
from sklearn import preprocessing
import pickle
import warnings
warnings.simplefilter(action='ignore')
from imblearn.combine import SMOTETomek
# --- Configuration ---------------------------------------------------------
OUT_DIR = "output/results/"  # directory the per-label prediction CSVs are written to
r = 6282019                  # random_state handed to the estimators
is_clf = 0                   # falsy -> regression pipeline, truthy -> classification


# --- Crawl table and crawl_id -> snapshot_id lookup ------------------------
filename = 'data/jordyn_sarah_lefamily_1000.pickle'
df = pd.read_pickle("data/jordyn_sarah.pkl")
# One row per distinct (crawl_id, snapshot_id) pair.
y = df[["crawl_id", "snapshot_id"]].drop_duplicates()
snap_crawl_dict = pd.Series(y.snapshot_id.values, index=y.crawl_id).to_dict()


def getSnapshotId(x):
    """Return the snapshot_id stored in ``snap_crawl_dict`` for crawl id ``x``.

    Raises KeyError when ``x`` is not a key of ``snap_crawl_dict``.
    """
    return snap_crawl_dict[x]


# --- Mitre label file ------------------------------------------------------
df_mitre = pd.read_csv("data/Mitre.csv")
# errors='coerce' turns unparseable dates into NaT; only the calendar date is kept.
mitre_timestamps = pd.to_datetime(df_mitre.Date, format='%m/%d/%Y', errors='coerce')
df_mitre["created_date"] = mitre_timestamps.dt.date

In [3]:
# Feature files that have been tried here (only the assignment below is live):
#   data/features1.pickle
#   data/jordyn_sarah_lefamily.pickle
#   data/7days_100.pkl
filename = 'data/jordyn_sarah_SA.pickle'  # this file has mitre merged already
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open(filename, 'rb') as f:
    df = pickle.load(f)
df = df.drop(columns=["life_event_yes_no"])
# Attach the snapshot each crawl belongs to (KeyError if a crawl_id is unknown).
df['snapshot_id'] = df['crawl_id'].apply(getSnapshotId)
# df = df.dropna(subset=['anxiety.d'])
# df=df[:100]
df.shape, df.columns

((882, 156),
 Index(['crawl_id', 'created_date', 'posts', 'positive_affect',
        'negative_affect', 'anger', 'anxiety', 'sadness', 'swear',
        'cognitive_mech',
        ...
        '94', '95', '96', '97', '98', '99', '100', 'snapshot_id', 'anxiety.d',
        'stress.d'],
       dtype='object', length=156))

In [4]:
df_labels = ["snapshot_id", "created_date", "anxiety.d", "stress.d"]
# df = pd.merge(df,df_mitre[df_labels],on=['snapshot_id','created_date'],how='inner')
# Keep only the rows in which both targets are present.
df = df.dropna(subset=['anxiety.d', 'stress.d'])
df.shape, df.columns


((879, 156),
 Index(['crawl_id', 'created_date', 'posts', 'positive_affect',
        'negative_affect', 'anger', 'anxiety', 'sadness', 'swear',
        'cognitive_mech',
        ...
        '94', '95', '96', '97', '98', '99', '100', 'snapshot_id', 'anxiety.d',
        'stress.d'],
       dtype='object', length=156))

In [5]:
# df["stress.d"].isna().sum()
# df["stress"] = 0
# df["anxiety"] = 0
# # df["stress"].loc(df["stress.d"]>3.0) = 1
# # df[df["anxiety.d"]>3.0]["anxiety"] = 1
# df.loc[df['stress.d'] >2.0, 'stress']  = 1
# df.loc[df['anxiety.d'] >2.0, 'anxiety']  = 1

In [6]:
# df[df["stress.d"]==2.0]["stress"] 

In [7]:
def get_fold(n_records, n_folds, fold):
    """Return the ``fold``-th of ``n_folds`` seeded random 80/20 index splits.

    Both the stdlib and the numpy global RNGs are re-seeded with 1000 on every
    call, so the same arguments always give the same split. Each split is an
    independent shuffle of ``range(n_records)``; splits are not disjoint
    across folds.

    Returns ``[train_indices, test_indices]`` where the first 80% of the
    shuffled indices are the training part.
    """
    import random
    random.seed(1000)
    np.random.seed(1000)
    cut = int(0.8 * n_records)
    # Draw every permutation (not just the requested one) so that the RNG
    # stream, and therefore split number `fold`, is reproducible.
    shuffles = [
        np.random.choice(n_records, n_records, replace=False)
        for _ in range(n_folds)
    ]
    return [[order[:cut], order[cut:]] for order in shuffles][fold]

def smape(A, F):
    """Symmetric mean absolute percentage error, expressed in percent.

    Computes ``100 / n * sum(|F - A| / (|A| + |F|))``. The denominator is not
    halved, so the result lies between 0 and 100.

    Parameters
    ----------
    A : array-like of actual values.
    F : array-like of forecast values, same length as ``A``.

    Returns
    -------
    float : the SMAPE value. Positions where both the actual and the forecast
    are exactly 0 count as a perfect prediction (term 0) instead of producing
    0/0 = NaN and poisoning the whole score.
    """
    A = np.asarray(A, dtype=float)
    F = np.asarray(F, dtype=float)
    abs_err = np.abs(F - A)
    d = np.abs(A) + np.abs(F)
    # d == 0 only when A == F == 0; leave those terms at the pre-filled 0.
    terms = np.divide(abs_err, d, out=np.zeros_like(abs_err), where=d != 0)
    return (100.0 / len(A)) * np.sum(terms)

def madp(A,F):
    """Sum of the absolute deviations |F - A|, each divided by |A|.

    NOTE(review): the result is a sum, not a mean -- it is not divided by the
    number of elements. Confirm that is intended before using it as a
    percentage.
    """
    deviation = np.abs(F - A)
    scale = np.abs(A)
    return np.sum(deviation / scale)

def mse(A,B):
    """Mean of the squared element-wise differences between ``A`` and ``B``.

    The difference ``A - B`` must support ``** 2`` and ``.mean()`` (e.g. numpy
    arrays or pandas Series).
    """
    residual = A - B
    squared = residual ** 2
    return squared.mean()

def distance_error(A,F):
    """Mean absolute error scaled by 1.414 and divided by the range of ``A``.

    ``A`` must provide ``.max()`` and ``.min()``. If all values of ``A`` are
    equal, the range is 0 and the division is by zero.
    """
    spread = A.max() - A.min()
    mean_abs_gap = np.mean(np.abs(A - F))
    return 1.414 * mean_abs_gap / spread
def calculate_error_clf(y, y_pred):
    """Score each estimator's class predictions against the true labels.

    y      : true labels.
    y_pred : indexable collection holding one prediction vector per estimator.

    Returns a list with one row per estimator, each row being
    [weighted F1, weighted precision, weighted recall, accuracy, Matthews corrcoef].
    """
    def _score(pred):
        # Weighted averaging: per-class scores weighted by class support.
        return [
            metrics.f1_score(y, pred, average="weighted"),
            metrics.precision_score(y, pred, average="weighted"),
            metrics.recall_score(y, pred, average="weighted"),
            metrics.accuracy_score(y, pred),
            mcc(y, pred),
        ]

    return [_score(y_pred[idx]) for idx in range(len(y_pred))]

def calculate_error_reg(y, y_pred):
    """Score each estimator's numeric predictions against the true values.

    y      : true target values.
    y_pred : indexable collection holding one prediction vector per estimator.

    Returns a list with one row per estimator, each row being
    [SMAPE, Pearson r, Spearman rho, R^2].
    """
    def _score(pred):
        # pearsonr / spearmanr return (statistic, p-value); keep the statistic only.
        return [
            smape(y, pred),
            pearsonr(y, pred)[0],
            spearmanr(y, pred)[0],
            r2_score(y, pred),
        ]

    return [_score(y_pred[idx]) for idx in range(len(y_pred))]


def evaluate_models(models, model_names, training, labels, testing, fold):
    """Fit every model on the training data and predict the testing data.

    models      : indexable collection of objects exposing ``fit(X, y)`` and ``predict(X)``.
    model_names : not used by this function.
    training    : training features.
    labels      : training targets.
    testing     : features to predict.
    fold        : not used by this function.

    Returns a list with one flattened prediction array per model, in the
    order of ``models``. The models are fitted in place.
    """
    def _fit_then_predict(model):
        model.fit(training, labels)
        return model.predict(testing).reshape(-1)

    return [_fit_then_predict(models[idx]) for idx in range(len(models))]
def get_selected_feats_topp(features, labels, p, is_clf):
    """Select the top ``p`` percent of features by mutual information.

    features : feature matrix.
    labels   : target values.
    p        : percentile of features to keep, passed to SelectPercentile.
    is_clf   : truthy -> mutual_info_classif, falsy -> mutual_info_regression.

    Prints progress and the fitting time. Returns the integer indices of the
    selected feature columns.
    """
    print("Select Top P",p)
    score_func = mutual_info_classif if is_clf else mutual_info_regression
    sel = SelectPercentile(score_func, percentile=p)
    print("Fitting to Top P")
    t0 = time.time()
    sel.fit(features, labels)
    elapsed = time.time() - t0
    print("Time Taken : ", elapsed)
    return sel.get_support(indices=True)

def evaluate_folds(df_features, df_labels, df_ids, cols, estimators, estimator_names, k=500, pca=None, is_clf=False):
    """Cross-validate every estimator on every label column with a seeded 5-fold KFold.

    Parameters
    ----------
    df_features : DataFrame of input features, one row per sample.
    df_labels : DataFrame containing the label columns named in ``cols``.
    df_ids : DataFrame of identifier columns; written next to the predictions.
    cols : iterable of label column names, evaluated one after another.
    estimators : list of objects with ``fit`` / ``predict``; refitted on every fold.
    estimator_names : forwarded to ``evaluate_models``.
    k, pca : kept for backward compatibility; not used by this function.
    is_clf : truthy -> ``calculate_error_clf``, falsy -> ``calculate_error_reg``.

    Returns
    -------
    list with one array per entry of ``cols``; each array has shape
    (n_estimators, n_metrics) and holds the metrics averaged over the 5 folds.

    Side effects
    ------------
    Prints progress information and, for every label column, writes
    ``sm_out_<col>.csv`` into ``OUT_DIR`` (created if missing). Each CSV row is
    one held-out sample: the id columns, the true label, then one prediction
    per estimator. No header row is written.
    """
    all_avgs = []
    kf = KFold(n_splits=5, shuffle=True, random_state=1000)
    print('Features passed in', len(df_features.columns.values))

    # np.savetxt does not create missing directories; make sure the target exists.
    if OUT_DIR:
        os.makedirs(OUT_DIR, exist_ok=True)

    score_fn = calculate_error_clf if is_clf else calculate_error_reg

    for col in cols:
        print('=============== Label type',col,'===============')
        labels_mod = df_labels[col]
        testing_metrics = []
        res_folds = []
        for fold_no, (train_ind, test_ind) in enumerate(kf.split(df_features), start=1):
            print("Fold" + str(fold_no))
            X_train = df_features.iloc[train_ind, :].reset_index(drop=True)
            X_test = df_features.iloc[test_ind, :].reset_index(drop=True)
            y_train, y_test = labels_mod.iloc[train_ind], labels_mod.iloc[test_ind]
            ids_test = df_ids.iloc[test_ind]
            print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)
            print(y_train.value_counts())
            print(y_test.value_counts())

            # One row of predictions per estimator.
            test_pred = np.array(
                evaluate_models(estimators, estimator_names, X_train, y_train, X_test, fold_no)
            )
            testing_metrics.append(score_fn(y_test, test_pred))

            # Stack ids, truth and predictions as rows; transposed before saving.
            res_folds.append(np.vstack((ids_test.T, y_test, test_pred)))

        testing_metrics = np.array(testing_metrics)
        print(testing_metrics)
        # Average over the fold axis -> (n_estimators, n_metrics).
        all_avgs.append(np.mean(testing_metrics, axis=0))

        # Save the out-of-fold predictions of all folds side by side.
        res_folds_all = np.hstack(tuple(res_folds))
        outfile = os.path.join(OUT_DIR, "sm_out_" + col + ".csv")
        np.savetxt(outfile, res_folds_all.T, delimiter=",", fmt="%s")

    return all_avgs

In [45]:
# Build the list of estimators to compare; every model shares random_state=r.
if is_clf:
    gb = GradientBoostingClassifier(learning_rate=0.5, n_estimators=1000, random_state=r)
    # nn is constructed but deliberately left out of `estimators` below.
    nn = MLPClassifier(hidden_layer_sizes=(200,200),solver='adam',random_state=r, max_iter=2000)
    svc = SVC(C=0.1,kernel='poly',probability=True, random_state=r)
    # 'sqrt' is what 'auto' meant for RandomForestClassifier; 'auto' itself is
    # rejected by current scikit-learn releases.
    rfc = RandomForestClassifier(n_estimators=500,max_features='sqrt',random_state=r)
    estimators = [gb, svc, rfc]
    estimator_names = ['GB','SVM','RF']
else:
    rfr = RandomForestRegressor(n_estimators=125, random_state=r)
    gbr = GradientBoostingRegressor(random_state=r)
    # NOTE(review): max_depth=100 allows very deep trees -- confirm this is intended.
    xgbr = xgb.XGBRegressor(objective ='reg:squarederror', random_state=r, max_depth=100)
#     xgbr = xgb.XGBRegressor(objective ='reg:pseudohubererror', random_state=r)

    estimators = [rfr, gbr, xgbr]
    estimator_names = ['rfr','gbr','xbr']

In [46]:
# Print the columns currently in df; the cell's displayed value is the list of
# active estimator names.
print(df.columns)
# df = df.drop(['stress.d','anxiety.d'],axis=1)
# print(df.columns)
estimator_names

Index(['crawl_id', 'created_date', 'posts', 'positive_affect',
       'negative_affect', 'anger', 'anxiety', 'sadness', 'swear',
       'cognitive_mech',
       ...
       '94', '95', '96', '97', '98', '99', '100', 'snapshot_id', 'anxiety.d',
       'stress.d'],
      dtype='object', length=156)


['rfr', 'gbr', 'xbr']

In [47]:
# Columns that must never be used as model inputs: targets, identifiers and
# bookkeeping columns.
pos_labels = ['stress.d','anxiety.d','life_event_yes_no','crawl_id']
labels = ['snapshot_id','created_date']
label_type = ["stress.d", "anxiety.d"]
# extend, not append: append would add the whole list as one nested element,
# which can never equal a column name.
labels.extend(label_type)
excluded = set(labels + pos_labels)
feat_cols = [x for x in df.columns.values if x not in excluded]
feats = df[feat_cols]
print("Feature shape ", feats.shape)
print(feats.columns)
print(is_clf)

Feature shape  (879, 151)
Index(['posts', 'positive_affect', 'negative_affect', 'anger', 'anxiety',
       'sadness', 'swear', 'cognitive_mech', 'discrepancies', 'inhibition',
       ...
       '91', '92', '93', '94', '95', '96', '97', '98', '99', '100'],
      dtype='object', length=151)
0


In [48]:
# Cross-validate every estimator on both targets; sm_res holds one array of
# fold-averaged metrics per entry of label_type.
# Note: evaluate_folds accepts `pca` but never reads it, so pca=True has no effect.
sm_res = evaluate_folds(feats, df[label_type] , df[['snapshot_id','created_date']], label_type, estimators, estimator_names, pca=True, is_clf=is_clf)

Features passed in 151
Fold1
(703, 151) (176, 151) (703,) (176,)
1.0    285
2.0    204
3.0    179
4.0     24
5.0     11
Name: stress.d, dtype: int64
1.0    78
2.0    57
3.0    37
4.0     3
5.0     1
Name: stress.d, dtype: int64
Fold2
(703, 151) (176, 151) (703,) (176,)
1.0    292
2.0    200
3.0    174
4.0     25
5.0     12
Name: stress.d, dtype: int64
1.0    71
2.0    61
3.0    42
4.0     2
Name: stress.d, dtype: int64
Fold3
(703, 151) (176, 151) (703,) (176,)
1.0    294
2.0    215
3.0    171
4.0     17
5.0      6
Name: stress.d, dtype: int64
1.0    69
2.0    46
3.0    45
4.0    10
5.0     6
Name: stress.d, dtype: int64
Fold4
(703, 151) (176, 151) (703,) (176,)
1.0    283
2.0    211
3.0    177
4.0     23
5.0      9
Name: stress.d, dtype: int64
1.0    80
2.0    50
3.0    39
4.0     4
5.0     3
Name: stress.d, dtype: int64
Fold5
(704, 151) (175, 151) (704,) (175,)
1.0    298
2.0    214
3.0    163
4.0     19
5.0     10
Name: stress.d, dtype: int64
1.0    65
3.0    53
2.0    47
4.0     8
5

In [49]:
# Label the fold-averaged metric tables (rows = estimators, columns = metrics)
# and print one table per target.
if is_clf:
    row_names = {0:'GB', 1:'SVM', 2:'RF'}
    col_names = {0:'F1',1:'Precision',2:'Recall',3:'Accuracy',4:'MCC'}
    headings = ["Stress\n", "Anxiety\n"]
else:
    row_names = {0:'rfr', 1:'gbr', 2:'xbr'}
    col_names = {0:'SMAPE',1:'Pearson',2:'Spearman',3:'R2'}
    headings = ["Stress\n", "\nAnxiety\n"]

# sm_res[0] belongs to the stress target, sm_res[1] to the anxiety target.
for pos, heading in enumerate(headings):
    tm = pd.DataFrame(sm_res[pos])
    tm = tm.rename(index=row_names, columns=col_names)
    print(heading, tm)

Stress
          SMAPE   Pearson  Spearman        R2
rfr  21.026421  0.146297  0.099385 -0.060586
gbr  21.195403  0.118363  0.081542 -0.077572
xbr  22.083882  0.084807  0.045498 -0.250149

Anxiety
          SMAPE   Pearson  Spearman        R2
rfr  20.136671  0.174871  0.117380 -0.037407
gbr  19.924270  0.175847  0.164841 -0.022171
xbr  20.129188  0.177314  0.129038 -0.198325


Stress
         SMAPE   Pearson  Spearman        R2
rfr  0.211191  0.107839  0.087510 -0.084117
gbr  0.210426  0.119221  0.101653 -0.057934
xbr  0.208193  0.156896  0.122029 -0.034387
Anxiety
         SMAPE   Pearson  Spearman        R2
rfr  0.200602  0.138481  0.135182 -0.077783
gbr  0.201569  0.120868  0.154095 -0.069693
xbr  0.200832  0.152402  0.157453 -0.032997


Stress
            F1  Precision    Recall  Accuracy       MCC
GB   0.623427   0.615112  0.643903  0.643903  0.055149
SVM  0.590467   0.506597  0.709818  0.709818  0.000000
RF   0.609546   0.639533  0.703013  0.703013  0.051944
Anxiety
            F1  Precision    Recall  Accuracy       MCC
GB   0.793002   0.776816  0.812260  0.812260  0.014520
SVM  0.808349   0.755585  0.869143  0.869143  0.000000
RF   0.816493   0.819927  0.869136  0.869136  0.107051


Stress
            F1  Precision    Recall  Accuracy       MCC
GB   0.351465   0.356364  0.361740  0.361740  0.043945
SVM  0.242043   0.171484  0.412922  0.412922  0.000000
RF   0.333832   0.353209  0.390169  0.390169  0.026406
Anxiety
            F1  Precision    Recall  Accuracy       MCC
GB   0.413297   0.409118  0.431169  0.431169  0.004855
SVM  0.361130   0.276656  0.523214  0.523214  0.000000
RF   0.392926   0.388294  0.466396  0.466396 -0.033853


Stress
            F1
GB   0.043945
SVM  0.000000
RF   0.026406
Anxiety
            F1
GB   0.004855
SVM  0.000000
RF  -0.033853


In [13]:
# Display the raw fold-averaged metric arrays returned by evaluate_folds.
sm_res

[array([[20.98474701,  0.14355997,  0.09907613, -0.06120603],
        [21.19540336,  0.11836271,  0.08154206, -0.07757248],
        [20.91236967,  0.15391654,  0.12611835, -0.04493476]]),
 array([[20.0426375 ,  0.18671116,  0.13339477, -0.02453432],
        [19.92426956,  0.17584693,  0.1648412 , -0.02217106],
        [20.03332646,  0.15994421,  0.15031704, -0.02761903]])]

In [14]:
# Scratch: two small multi-class label vectors for trying out the metric functions.
a=[1,2,3,4,5]
b=[5,6,7,8,7]
mcc(a,b)

-0.05270462766947299

In [15]:
# Only the last expression of a cell is displayed: the mean on the next line is
# computed and then discarded; the cell shows sm_res[1].
np.mean(sm_res[1:3],axis=0)
sm_res[1]

array([[20.0426375 ,  0.18671116,  0.13339477, -0.02453432],
       [19.92426956,  0.17584693,  0.1648412 , -0.02217106],
       [20.03332646,  0.15994421,  0.15031704, -0.02761903]])

In [16]:
# Scratch: per-class precision (average=None gives one value per label) for the
# toy vectors a and b.
# metrics.f1_score(a, b, average=None)
#             mcc(a, b),
metrics.precision_score(a,b, average=None)
#             metrics.recall_score(y, y_pred[i], average=None),
#             metrics.accuracy_score(y, y_pred[i])], axis=0)

array([0., 0., 0., 0., 0., 0., 0., 0.])

In [17]:
# Scratch: concatenate per-class F1, precision and recall with the overall
# accuracy (wrapped in a list so it can be concatenated) into one flat vector.
scores = np.concatenate([
metrics.f1_score(a,b, average=None),
#             mcc(y, y_pred[i]),
metrics.precision_score(a,b, average=None),
metrics.recall_score(a,b, average=None),
[metrics.accuracy_score(a,b)]], axis=0)
# res.append(scores)

In [18]:
# Only the final len(...) is displayed; the three .shape expressions are
# evaluated but their values are not shown.
metrics.f1_score(a,b, average=None).shape
metrics.precision_score(a,b, average=None).shape
metrics.recall_score(a,b, average=None).shape
len([metrics.accuracy_score(a,b)])

1

In [19]:
# Entry at index 2 of the concatenated `scores` vector.
scores[2]


0.0