Notes: This a notebook showing the data processing and modeling based on a **make-up synthetic dataset.**

The purpose of this notebook is to show how the code will work to support our paper "Does Location Matter? The Dominant Role of Temporal Features in Predicting Smoking Events", ensuring the reproducibility of our work. The results do not present the real content of our actual results but the format and frame of it.

If you have any questions, feel free to report an issue within our github repository, or contacting yang8597@umn.edu.

# Data Processing

In [1]:
from functions import *
from tqdm import tqdm
import pandas as pd

df_merged = pd.read_csv('./synthetic_raw_data.csv')

quarter_hours = process_timestamps(df_merged, 'timestamp')
df_quarter_hours = pd.DataFrame(quarter_hours, columns=['day_of_week', 'is_weekend', 'season'] + [f'time_quarter_{i}' for i in range(96)])

df_merged_1 = pd.concat([df_merged, df_quarter_hours], axis=1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 74204.68it/s]


In [3]:
df_merged_2 = process_location_info(df_merged_1, 'person_id', 'longitude', 'latitude')

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 46/46 [00:00<00:00, 138.23it/s]


In [8]:
Counter(df_merged_2['substance'])

Counter({nan: 40139, 'cigarette': 9861})

## Modeling

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, accuracy_score, balanced_accuracy_score
from imblearn.over_sampling import SMOTE
df_modeling = df_merged_2.drop_duplicates()
df_modeling['substance'] = df_modeling['substance'].apply(lambda x: 0 if pd.isna(x) else 1)
df_modeling.shape, df_modeling.columns, df_modeling.head(), Counter(df_modeling['person_id']), len(set(df_modeling['person_id']))

((50000, 106),
 Index(['person_id', 'timestamp', 'longitude', 'latitude', 'substance',
        'is_after_covid', 'day_of_week', 'is_weekend', 'season',
        'time_quarter_0',
        ...
        'time_quarter_87', 'time_quarter_88', 'time_quarter_89',
        'time_quarter_90', 'time_quarter_91', 'time_quarter_92',
        'time_quarter_93', 'time_quarter_94', 'time_quarter_95',
        'cluster_label'],
       dtype='object', length=106),
   person_id   timestamp   longitude   latitude  substance  is_after_covid  \
 0   id0y9do  1643639679 -112.409096  34.456482          0               0   
 1   id0y9do  1643639681 -112.179556  34.880768          0               0   
 2   id0y9do  1643639697 -111.601791  34.599296          0               0   
 3   id0y9do  1643639714 -112.349110  34.955633          0               0   
 4   id0y9do  1643639773 -113.090380  34.687343          0               0   
 
    day_of_week  is_weekend  season  time_quarter_0  ...  time_quarter_87  \
 0    

### Correlation analysis between geo and temporal feature, mixed lm
Examplified with smoking group, in quarter-hour intervals.

In [26]:
import statsmodels.api as sm
from statsmodels.formula.api import mixedlm

df_smk = df_modeling[df_modeling['substance']==1]
df_smk.reset_index(inplace=True)
del df_smk['index']

df_smk0 = df_smk[['person_id','cluster_label', 'day_of_week', 'is_weekend', 'season', 'longitude', 'latitude', 'time_quarter_0', 'time_quarter_1', 'time_quarter_2', 'time_quarter_3', 'time_quarter_4', 'time_quarter_5', 'time_quarter_6', 'time_quarter_7', 'time_quarter_8', 'time_quarter_9', 'time_quarter_10', 'time_quarter_11', 'time_quarter_12', 'time_quarter_13', 'time_quarter_14', 'time_quarter_15', 'time_quarter_16', 'time_quarter_17', 'time_quarter_18', 'time_quarter_19', 'time_quarter_20', 'time_quarter_21', 'time_quarter_22', 'time_quarter_23', 'time_quarter_24', 'time_quarter_25', 'time_quarter_26', 'time_quarter_27', 'time_quarter_28', 'time_quarter_29', 'time_quarter_30', 'time_quarter_31', 'time_quarter_32', 'time_quarter_33', 'time_quarter_34', 'time_quarter_35', 'time_quarter_36', 'time_quarter_37', 'time_quarter_38', 'time_quarter_39', 'time_quarter_40', 'time_quarter_41', 'time_quarter_42', 'time_quarter_43', 'time_quarter_44', 'time_quarter_45', 'time_quarter_46', 'time_quarter_47', 'time_quarter_48', 'time_quarter_49', 'time_quarter_50', 'time_quarter_51', 'time_quarter_52', 'time_quarter_53', 'time_quarter_54', 'time_quarter_55', 'time_quarter_56', 'time_quarter_57', 'time_quarter_58', 'time_quarter_59', 'time_quarter_60', 'time_quarter_61', 'time_quarter_62', 'time_quarter_63', 'time_quarter_64', 'time_quarter_65', 'time_quarter_66', 'time_quarter_67', 'time_quarter_68', 'time_quarter_69', 'time_quarter_70', 'time_quarter_71', 'time_quarter_72', 'time_quarter_73', 'time_quarter_74', 'time_quarter_75', 'time_quarter_76', 'time_quarter_77', 'time_quarter_78', 'time_quarter_79', 'time_quarter_80', 'time_quarter_81', 'time_quarter_82', 'time_quarter_83', 'time_quarter_84', 'time_quarter_85', 'time_quarter_86', 'time_quarter_87', 'time_quarter_88', 'time_quarter_89', 'time_quarter_90', 'time_quarter_91', 'time_quarter_92', 'time_quarter_93', 'time_quarter_94', 'time_quarter_95']]
print(len(set(df_smk0['person_id'])))
# Get rid of the participants with only one cluster label
pids = list(set(df_smk0['person_id']))
df_smk1 = pd.DataFrame([], columns = df_smk0.columns)
for idx,pid in enumerate(tqdm(pids)):
    df_temp=df_smk0[df_smk0['person_id']==pid]
    set_len_cluster = len(set(df_temp['cluster_label']))
    if set_len_cluster<=1:
        continue
    df_smk1 = pd.concat([df_smk1,df_temp],axis=0)
print(len(set(df_smk1['person_id'])))

time_quarter_columns = [f"time_quarter_{i}" for i in range(96)]

results_data=[]
for col in tqdm(['day_of_week', 'is_weekend', 'season']+time_quarter_columns):
    formula = f"cluster_label ~ {col}"
    supp = len(df_smk1[df_smk1[col]!=0])
    try:
        # Fit the mixed-effects model
        model = mixedlm(formula, df_smk1, groups=df_smk1["person_id"])
        result = model.fit()
        # Extract coefficients and confidence intervals for each model
        coef_summary = dict(result.summary().tables[1].iloc[1,:])  # Skip header row
        coef_data = {
            'Variable': col,
            'Support': supp, 
            'Coef.': float(coef_summary['Coef.']),
            'Std.Err.': float(coef_summary['Std.Err.']),
            'z': float(coef_summary['z']),
            'P>|z|': float(coef_summary['P>|z|']),
            '0.025': float(coef_summary['[0.025']),
            '0.975': float(coef_summary['0.975]'])
        }
        results_data.append(coef_data)

    except Exception as e:
        print(f"Model fitting failed for {col}: {e}")
        coef_data = {
            'Variable': col,
            'Support': supp, 
            'Coef.': np.nan,
            'Std.Err.': np.nan,
            'z': np.nan,
            'P>|z|': np.nan,
            '0.025': np.nan,
            '0.975': np.nan
        }
        results_data.append(coef_data)

# Convert the results into a DataFrame
results_df = pd.DataFrame(results_data)

results_df.head(20)

46


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 46/46 [00:00<00:00, 797.70it/s]


0


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 99/99 [00:00<00:00, 1084.76it/s]

Model fitting failed for day_of_week: negative dimensions are not allowed
Model fitting failed for is_weekend: negative dimensions are not allowed
Model fitting failed for season: negative dimensions are not allowed
Model fitting failed for time_quarter_0: negative dimensions are not allowed
Model fitting failed for time_quarter_1: negative dimensions are not allowed
Model fitting failed for time_quarter_2: negative dimensions are not allowed
Model fitting failed for time_quarter_3: negative dimensions are not allowed
Model fitting failed for time_quarter_4: negative dimensions are not allowed
Model fitting failed for time_quarter_5: negative dimensions are not allowed
Model fitting failed for time_quarter_6: negative dimensions are not allowed
Model fitting failed for time_quarter_7: negative dimensions are not allowed
Model fitting failed for time_quarter_8: negative dimensions are not allowed
Model fitting failed for time_quarter_9: negative dimensions are not allowed
Model fitting 




Unnamed: 0,Variable,Support,Coef.,Std.Err.,z,P>|z|,0.025,0.975
0,day_of_week,0,,,,,,
1,is_weekend,0,,,,,,
2,season,0,,,,,,
3,time_quarter_0,0,,,,,,
4,time_quarter_1,0,,,,,,
5,time_quarter_2,0,,,,,,
6,time_quarter_3,0,,,,,,
7,time_quarter_4,0,,,,,,
8,time_quarter_5,0,,,,,,
9,time_quarter_6,0,,,,,,


## Logistic Regression
Examplified with 15 minutes HALF_DURATION, i.e. one smoking event will last 30 minutes, centered the datetime point of the smoker's report.

In [33]:
features_all = ['day_of_week', 'is_weekend', 'season', 'time_quarter_0', 'time_quarter_1', 'time_quarter_2', 'time_quarter_3', 'time_quarter_4', 'time_quarter_5', 'time_quarter_6', 'time_quarter_7', 'time_quarter_8', 'time_quarter_9', 'time_quarter_10', 'time_quarter_11', 'time_quarter_12', 'time_quarter_13', 'time_quarter_14', 'time_quarter_15', 'time_quarter_16', 'time_quarter_17', 'time_quarter_18', 'time_quarter_19', 'time_quarter_20', 'time_quarter_21', 'time_quarter_22', 'time_quarter_23', 'time_quarter_24', 'time_quarter_25', 'time_quarter_26', 'time_quarter_27', 'time_quarter_28', 'time_quarter_29', 'time_quarter_30', 'time_quarter_31', 'time_quarter_32', 'time_quarter_33', 'time_quarter_34', 'time_quarter_35', 'time_quarter_36', 'time_quarter_37', 'time_quarter_38', 'time_quarter_39', 'time_quarter_40', 'time_quarter_41', 'time_quarter_42', 'time_quarter_43', 'time_quarter_44', 'time_quarter_45', 'time_quarter_46', 'time_quarter_47', 'time_quarter_48', 'time_quarter_49', 'time_quarter_50', 'time_quarter_51', 'time_quarter_52', 'time_quarter_53', 'time_quarter_54', 'time_quarter_55', 'time_quarter_56', 'time_quarter_57', 'time_quarter_58', 'time_quarter_59', 'time_quarter_60', 'time_quarter_61', 'time_quarter_62', 'time_quarter_63', 'time_quarter_64', 'time_quarter_65', 'time_quarter_66', 'time_quarter_67', 'time_quarter_68', 'time_quarter_69', 'time_quarter_70', 'time_quarter_71', 'time_quarter_72', 'time_quarter_73', 'time_quarter_74', 'time_quarter_75', 'time_quarter_76', 'time_quarter_77', 'time_quarter_78', 'time_quarter_79', 'time_quarter_80', 'time_quarter_81', 'time_quarter_82', 'time_quarter_83', 'time_quarter_84', 'time_quarter_85', 'time_quarter_86', 'time_quarter_87', 'time_quarter_88', 'time_quarter_89', 'time_quarter_90', 'time_quarter_91', 'time_quarter_92', 'time_quarter_93', 'time_quarter_94', 'time_quarter_95'
                , 'cluster_label']
features_wo_location = ['day_of_week', 'is_weekend', 'season', 'time_quarter_0', 'time_quarter_1', 'time_quarter_2', 'time_quarter_3', 'time_quarter_4', 'time_quarter_5', 'time_quarter_6', 'time_quarter_7', 'time_quarter_8', 'time_quarter_9', 'time_quarter_10', 'time_quarter_11', 'time_quarter_12', 'time_quarter_13', 'time_quarter_14', 'time_quarter_15', 'time_quarter_16', 'time_quarter_17', 'time_quarter_18', 'time_quarter_19', 'time_quarter_20', 'time_quarter_21', 'time_quarter_22', 'time_quarter_23', 'time_quarter_24', 'time_quarter_25', 'time_quarter_26', 'time_quarter_27', 'time_quarter_28', 'time_quarter_29', 'time_quarter_30', 'time_quarter_31', 'time_quarter_32', 'time_quarter_33', 'time_quarter_34', 'time_quarter_35', 'time_quarter_36', 'time_quarter_37', 'time_quarter_38', 'time_quarter_39', 'time_quarter_40', 'time_quarter_41', 'time_quarter_42', 'time_quarter_43', 'time_quarter_44', 'time_quarter_45', 'time_quarter_46', 'time_quarter_47', 'time_quarter_48', 'time_quarter_49', 'time_quarter_50', 'time_quarter_51', 'time_quarter_52', 'time_quarter_53', 'time_quarter_54', 'time_quarter_55', 'time_quarter_56', 'time_quarter_57', 'time_quarter_58', 'time_quarter_59', 'time_quarter_60', 'time_quarter_61', 'time_quarter_62', 'time_quarter_63', 'time_quarter_64', 'time_quarter_65', 'time_quarter_66', 'time_quarter_67', 'time_quarter_68', 'time_quarter_69', 'time_quarter_70', 'time_quarter_71', 'time_quarter_72', 'time_quarter_73', 'time_quarter_74', 'time_quarter_75', 'time_quarter_76', 'time_quarter_77', 'time_quarter_78', 'time_quarter_79', 'time_quarter_80', 'time_quarter_81', 'time_quarter_82', 'time_quarter_83', 'time_quarter_84', 'time_quarter_85', 'time_quarter_86', 'time_quarter_87', 'time_quarter_88', 'time_quarter_89', 'time_quarter_90', 'time_quarter_91', 'time_quarter_92', 'time_quarter_93', 'time_quarter_94', 'time_quarter_95']
features_wo_time = ['day_of_week', 'is_weekend', 'season', 'cluster_label']
after_covid_ids = list(set(df_modeling[df_modeling['is_after_covid']==1]))
def process_feature_importance_structure(raw_feature_importance, person_id):
    feature_importance_t = raw_feature_importance.set_index('feature').T.reset_index().rename(columns={'index':'person_id'})
    feature_importance_t['person_id'] = person_id
    feature_importance_tt = feature_importance_t.reset_index(drop=True)
    feature_importance_tt.columns.names=[None]
    return feature_importance_tt

In [34]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, accuracy_score, balanced_accuracy_score
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline
from sklearn.metrics import make_scorer

person_ids = list(set(df_modeling['person_id']))

def custom_scoring(y_true, y_pred, y_pred_prob):
    cm = confusion_matrix(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    acc = accuracy_score(y_true, y_pred)
    balanced_acc = balanced_accuracy_score(y_true, y_pred)
    ras = roc_auc_score(y_true, y_pred_prob)
    return cm.flatten().tolist() + [macro_f1, ras, acc, balanced_acc]

feature_importances_all = pd.DataFrame([], columns=['person_id'] + features_all)
feature_importances_wo_location = pd.DataFrame([], columns=['person_id'] + features_wo_location)
feature_importances_wo_time = pd.DataFrame([], columns=['person_id'] + features_wo_time)

eva_columns = ['person_id', 'Is_after_covid','reported_case_counts', 'sample_size', 'positive_cases',
               'all_features_confusion_matrix_00', 'all_features_confusion_matrix_01',
                'all_features_confusion_matrix_10', 'all_features_confusion_matrix_11',
                'all_features_macro_f1', 'all_features_roc_auc_score', 'all_features_accuracy', 'all_features_balanced_accuracy',
                'wo_location_confusion_matrix_00', 'wo_location_confusion_matrix_01',
                'wo_location_confusion_matrix_10', 'wo_location_confusion_matrix_11',
                'wo_location_macro_f1', 'wo_location_roc_auc_score', 'wo_location_accuracy', 'wo_location_balanced_accuracy',
                'wo_time_confusion_matrix_00', 'wo_time_confusion_matrix_01',
                'wo_time_confusion_matrix_10', 'wo_time_confusion_matrix_11',
                'wo_time_macro_f1', 'wo_time_roc_auc_score', 'wo_time_accuracy', 'wo_location_balanced_accuracy']
evaluations = pd.DataFrame([], columns= eva_columns)
for idx,p in enumerate(tqdm(person_ids)):
    ll = len(df_merged_2[df_merged_2['person_id']==p])
    # '3n49pu7','3wqwm47', # Too few samples to skip (*2), when 10min
    # 'i4109ac' when both 10min and 15min, after last cigareet extraction
    if p in ['n4h80yi', 'i5v361b',  '4u8ihdn', '006', 'tmz8lam', 'i4109ac']: # Too few samples to skip (*2); Only have one class in outcome (*3);
        continue
    # 1. All features
    X = df_modeling[df_modeling['person_id']==p].drop(columns=['person_id', 'substance'])
    y = df_modeling[df_modeling['person_id']==p]['substance']
    if p in after_covid_ids:
        iac = 1
    else:
        iac = 0
    evaluations_list = [p, iac, ll, len(y), dict(Counter(y))[1]]
#     # Split the data into train and test sets
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=23, stratify=y)
#     # Handle imbalance using SMOTE (Synthetic Minority Over-sampling Technique)
#     smote = SMOTE(random_state=42)
#     X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
#     # Train the logistic regression model
#     model = LogisticRegression(max_iter=3000, class_weight='balanced')
#     model.fit(X_train_sm, y_train_sm)
#     # Make predictions
    # Create a stratified k-fold object. k=5 here
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Define a pipeline with SMOTE and Logistic Regression
    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('logreg', LogisticRegression(max_iter=3000, class_weight='balanced'))
    ])
    
    y_pred = cross_val_predict(pipeline, X, y, cv=cv, method='predict')
    y_pred_prob = cross_val_predict(pipeline, X, y, cv=cv, method='predict_proba')[:, 1]

#     # Evaluate the model
#     cm = confusion_matrix(y_test, y_pred)
#     evaluations_list+=list(cm.flatten())
#     cr = classification_report(y_test, y_pred)
#     macro_f1 = f1_score(y_test, y_pred, average='macro')
#     acc, balanced_acc = accuracy_score(y_test, y_pred), balanced_accuracy_score(y_test, y_pred)
#     ras = roc_auc_score(y_test, y_pred_prob)
#     evaluations_list+=[macro_f1, ras, acc, balanced_acc]
#     # Feature importances
#     feature_importance = pd.DataFrame({'feature': X.columns, 'importance': model.coef_[0]})
#     tmp_feature_importances_all = process_feature_importance_structure(feature_importance, p)
#     feature_importances_all = pd.concat([feature_importances_all, tmp_feature_importances_all], axis=0)
    
    # Evaluate the model using custom scoring
    evaluation_metrics = custom_scoring(y, y_pred, y_pred_prob)
    evaluations_list += evaluation_metrics

    # Feature importances using the pipeline
    pipeline.fit(X, y)  # Fit the pipeline once to get feature importances
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': pipeline.named_steps['logreg'].coef_[0]
    })

    # Process feature importances
    tmp_feature_importances_all = process_feature_importance_structure(feature_importance, p)
    feature_importances_all = pd.concat([feature_importances_all, tmp_feature_importances_all], axis=0)
    
    # 2. Without location information
    X = df_modeling[df_modeling['person_id']==p].drop(columns=['person_id', 'substance', 
#                                                                'longitude', 'latitude',
                                                               'cluster_label'])
    y = df_modeling[df_modeling['person_id']==p]['substance']
#     # Split the data into train and test sets
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=23, stratify=y)
#     # Handle imbalance using SMOTE (Synthetic Minority Over-sampling Technique)
#     smote = SMOTE(random_state=42)
#     X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
#     # Train the logistic regression model
#     model = LogisticRegression(max_iter=3000, class_weight='balanced')
#     model.fit(X_train_sm, y_train_sm)
#     # Make predictions
#     y_pred = model.predict(X_test)
#     y_pred_prob = model.predict_proba(X_test)[:, 1]
#     # Evaluate the model
#     cm = confusion_matrix(y_test, y_pred)
#     evaluations_list+=list(cm.flatten())
#     cr = classification_report(y_test, y_pred)
#     macro_f1 = f1_score(y_test, y_pred, average='macro')
#     acc, balanced_acc = accuracy_score(y_test, y_pred), balanced_accuracy_score(y_test, y_pred)
#     ras = roc_auc_score(y_test, y_pred_prob)
#     evaluations_list+=[macro_f1, ras, acc, balanced_acc]
#     # Feature importances
#     feature_importance = pd.DataFrame({'feature': X.columns, 'importance': model.coef_[0]})
#     tmp_feature_importances_wo_location = process_feature_importance_structure(feature_importance, p)
#     feature_importances_wo_location = pd.concat([feature_importances_wo_location, tmp_feature_importances_wo_location], axis=0)
    
    # Create a stratified k-fold object. k=5 here
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Define a pipeline with SMOTE and Logistic Regression
    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('logreg', LogisticRegression(max_iter=3000, class_weight='balanced'))
    ])
    
    y_pred = cross_val_predict(pipeline, X, y, cv=cv, method='predict')
    y_pred_prob = cross_val_predict(pipeline, X, y, cv=cv, method='predict_proba')[:, 1]
    # Evaluate the model using custom scoring
    evaluation_metrics = custom_scoring(y, y_pred, y_pred_prob)
    evaluations_list += evaluation_metrics

    # Feature importances using the pipeline
    pipeline.fit(X, y)  # Fit the pipeline once to get feature importances
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': pipeline.named_steps['logreg'].coef_[0]
    })

    # Process feature importances
    tmp_feature_importances_wo_location = process_feature_importance_structure(feature_importance, p)
    feature_importances_wo_location = pd.concat([feature_importances_wo_location, tmp_feature_importances_wo_location], axis=0)
    
    
    # 3. Without time information
    X = df_modeling[df_modeling['person_id']==p].drop(columns=['person_id', 'substance', 
                                                               'time_quarter_0', 'time_quarter_1', 'time_quarter_2', 'time_quarter_3', 
                                                               'time_quarter_4', 'time_quarter_5', 'time_quarter_6', 'time_quarter_7', 
                                                               'time_quarter_8', 'time_quarter_9', 'time_quarter_10', 'time_quarter_11', 
                                                               'time_quarter_12', 'time_quarter_13', 'time_quarter_14', 'time_quarter_15', 
                                                               'time_quarter_16', 'time_quarter_17', 'time_quarter_18', 'time_quarter_19', 
                                                               'time_quarter_20', 'time_quarter_21', 'time_quarter_22', 'time_quarter_23', 
                                                               'time_quarter_24', 'time_quarter_25', 'time_quarter_26', 'time_quarter_27', 
                                                               'time_quarter_28', 'time_quarter_29', 'time_quarter_30', 'time_quarter_31', 
                                                               'time_quarter_32', 'time_quarter_33', 'time_quarter_34', 'time_quarter_35', 
                                                               'time_quarter_36', 'time_quarter_37', 'time_quarter_38', 'time_quarter_39', 
                                                               'time_quarter_40', 'time_quarter_41', 'time_quarter_42', 'time_quarter_43', 
                                                               'time_quarter_44', 'time_quarter_45', 'time_quarter_46', 'time_quarter_47', 
                                                               'time_quarter_48', 'time_quarter_49', 'time_quarter_50', 'time_quarter_51', 
                                                               'time_quarter_52', 'time_quarter_53', 'time_quarter_54', 'time_quarter_55', 
                                                               'time_quarter_56', 'time_quarter_57', 'time_quarter_58', 'time_quarter_59', 
                                                               'time_quarter_60', 'time_quarter_61', 'time_quarter_62', 'time_quarter_63', 
                                                               'time_quarter_64', 'time_quarter_65', 'time_quarter_66', 'time_quarter_67', 
                                                               'time_quarter_68', 'time_quarter_69', 'time_quarter_70', 'time_quarter_71', 
                                                               'time_quarter_72', 'time_quarter_73', 'time_quarter_74', 'time_quarter_75', 
                                                               'time_quarter_76', 'time_quarter_77', 'time_quarter_78', 'time_quarter_79', 
                                                               'time_quarter_80', 'time_quarter_81', 'time_quarter_82', 'time_quarter_83', 
                                                               'time_quarter_84', 'time_quarter_85', 'time_quarter_86', 'time_quarter_87', 
                                                               'time_quarter_88', 'time_quarter_89', 'time_quarter_90', 'time_quarter_91', 
                                                               'time_quarter_92', 'time_quarter_93', 'time_quarter_94', 'time_quarter_95'])

    y = df_modeling[df_modeling['person_id']==p]['substance']
#     # Split the data into train and test sets
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=23, stratify=y)
#     # Handle imbalance using SMOTE (Synthetic Minority Over-sampling Technique)
#     smote = SMOTE(random_state=42)
#     X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
#     # Train the logistic regression model
#     model = LogisticRegression(max_iter=3000, class_weight='balanced')
#     model.fit(X_train_sm, y_train_sm)
#     # Make predictions
#     y_pred = model.predict(X_test)
#     y_pred_prob = model.predict_proba(X_test)[:, 1]
#     # Evaluate the model
#     cm = confusion_matrix(y_test, y_pred)
#     evaluations_list+=list(cm.flatten())
#     cr = classification_report(y_test, y_pred)
#     macro_f1 = f1_score(y_test, y_pred, average='macro')
#     acc, balanced_acc = accuracy_score(y_test, y_pred), balanced_accuracy_score(y_test, y_pred)
#     ras = roc_auc_score(y_test, y_pred_prob)
#     evaluations_list+=[macro_f1, ras, acc, balanced_acc]
#     # Feature importances
#     feature_importance = pd.DataFrame({'feature': X.columns, 'importance': model.coef_[0]})
#     tmp_feature_importances_wo_time = process_feature_importance_structure(feature_importance, p)
#     feature_importances_wo_time = pd.concat([feature_importances_wo_time, tmp_feature_importances_wo_time], axis=0)
    
    # Create a stratified k-fold object. k=5 here
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Define a pipeline with SMOTE and Logistic Regression
    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('logreg', LogisticRegression(max_iter=3000, class_weight='balanced'))
    ])
    
    y_pred = cross_val_predict(pipeline, X, y, cv=cv, method='predict')
    y_pred_prob = cross_val_predict(pipeline, X, y, cv=cv, method='predict_proba')[:, 1]
    # Evaluate the model using custom scoring
    evaluation_metrics = custom_scoring(y, y_pred, y_pred_prob)
    evaluations_list += evaluation_metrics

    # Feature importances using the pipeline
    pipeline.fit(X, y)  # Fit the pipeline once to get feature importances
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': pipeline.named_steps['logreg'].coef_[0]
    })

    # Process feature importances
    tmp_feature_importances_wo_time = process_feature_importance_structure(feature_importance, p)
    feature_importances_wo_time = pd.concat([feature_importances_wo_time, tmp_feature_importances_wo_time], axis=0)
    
    df_tmp_eva = pd.DataFrame([evaluations_list], columns=eva_columns)
    evaluations = pd.concat([evaluations, df_tmp_eva], axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 46/46 [00:46<00:00,  1.01s/it]


In [36]:
suffix = "_15_min"

In [37]:
feature_importances_all.to_excel(f'./synthetic_data_5-fold_CV/synthetic_data_feature_importances_all_logistic_regression{suffix}.xlsx', index=False)   
feature_importances_wo_location.to_excel(f'./synthetic_data_5-fold_CV/synthetic_data_feature_importances_wo_location_logistic_regression{suffix}.xlsx', index=False)
feature_importances_wo_time.to_excel(f'./synthetic_data_5-fold_CV/synthetic_data_feature_importances_wo_time_logistic_regression{suffix}.xlsx', index=False) 
evaluations.to_excel(f'./synthetic_data_5-fold_CV/synthetic_data_Evaluation_metrics_logistic_regression{suffix}.xlsx', index=False)