In [1]:
!pip3 install statsmodels
!pip3 install vaderSentiment
!pip3 install nbconvert

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
import numpy as np
import pandas as pd

from datetime import date, timedelta
import datetime

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.lines import Line2D

from sklearn.preprocessing import LabelEncoder

import statsmodels.api as sm
import statsmodels.formula.api as smf
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import csv

In [3]:
def convertToTime(row, columnName):
    return datetime.datetime.strptime(row[columnName], "%Y-%m-%d").date()

def convertToDate(row, columnName):
    return datetime.datetime.strptime(row[columnName], "%Y-%m-%d %H:%M:%S").date()

def getDays(row, beginColumnName, endColumnName):
    v = datetime.datetime.strptime(row[endColumnName], "%Y-%m-%d").date() - datetime.datetime.strptime(row[beginColumnName], "%Y-%m-%d").date()
    return v.days

def lookup_index(row, columnName, array):
    if(row[columnName] not in array):
        return -1
    return array.index(row[columnName]) + 1

def colour_life_events(row):
    colours = {'personal':'lightcoral', 'health':'orange', 'work':'lightgreen', 'financial':'teal', 'weather':'blueviolet', 'societal':'navy','other':'skyblue'}
    return colours[row['life_event_type']]

def remove_rows(base_df, other_df):
    modified_df = other_df.drop(other_df[other_df['snapshot_id'] not in base_df['snapshot_id'].values].index)
    return modified_df

def fix_signficance(row):
    if('significance' in row['valence']):
        return row['valence']
    else:
        return row['significance']

def fix_valence(row):
    if('significance' in row['significance']):
        return row['valence']
    else:
        return row['significance']  

def get_broad_category(row, categories, column_name):
    if(row[column_name] in categories):
        return categories[row[column_name]]
    return "UNKNOWN"
    
def compute_sentiment(row):
    post = row['Text']
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(post)
    sentiment = 0
    if (vs["neu"]>0.8):
        sentiment = 0
    elif (vs["pos"]==vs["neg"]):
        sentiment = 0
    elif (vs["pos"]>vs["neg"]):
        sentiment = 1
    elif (vs["neg"]>vs["neu"]):
        sentiment = -1
    return sentiment

def convert_valence_to_sentiment(row):
    valence = row['valence']
    retVal = 0
    if (valence == 'Neither Positive or Negative'):
        retVal = 0
    elif("Positive" in valence):
        retVal = 1
    elif("Negative" in valence):
        retVal = -1
    return retVal


In [4]:
def load_demographics_data():
    demographics_data = pd.read_csv('data/igtbs_demographics_complete.csv', parse_dates=True)
    demographics_data = demographics_data[['age','gender','snapshot_id', 'shipley.vocab', 'shipley.abs', 'openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism','pos.affect','neg.affect','stai.trait']]
    return demographics_data

In [5]:
def load_self_reported_categories():
    df_self_reported_categories = pd.read_csv('data/Life Events Categories Mapping - Self-Reported Categories.csv')
    return df_self_reported_categories

def load_life_events_data():
    df_life_events = pd.read_csv('data/Superimposed/LifeEvents_Curated_non_blinded.csv', parse_dates=True)
    df_life_events = df_life_events[['snapshot_id', 'description','UpdatedBeginDate', 'UpdatedEndDate', 'date_confidence','life_event_type', 'work_perf_impact', 'significance','valence', 'ended_or_ongoing']]
    df_life_events = df_life_events.drop(df_life_events[((df_life_events['UpdatedBeginDate'].isnull() == True) | (df_life_events['UpdatedEndDate'].isnull() == True))].index)
    df_life_events.replace({'valence': {np.nan: 'Neither Positive or Negative'}, 'significance': {np.nan: 'Neither Positive or Negative'}, 'date_confidence': {np.nan: 'Moderate confidence'}}, inplace=True)

    df_life_events['num_of_days'] = df_life_events.apply(getDays, endColumnName='UpdatedEndDate', beginColumnName='UpdatedBeginDate', axis=1)
    df_life_events['UpdatedBeginDate_time'] = df_life_events.apply(convertToTime, columnName='UpdatedBeginDate', axis=1)
    df_life_events['colour'] = df_life_events.apply(colour_life_events, axis=1)
    df_life_events['fixed_signficance'] = df_life_events.apply(fix_signficance, axis = 1)
    df_life_events['fixed_valence'] = df_life_events.apply(fix_valence, axis = 1)
    df_life_events = df_life_events.drop(columns = ['valence', 'significance'])
    df_life_events = df_life_events.rename(columns={"fixed_signficance": "significance", "fixed_valence": "valence"})
    df_life_events['sentiment'] = df_life_events.apply(convert_valence_to_sentiment, axis=1)
    df_life_events = df_life_events[['snapshot_id', 'description', 'UpdatedBeginDate', 'UpdatedEndDate', 'significance', 'sentiment']]
    
    le_significance = LabelEncoder()
    le_significance.fit(df_life_events['significance'].values)
    
    df_life_events['significance_label'] = df_life_events.apply(lambda x: le_significance.transform([x['significance']])[0], axis=1)

    df_life_events = df_life_events.drop(columns=['significance'])

    return df_life_events

def merge_life_event_with_reported_categories(df_life_events, df_self_reported_categories):
    life_events_with_categories = pd.merge(df_life_events, df_self_reported_categories, how="inner", left_on="description", right_on="SR_LifeEvent")
    life_events_with_categories = life_events_with_categories.drop(columns=['description', 'SR_LifeEvent', 'LifeEventFinal', 'LifeEventFamily2'])

    return life_events_with_categories

In [6]:
def load_social_media_categories():
    df_social_media_categories = pd.read_csv('data/Life Events Categories Mapping - Social Media Categories-2.csv')
    return df_social_media_categories

def load_social_media_data():
    df_social_media_data = pd.read_csv('data/Superimposed/Facebook Data For Life Events-Combined - FB Data.csv')
    df_social_media_data = df_social_media_data[['snapshot_id', 'date_enrolled', 'created_time', 'Text', 'final_life_event_category_2']]
    df_social_media_data = df_social_media_data.replace({'PostiveMove':'PositiveMove', 'Negative Move':'NegativeMove'})
    df_social_media_data = df_social_media_data.drop(df_social_media_data[((df_social_media_data['final_life_event_category_2'].isnull() == True))].index)
    df_social_media_data['created_date'] = df_social_media_data.apply(convertToDate, columnName='created_time', axis=1)
    df_social_media_data['sentiment'] = df_social_media_data.apply(compute_sentiment, axis=1)
    df_social_media_data = df_social_media_data.drop(columns=['date_enrolled','created_time','Text'])
    
    return df_social_media_data

def merge_social_media_data_with_categories(df_social_media_data, df_social_media_categories):
    df_social_media_data_with_categories = pd.merge(df_social_media_data, df_social_media_categories, how="inner", left_on='final_life_event_category_2', right_on='SM_LifeEvent')
    df_social_media_data_with_categories = df_social_media_data_with_categories.drop(columns=['final_life_event_category_2','SM_LifeEvent','LifeEventFamily2','Comments','SignificanceRank'])
    return df_social_media_data_with_categories

In [7]:
def load_dailies_data():
    df_dailies = pd.read_csv('data/Superimposed/dailies_scores.csv', low_memory=False)
    df_dailies = df_dailies[['snapshot_id','day', 'alc_status', 'alc.quantity.d', 'anxiety.d', 'pos.affect.d', 'neg.affect.d','sleep.d', 'stress.d']]
    df_dailies['day_time'] = df_dailies.apply(convertToTime, columnName='day', axis=1)
    
    return df_dailies

In [8]:
def calculate_mean_dailies_data(df_dailies, dependent_variable):
    df_dailies = df_dailies.drop(df_dailies[((df_dailies[dependent_variable].isnull()))].index)
    mean_by_snapshot_id_df = df_dailies.groupby('snapshot_id', as_index=False).mean()

    return mean_by_snapshot_id_df

In [9]:
def build_df_for_regression_life_events(df_life_events, df_demographics, mean_by_snapshot_id_df, df_dailies, dependent_variable):
    mean_value_df = mean_by_snapshot_id_df[['snapshot_id', dependent_variable]]
    merged_data = pd.merge(df_life_events, df_demographics, how="inner", on=["snapshot_id"])

    final_input_dataset = []

    # Use average values for a given snapshot id if the data for dependent variable is not available.
    # Since the life events span over a period of time, add one row each day of life event.
    for i in merged_data.values:
        end_date = datetime.datetime.strptime(i[2], "%Y-%m-%d").date()
        start_date = datetime.datetime.strptime(i[1], "%Y-%m-%d").date()
        days = (end_date - start_date).days
        avg_stress = list(mean_value_df[(mean_value_df['snapshot_id'] == i[0])][dependent_variable].values)
        for j in range(days):
            day = start_date+timedelta(days=j)
            x = list(df_dailies[((df_dailies['snapshot_id'] == i[0]) & (df_dailies['day_time'] == day))][dependent_variable])

            stres = 0
            actual_stress = 0
            if(len(x) != 0):
                stres = x[0]
                actual_stress = stres
            else:
                stres = avg_stress[0]
            s_d = []
            s_d.extend(i)
            s_d.append(actual_stress)
            s_d.append(stres)
            s_d.append(day)
            final_input_dataset.append(s_d)
                
    final_input_dataset = pd.DataFrame(final_input_dataset, columns=[
        'snapshot_id', 'UpdatedBeginDate', 'UpdatedEndDate', 'sentiment','significance_label',  'LifeEventFamily', 'Anticipation', 'Intimacy',
       'Scope',        'age','gender', 'shipley.vocab', 'shipley.abs', 'openness', 'conscientiousness',
           'extraversion', 'agreeableness', 'neuroticism', 'pos.affect',
           'neg.affect', 'stai.trait', 'stress.d', 'stress.average', 'exact_day'])     

    X_input = final_input_dataset.drop(columns=['snapshot_id', 'UpdatedBeginDate', 'UpdatedEndDate', 'exact_day', 'stress.d'])
    X_input = X_input.dropna()
    X_input = X_input.rename(columns={"stress.average":"stress", "shipley.vocab": "shipley_vocab", "shipley.abs":"shipley_abs", "pos.affect":"pos_affect", "neg.affect":"neg_affect", "stai.trait":"stai_trait"})

    return X_input

In [10]:
def build_df_for_regression_social_media(df_social_media, df_demographics, mean_by_snapshot_id_df, df_dailies, dependent_variable):
    mean_value_df = mean_by_snapshot_id_df[['snapshot_id', dependent_variable]]
    merged_data = pd.merge(df_social_media, df_demographics, how="inner", on=["snapshot_id"])

    final_input_dataset = []

    #Use average values for a given snapshot id if the data for dependent variable is not available.
    for i in merged_data.values:
        day = i[2]
        avg_stress = list(mean_value_df[(mean_value_df['snapshot_id'] == i[0])][dependent_variable].values)

        x = list(df_dailies[((df_dailies['snapshot_id'] == i[0]) & (df_dailies['day_time'] == day))][dependent_variable])
        stres = 0
        actual_stress = 0
        if(len(x) != 0):
            stres = x[0]
            actual_stress = stres
        else:
            stres = avg_stress[0]
        s_d = []
        s_d.extend(i)
        s_d.append(actual_stress)
        s_d.append(stres)
        s_d.append(day)
        final_input_dataset.append(s_d)
        
    final_input_dataset = pd.DataFrame(final_input_dataset, columns=[
        'snapshot_id', 'created_date','sentiment', 'LifeEventFamily',
       'Anticipation', 'Intimacy', 'Scope', 'age','gender', 'shipley.vocab', 'shipley.abs', 'openness', 'conscientiousness',
           'extraversion', 'agreeableness', 'neuroticism', 'pos.affect',
           'neg.affect', 'stai.trait', 'stress.d', 'stress.average', 'exact_day'])     
    
    X_input = final_input_dataset.drop(columns=['snapshot_id', 'created_date', 'exact_day', 'stress.d'])
    X_input = X_input.dropna()
    X_input = X_input.rename(columns={"stress.average":"stress", "shipley.vocab": "shipley_vocab", "shipley.abs":"shipley_abs", "pos.affect":"pos_affect", "neg.affect":"neg_affect", "stai.trait":"stai_trait"})
    
    return X_input

In [11]:
def regression_life_events(df_life_events, df_demographics, mean_by_snapshot_id_df, df_dailies, dependent_variable):
    X_input = build_df_for_regression_life_events(df_life_events, df_demographics, mean_by_snapshot_id_df, df_dailies, dependent_variable)
    print("Total Rows: ", len(X_input.values))

    mod = smf.ols(formula='stress~age + shipley_vocab + shipley_abs + openness + conscientiousness + extraversion + agreeableness + neuroticism + pos_affect + neg_affect + stai_trait + sentiment + Anticipation + Intimacy + Scope + LifeEventFamily + gender + significance_label', data=X_input)
    res = mod.fit()
    return res.summary()

In [12]:
def regression_social_media_events(df_social_media, df_demographics, mean_by_snapshot_id_df, df_dailies, dependent_variable):
    X_input = build_df_for_regression_social_media(df_social_media, df_demographics, mean_by_snapshot_id_df, df_dailies, dependent_variable)
    print("Total Rows: ", len(X_input.values))

    mod = smf.ols(formula='stress~age + shipley_vocab + shipley_abs + openness + conscientiousness + extraversion + agreeableness + neuroticism + pos_affect + neg_affect + stai_trait + sentiment + Anticipation + Intimacy + Scope + LifeEventFamily + gender', data=X_input)
    res = mod.fit()
    return res.summary()

In [13]:
def regression_combined_survey_social_media(df_life_events, df_social_media, df_demographics, mean_by_snapshot_id_df, df_dailies, dependent_variable):
    df_survey = build_df_for_regression_life_events(df_life_events, df_demographics, mean_by_snapshot_id_df, df_dailies, dependent_variable)
    df_social_media = build_df_for_regression_social_media(df_social_media, df_demographics, mean_by_snapshot_id_df, df_dailies, dependent_variable)
    
    df_survey['Data_Type'] = 'Survey'
    df_survey = df_survey[['sentiment', 'LifeEventFamily', 'Anticipation', 'Intimacy', 'Scope',
                           'age', 'gender', 'shipley_vocab', 'shipley_abs',
                           'openness', 'conscientiousness', 'extraversion', 'agreeableness',
                           'neuroticism', 'pos_affect', 'neg_affect', 'stai_trait', 'stress', 'Data_Type']]
    df_social_media['Data_Type'] = 'Social Media'
    df_social_media = df_social_media[['sentiment', 'LifeEventFamily', 'Anticipation', 'Intimacy', 'Scope',
                           'age', 'gender', 'shipley_vocab', 'shipley_abs',
                           'openness', 'conscientiousness', 'extraversion', 'agreeableness',
                           'neuroticism', 'pos_affect', 'neg_affect', 'stai_trait', 'stress', 'Data_Type']]
    
    X_input = pd.concat([df_survey, df_social_media])

    print("Total Rows: ", len(X_input.values))
    mod = smf.ols(formula='stress~age + shipley_vocab + shipley_abs + openness + conscientiousness + extraversion + agreeableness + neuroticism + pos_affect + neg_affect + stai_trait + sentiment + Anticipation + Intimacy + Scope + LifeEventFamily + gender + Data_Type', data=X_input)
    res = mod.fit()
    return res.summary()

In [14]:
def linear_regression(dependent_variable):
    #Load all the data
    df_dailies = load_dailies_data()
    mean_by_snapshot_id_df = calculate_mean_dailies_data(df_dailies, dependent_variable)
    
    df_social_media = merge_social_media_data_with_categories(load_social_media_data(), load_social_media_categories())
    
    df_life_events = load_life_events_data()
    df_self_report_categories = load_self_reported_categories()
    df_life_events_with_categories = merge_life_event_with_reported_categories(df_life_events, df_self_report_categories)
    
    df_demographics = load_demographics_data()
        
    #Run regression
    print("Regression based on life events!")
    print(regression_life_events(df_life_events_with_categories, df_demographics, mean_by_snapshot_id_df, df_dailies, dependent_variable))
    print()
    print("Regression using Social Media Events!")
    print(regression_social_media_events(df_social_media, df_demographics, mean_by_snapshot_id_df, df_dailies, dependent_variable))
    print()
    print("Regression using combined survey and social media data!")
    print(regression_combined_survey_social_media(df_life_events_with_categories, df_social_media, df_demographics, mean_by_snapshot_id_df, df_dailies, dependent_variable))

In [15]:
linear_regression('stress.d')

Regression based on life events!
Total Rows:  17087
                            OLS Regression Results                            
Dep. Variable:                 stress   R-squared:                       0.223
Model:                            OLS   Adj. R-squared:                  0.222
Method:                 Least Squares   F-statistic:                     223.0
Date:                Tue, 06 Sep 2022   Prob (F-statistic):               0.00
Time:                        22:24:59   Log-Likelihood:                -12536.
No. Observations:               17087   AIC:                         2.512e+04
Df Residuals:                   17064   BIC:                         2.530e+04
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------

Total Rows:  19067
                            OLS Regression Results                            
Dep. Variable:                 stress   R-squared:                       0.222
Model:                            OLS   Adj. R-squared:                  0.221
Method:                 Least Squares   F-statistic:                     246.9
Date:                Tue, 06 Sep 2022   Prob (F-statistic):               0.00
Time:                        22:25:58   Log-Likelihood:                -13799.
No. Observations:               19067   AIC:                         2.764e+04
Df Residuals:                   19044   BIC:                         2.783e+04
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------

In [16]:
linear_regression('sleep.d')

Regression based on life events!
Total Rows:  16165
                            OLS Regression Results                            
Dep. Variable:                 stress   R-squared:                       0.225
Model:                            OLS   Adj. R-squared:                  0.224
Method:                 Least Squares   F-statistic:                     213.0
Date:                Tue, 06 Sep 2022   Prob (F-statistic):               0.00
Time:                        22:27:04   Log-Likelihood:                -13871.
No. Observations:               16165   AIC:                         2.779e+04
Df Residuals:                   16142   BIC:                         2.796e+04
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------

Total Rows:  18145
                            OLS Regression Results                            
Dep. Variable:                 stress   R-squared:                       0.172
Model:                            OLS   Adj. R-squared:                  0.171
Method:                 Least Squares   F-statistic:                     171.7
Date:                Tue, 06 Sep 2022   Prob (F-statistic):               0.00
Time:                        22:28:03   Log-Likelihood:                -16248.
No. Observations:               18145   AIC:                         3.254e+04
Df Residuals:                   18122   BIC:                         3.272e+04
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------

In [17]:
linear_regression('anxiety.d')

Regression based on life events!
Total Rows:  17086
                            OLS Regression Results                            
Dep. Variable:                 stress   R-squared:                       0.320
Model:                            OLS   Adj. R-squared:                  0.319
Method:                 Least Squares   F-statistic:                     364.9
Date:                Tue, 06 Sep 2022   Prob (F-statistic):               0.00
Time:                        22:29:09   Log-Likelihood:                -10684.
No. Observations:               17086   AIC:                         2.141e+04
Df Residuals:                   17063   BIC:                         2.159e+04
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------

Total Rows:  19066
                            OLS Regression Results                            
Dep. Variable:                 stress   R-squared:                       0.308
Model:                            OLS   Adj. R-squared:                  0.308
Method:                 Least Squares   F-statistic:                     385.8
Date:                Tue, 06 Sep 2022   Prob (F-statistic):               0.00
Time:                        22:30:09   Log-Likelihood:                -11783.
No. Observations:               19066   AIC:                         2.361e+04
Df Residuals:                   19043   BIC:                         2.379e+04
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------