In [1]:
#import essentials
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
class data_generator:
  """Generate synthetic, year-over-year records from a base table.

  The base table (loaded from CSV) holds one risk label per row in the target
  column and comma-separated option lists in every other column. Each
  generated row picks a risk label and then samples a few options per feature
  column from the first base row carrying that label. For years after the
  first, a row's risk is drawn from the neighbours of its previous-year risk
  in ``data_graph``.

  All randomness comes from NumPy's global ``np.random`` state, so call
  ``np.random.seed(...)`` beforehand for reproducible output.
  """

  def __init__(self, path):
    """Load the base table and set up the risk-transition graph.

    Args:
      path: Anything ``pandas.read_csv`` accepts (file path, URL or
        file-like object).
    """
    # loading the base data
    self.data = pd.read_csv(path)
    self.columns = self.data.columns

    # initializing the "mental risk" graph:
    # risk label -> risk labels reachable in the following year
    self.data_graph = {
        'mentally healthy':['decreased life enjoyment','relationship difficulties','slightly disturbed'],
        'decreased life enjoyment' : ['anxiety','relationship difficulties','mentally healthy'],
        'slightly disturbed' : ['dissociation','loss of productivity','emotionally weak','mentally healthy'],
        'emotionally weak' : ['relationship difficulties','depression','slightly disturbed','mentally healthy'],
        'relationship difficulties' : ['self harm','violence','dissociation'],
        'anxiety' : ['depression','insomnia','self harm','violence','burnout'],
        'depression' : ['anxiety','self harm','dissociation','loss of productivity'],
        'loss of productivity' : ['depression','insomnia','burnout'],
        'insomnia' : ['anxiety','burnout','loss of productivity'],
        'burnout' : ['insomnia','anxiety','loss of productivity'],
        'dissociation' : ['depression','relationship difficulties'],
        'self harm' : ['relationship difficulties','depression','anxiety'],
        'violence' : ['anxiety','relationship difficulties']
    }

  def __get_list__(self,string,bounds):
    """Return a random comma-separated sample of a comma-separated string.

    Args:
      string: Comma-separated options, e.g. ``"a,b,c"``.
      bounds: Two-element sequence ``(low, high)``. The number of drawn items
        is ``np.random.randint(low, high)``, i.e. ``high`` is exclusive.

    Returns:
      The drawn items joined by ``","``. Items are drawn independently, so
      the same option may appear more than once.
    """
    options = string.split(",")
    count = np.random.randint(bounds[0],bounds[1])
    # One independent draw per item (sampling with replacement).
    return ",".join(str(np.random.choice(options)) for _ in range(count))

  # get risk based on previous risks
  def __get_risk__(self,prev_df,row,target):
    """Pick this year's risk for ``row`` from the neighbours of last year's.

    Args:
      prev_df: DataFrame generated for the previous year.
      row: Positional row index into ``prev_df``.
      target: Name of the risk column.

    Returns:
      One label chosen uniformly from ``data_graph[previous risk]``.

    Raises:
      KeyError: If the previous risk has no entry in ``data_graph``.
    """
    prev_risk = prev_df.iloc[row][target]
    return np.random.choice(self.data_graph[prev_risk])

  def get_data(self, num_entries=500, num_years=5, target='Mental Risks', bounds=(2, 5)):
    """Generate one DataFrame of synthetic records per year.

    Args:
      num_entries: Number of rows per generated year.
      num_years: Number of yearly DataFrames to generate.
      target: Name of the risk column in the base table.
      bounds: ``(low, high)`` passed to ``__get_list__`` for every feature
        cell; ``high`` is exclusive, so the default yields 2 to 4 items.

    Returns:
      A list of ``num_years`` DataFrames, each with ``num_entries`` rows and
      the same columns (in the same order) as the base table. Row ``r`` of
      year ``k`` follows row ``r`` of year ``k - 1``.

    Raises:
      IndexError: If a drawn risk label does not occur in the base table's
        target column.
    """
    # Every column except the target is a feature column; this does not rely
    # on the target being the last column of the base table.
    feature_cols = [col for col in self.data.columns if col != target]

    # Position of the first base row for each risk label, computed once
    # instead of scanning the base table for every generated row.
    first_row_of = {}
    for pos, label in enumerate(self.data[target]):
      first_row_of.setdefault(label, pos)

    output = list()

    for year in range(num_years):
      records = []

      for row in range(num_entries):
        # initializing the "Mental Risk" value
        if year == 0:
          risk = np.random.choice(self.data[target])
        else:
          risk = self.__get_risk__(output[year - 1],row,target)

        if risk not in first_row_of:
          raise IndexError(
              "risk %r does not occur in column %r of the base data" % (risk, target))
        base_row = self.data.iloc[first_row_of[risk]]

        # choose features from risk
        record = {target: risk}
        for col in feature_cols:
          record[col] = self.__get_list__(base_row[col],bounds)
        records.append(record)

      # Build the frame in one go; assigning through df.iloc[row][col] is
      # chained indexing and may write to a temporary copy instead of df.
      output.append(pd.DataFrame(records, columns=self.data.columns))

    return output # return list of generated dataframes



In [3]:
# The generator draws from NumPy's global random state; seed it so that
# Restart Kernel -> Run All reproduces the same synthetic data.
SEED = 42
np.random.seed(SEED)

# NOTE(review): "/content/" is presumably a Colab upload location - adjust
# this path when running elsewhere.
BASE_DATA_PATH = "/content/Mental_health_base.csv"

generator = data_generator(BASE_DATA_PATH)
output = generator.get_data()  # list of DataFrames, one per generated year


In [4]:
# Write each generated year to its own CSV file; file names are numbered
# from 1 in the order the frames appear in `output`.
for year_number, year_frame in enumerate(output, start=1):
  csv_name = "mental_health_data_year-" + str(year_number) + ".csv"
  year_frame.to_csv(csv_name, index=False)