In [None]:
# Importing the necessary libraries
import pandas as pd
import numpy as np

## **Original Dataset**

In [None]:
# Load the original dataset from its CSV file and preview the first five rows
data = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/DAB304/post natal data.csv')
data.head(n=5)

In [None]:
# Summarise the frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503 entries, 0 to 1502
Data columns (total 11 columns):
 #   Column                                     Non-Null Count  Dtype 
---  ------                                     --------------  ----- 
 0   Timestamp                                  1503 non-null   object
 1   Age                                        1503 non-null   object
 2   Feeling sad or Tearful                     1503 non-null   object
 3   Irritable towards baby & partner           1497 non-null   object
 4   Trouble sleeping at night                  1503 non-null   object
 5   Problems concentrating or making decision  1491 non-null   object
 6   Overeating or loss of appetite             1503 non-null   object
 7   Feeling anxious                            1503 non-null   object
 8   Feeling of guilt                           1494 non-null   object
 9   Problems of bonding with baby              1503 non-null   object
 10  Suicide attempt                     

In [None]:
# Dropping the 'Timestamp' column from the dataset.
# errors='ignore' keeps this cell re-runnable: `data` is reassigned here, so a
# second execution would otherwise raise KeyError because the column is gone.
data = data.drop(columns='Timestamp', errors='ignore')
data.columns

In [None]:
# Per-column summary statistics of the dataset
data.describe()

Unnamed: 0,Timestamp,Age,Feeling sad or Tearful,Irritable towards baby & partner,Trouble sleeping at night,Problems concentrating or making decision,Overeating or loss of appetite,Feeling anxious,Feeling of guilt,Problems of bonding with baby,Suicide attempt
count,1503,1503,1503,1497,1503,1491,1503,1503,1494,1503,1503
unique,90,5,3,3,3,3,3,2,3,3,3
top,6/15/2022 22:24,40-45,Yes,Yes,Two or more days a week,No,No,Yes,No,No,No
freq,51,364,536,555,640,583,841,980,624,557,709


## **Data Augmentation using Bootstrapping**

In [None]:
# Number of bootstrap samples to generate
n_bootstrap_samples = 100

# Seeded generator so the resampling is reproducible from run to run
bootstrap_rng = np.random.default_rng(42)

# Each bootstrap sample has as many rows as `data`, drawn with replacement
bootstrap_samples = [
    data.sample(n=len(data), replace=True, random_state=bootstrap_rng)
    for _ in range(n_bootstrap_samples)
]

# Stack all samples in a single concat. DataFrame.append is deprecated (and
# removed in pandas 2.0), and appending inside the loop re-copies the growing
# frame on every iteration.
bootstrapped_data = pd.concat(bootstrap_samples, ignore_index=True)

# Printing the shape of the bootstrapped data to verify the increase in the number of records
print("Shape of bootstrapped data:", bootstrapped_data.shape)

  bootstrapped_data = bootstrapped_data.append(bootstrap_sample, ignore_index=True)


Shape of bootstrapped data: (120240, 11)


In [None]:
# Checking all the columns in the bootstrapped dataset.
# The frame produced by the bootstrapping cell is `bootstrapped_data`; the name
# `bootstrapped` is not assigned by any cell and raises NameError.
bootstrapped_data.columns

## **Data Processing for Visualization**

In [None]:
# Counting missing values in each column of the bootstrapped frame
# (`bootstrapped_data` is the name assigned by the bootstrapping cell)
missing_values_count = bootstrapped_data.isna().sum()

# Printing the count of missing values
print("Count of missing values in each column:")
print(missing_values_count)

In [None]:
# Converting categorical columns to object type so that a label outside the
# existing categories can be filled in below
for col in bootstrapped_data.select_dtypes(include='category').columns:
    bootstrapped_data[col] = bootstrapped_data[col].astype('object')

# Filling missing values with "Unknown"
bootstrapped_data = bootstrapped_data.fillna("Unknown")

# Summary of the filled DataFrame (non-null counts and dtypes)
bootstrapped_data.info()

In [None]:
# Re-count missing values per column in bootstrapped_data as a sanity check;
# once missing values have been filled, every count should be 0
bootstrapped_data.isna().sum()

Timestamp                                      0
Age                                            0
Feeling sad or Tearful                         0
Irritable towards baby & partner             463
Trouble sleeping at night                      0
Problems concentrating or making decision    943
Overeating or loss of appetite                 0
Feeling anxious                                0
Feeling of guilt                             695
Problems of bonding with baby                  0
Suicide attempt                                0
dtype: int64

In [None]:
# Drop 'Timestamp' if it is still present. The column is already removed from
# `data` before bootstrapping, so on a fresh top-to-bottom run it does not exist
# here and an unconditional drop would raise KeyError.
bootstrapped_data = bootstrapped_data.drop(columns=['Timestamp'], errors='ignore')
bootstrapped_data.head()

Unnamed: 0,Age,Feeling sad or Tearful,Irritable towards baby & partner,Trouble sleeping at night,Problems concentrating or making decision,Overeating or loss of appetite,Feeling anxious,Feeling of guilt,Problems of bonding with baby,Suicide attempt
0,45-50,No,Sometimes,Yes,Often,No,Yes,No,No,No
1,45-50,Yes,No,Two or more days a week,No,No,Yes,Maybe,Sometimes,No
2,25-30,Sometimes,Yes,Two or more days a week,Yes,No,No,Maybe,No,Yes
3,40-45,No,Sometimes,Yes,No,No,No,Maybe,Sometimes,Yes
4,30-35,Yes,No,Two or more days a week,No,No,Yes,Maybe,Sometimes,No


In [None]:
# Add a 1-based 'patient_id' column derived from the row index
bootstrapped_data = bootstrapped_data.assign(patient_id=bootstrapped_data.index + 1)
bootstrapped_data.head(n=5)

In [None]:
# bootstrapped_data = bootstrapped_data[['patient_id'] + [col for col in bootstrapped_data.columns if col != 'patient_id']]

In [None]:
# Dimensions of the bootstrapped frame as (rows, columns)
bootstrapped_data.shape

(120240, 10)

In [None]:
# Preview the last rows of the bootstrapped frame
bootstrapped_data.tail()

Unnamed: 0,Age,Feeling sad or Tearful,Irritable towards baby & partner,Trouble sleeping at night,Problems concentrating or making decision,Overeating or loss of appetite,Feeling anxious,Feeling of guilt,Problems of bonding with baby,Suicide attempt
120235,40-45,No,Yes,Two or more days a week,Yes,No,No,Yes,Yes,No
120236,35-40,No,Yes,Yes,,No,Yes,No,Yes,Yes
120237,40-45,No,Yes,Two or more days a week,No,No,Yes,No,No,Yes
120238,40-45,No,Yes,Two or more days a week,No,No,Yes,No,No,Yes
120239,40-45,Sometimes,Yes,No,Yes,No,Yes,No,Sometimes,Yes


In [None]:
# Inspect the final row of the bootstrapped frame by position
bootstrapped_data.iloc[-1]

Age                                              40-45
Feeling sad or Tearful                       Sometimes
Irritable towards baby & partner                   Yes
Trouble sleeping at night                           No
Problems concentrating or making decision          Yes
Overeating or loss of appetite                      No
Feeling anxious                                    Yes
Feeling of guilt                                    No
Problems of bonding with baby                Sometimes
Suicide attempt                                    Yes
Name: 120239, dtype: object

In [None]:
# Saving the bootstrapped data to a CSV file, without the index column.
# Writes `bootstrapped_data`; the name `bootstrapped` is undefined and raised
# NameError when this cell was run.
bootstrapped_data.to_csv('bootstrapped.csv', index=False)

bootstrapped.to_csv('path/bootstrapped.csv', index=False)

NameError: name 'bootstrapped' is not defined

## **Data Processing for Machine Learning**

In [None]:
# Old column name -> new column name, with every space swapped for '_'
column_rename_mapping = dict([
    ('Feeling sad or Tearful', 'Feeling_sad_or_Tearful'),
    ('Irritable towards baby & partner', 'Irritable_towards_baby_&_partner'),
    ('Trouble sleeping at night', 'Trouble_sleeping_at_night'),
    ('Problems concentrating or making decision', 'Problems_concentrating_or_making_decision'),
    ('Overeating or loss of appetite', 'Overeating_or_loss_of_appetite'),
    ('Feeling anxious', 'Feeling_anxious'),
    ('Feeling of guilt', 'Feeling_of_guilt'),
    ('Problems of bonding with baby', 'Problems_of_bonding_with_baby'),
    ('Suicide attempt', 'Suicide_attempt'),
])

# Apply the mapping to the column axis, modifying bootstrapped_data in place
bootstrapped_data.rename(mapper=column_rename_mapping, axis='columns', inplace=True)

In [None]:
# Per-column summary statistics of the bootstrapped frame
bootstrapped_data.describe()

In [None]:
# Read the bootstrapped dataset back from disk and preview the first five rows.
# NOTE(review): '/file_path/' looks like a placeholder, and the file name differs
# from the 'bootstrapped.csv' written by the save cell; confirm the intended file.
data_bs = pd.read_csv(filepath_or_buffer='/file_path/bstrapped_data.csv')
data_bs.head(n=5)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Name of the column treated as the prediction target
target_col = 'Suicide attempt'

# Work on a copy so that data_bs itself is left untouched
encoded_df = data_bs.copy()

# One-hot encode every column other than the target
feature_cols = [name for name in encoded_df.columns if name != target_col]
encoded_df = pd.get_dummies(encoded_df, columns=feature_cols)

# Replace the target labels with integer codes
label_encoder = LabelEncoder()
encoded_df[target_col] = label_encoder.fit_transform(encoded_df[target_col])

# Show the encoded DataFrame
print(encoded_df)

## **Checking for Supervised Classifier Models compatible with our Dataset**

In [None]:
!pip install lazypredict

In [None]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split  # was used below without being imported

# Separate the features from the target. The encoding cell stores the target
# under 'Suicide attempt' (with a space), so that is the key used here;
# 'Suicide_attempt' does not exist in encoded_df and raised KeyError.
X = encoded_df.drop(['Suicide attempt'], axis=1)
y = encoded_df['Suicide attempt']

# Split data into train and test sets: 20% held out, fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing LazyClassifier
clsfier = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Fit LazyClassifier on the training split and evaluate on the test split
models, predictions = clsfier.fit(X_train, X_test, y_train, y_test)

# Print model performance
print(models)