In [13]:
import pickle

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler

In [14]:
# Read the user-activity CSV (path is relative to the notebook's working directory)
# into the DataFrame that the rest of the notebook works from.
depression_df = pd.read_csv(
    filepath_or_buffer='../csv_files/clear_user_activity_d.csv',
)

In [15]:
# Quick schema check: prints each column's name, non-null count and dtype,
# plus the frame's approximate memory usage.
depression_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423839 entries, 0 to 423838
Data columns (total 8 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   User ID                  423839 non-null  object 
 1   # of posts               423839 non-null  int64  
 2   Post raw word count      423839 non-null  int64  
 3   # of comments            423839 non-null  int64  
 4   Comment raw word count   423839 non-null  int64  
 5   Words total              423839 non-null  int64  
 6   Avg time b/w activities  423839 non-null  float64
 7   In r/SuicideWatch        423839 non-null  object 
dtypes: float64(1), int64(5), object(2)
memory usage: 25.9+ MB


### Drop User ID

In [16]:
# Drop the "User ID" column; every other column (including the
# "In r/SuicideWatch" label) is kept in X for now.
# `columns=` already names the axis, so the redundant `axis=1` is not passed.
X = depression_df.drop(columns=["User ID"])
X.head(5)

Unnamed: 0,# of posts,Post raw word count,# of comments,Comment raw word count,Words total,Avg time b/w activities,In r/SuicideWatch
0,2,339,6,809,1148,206716.25,n
1,1,81,0,0,81,0.0,n
2,1,738,2,164,902,20369.66667,n
3,1,98,2,25,123,78247.33333,n
4,1,174,3,158,332,10011.0,n


### Encode "In r/SuicideWatch" 
- This column will be the target variable.

In [17]:
# Turn the "In r/SuicideWatch" labels into integer codes.
encoder = LabelEncoder()
sw_labels = X["In r/SuicideWatch"]
X_sw_encoded = encoder.fit_transform(sw_labels)

# Pair every fitted class with its position in `classes_` so the
# label -> integer correspondence can be inspected.
label_mapping = {label: code for code, label in enumerate(encoder.classes_)}
print("Label to Integer Mapping:", label_mapping, sep="\n")

Label to Integer Mapping:
{'n': 0, 'y': 1}


### Scale Numerical Columns

In [18]:
# Collect the int64/float64 column names from X itself -- the frame that is
# indexed and scaled below -- rather than from the raw `depression_df`, so the
# column list can never name a column that X does not have.
X_numerical_col = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
print(X_numerical_col)

# Standardize the numeric columns with a default-configured StandardScaler;
# the result is a NumPy array whose columns follow the order of X_numerical_col.
# NOTE(review): the scaler is fit on every row. If a train/test split happens
# downstream, consider fitting on the training rows only -- confirm with the
# modelling notebook.
scalar = StandardScaler()
X_num_scaled = scalar.fit_transform(X[X_numerical_col])

['# of posts', 'Post raw word count', '# of comments', 'Comment raw word count', 'Words total', 'Avg time b/w activities']


In [19]:
# Wrap the scaled array back into a DataFrame, re-attaching the numeric
# column names, then show the first five rows as plain text.
X_num_scaled_df = pd.DataFrame(data=X_num_scaled, columns=X_numerical_col)
print(X_num_scaled_df.head())

   # of posts  Post raw word count  # of comments  Comment raw word count  \
0    0.371954             0.285386       0.023626                0.249286   
1    0.009223            -0.254945      -0.213570               -0.169573   
2    0.009223             1.121013      -0.134505               -0.084662   
3    0.009223            -0.219342      -0.134505               -0.156629   
4    0.009223            -0.060174      -0.094972               -0.087769   

   Words total  Avg time b/w activities  
0     0.294727                -0.242376  
1    -0.214338                -0.289223  
2     0.177361                -0.284607  
3    -0.194299                -0.271491  
4    -0.094586                -0.286955  


In [20]:
# Hold the encoded labels in a one-column DataFrame so they can be joined
# column-wise with the scaled features.
X_sw_encoded_df = pd.DataFrame({"In r/SuicideWatch": X_sw_encoded})

X_sw_encoded_df.head(n=10)

Unnamed: 0,In r/SuicideWatch
0,0
1,0
2,0
3,0
4,0
5,0
6,1
7,0
8,0
9,0


In [21]:
# Place the scaled features and the encoded label side by side (column-wise).
X_merged = pd.concat(objs=[X_num_scaled_df, X_sw_encoded_df], axis="columns")

X_merged.head(n=10)

Unnamed: 0,# of posts,Post raw word count,# of comments,Comment raw word count,Words total,Avg time b/w activities,In r/SuicideWatch
0,0.371954,0.285386,0.023626,0.249286,0.294727,-0.242376,0
1,0.009223,-0.254945,-0.21357,-0.169573,-0.214338,-0.289223,0
2,0.009223,1.121013,-0.134505,-0.084662,0.177361,-0.284607,0
3,0.009223,-0.219342,-0.134505,-0.156629,-0.194299,-0.271491,0
4,0.009223,-0.060174,-0.094972,-0.087769,-0.094586,-0.286955,0
5,0.009223,-0.198399,-0.174037,-0.162325,-0.194777,-0.285685,0
6,3.636531,5.341036,2.276987,1.857935,2.92879,-0.246332,1
7,0.009223,-0.227719,-0.055439,-0.144721,-0.185235,-0.28834,0
8,0.009223,-0.160701,-0.134505,-0.126082,-0.152792,-0.28586,0
9,0.371954,-0.125098,-0.094972,-0.161807,-0.177601,-0.268666,0


In [22]:
# Separate the merged frame into the label Series (y) and the feature
# columns (X, i.e. everything except the label).
target_col = "In r/SuicideWatch"
y = X_merged[target_col]
X = X_merged.drop(columns=[target_col])

In [23]:
# Persist the (X, y) tuple to 'data.pk1', relative to the working directory.
# `pickle` is imported in the notebook's import cell at the top.
# NOTE(review): the extension is "pk1" (digit one), possibly meant to be
# "pkl"; left unchanged because other code may load this exact filename.
with open('data.pk1', 'wb') as f:
    pickle.dump((X, y), f)

# To load the saved data elsewhere, use:
#
# import pickle
# with open('data.pk1', 'rb') as f:
#     X, y = pickle.load(f)