In [14]:
import pandas as pd

### Inspect Data

In [15]:
# Load the per-user activity summary and preview the first five rows.
depression_df = pd.read_csv('../csv_files/user_activity_summary_d.csv')

depression_df.head()

Unnamed: 0,User ID,# of posts,Post raw word count,# of comments,Comment raw word count,Words total,Avg time b/w activities,In r/SuicideWatch
0,[deleted],211324,18814454,489114,6980379,25794833,442.94764,y
1,OAThrowaway,2,339,6,809,1148,206716.25,n
2,whitefirebird,1,81,0,0,81,0.0,n
3,throwaway45678,1,738,2,164,902,20369.66667,n
4,fingernailing,1,98,2,25,123,78247.33333,n


In [16]:
# Per-column dtypes, used to tell the numeric columns from the string ones.
data_types = depression_df.dtypes
print(data_types)

User ID                     object
# of posts                   int64
Post raw word count          int64
# of comments                int64
Comment raw word count       int64
Words total                  int64
Avg time b/w activities    float64
In r/SuicideWatch           object
dtype: object


In [17]:
# Descriptive statistics for the numeric columns, rounded to 2 decimals.
# NOTE(review): the `[deleted]` row shown by head() carries the same values as
# every `max` below, so it presumably pools many accounts and dominates
# mean/std -- confirm whether it should be treated as a single user.
summary_stats = depression_df.describe().round(2)
summary_stats

Unnamed: 0,# of posts,Post raw word count,# of comments,Comment raw word count,Words total,Avg time b/w activities
count,430684.0,430684.0,430684.0,430684.0,430684.0,430684.0
mean,1.47,243.22,6.53,343.09,586.31,1278098.0
std,322.02,28672.6,745.74,10815.7,39362.1,4422852.0
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,7.0,64.0,0.0
50%,1.0,8.0,1.0,66.0,194.0,767.33
75%,1.0,239.0,4.0,227.0,484.0,379359.5
max,211324.0,18814454.0,489114.0,6980379.0,25794833.0,111955600.0


In [18]:
# Isolate the string-typed (object) columns and sanity-check each of them.
categorical_cols = depression_df.select_dtypes(include=["object"])
print(categorical_cols.columns)

# Each row is expected to describe a distinct user.
user_id_unique = categorical_cols["User ID"].is_unique
print("User ID unique:", user_id_unique, end="\n\n")

# Class balance of the r/SuicideWatch flag.
sw_counts = categorical_cols["In r/SuicideWatch"].value_counts()
print(sw_counts)

Index(['User ID', 'In r/SuicideWatch'], dtype='object')
User ID unique: True

In r/SuicideWatch
n    377845
y     52839
Name: count, dtype: int64


In [19]:
# Missing data: count nulls per column (the printed result shows none missing).
missing_data = depression_df.isna().sum()
print(missing_data)

User ID                    0
# of posts                 0
Post raw word count        0
# of comments              0
Comment raw word count     0
Words total                0
Avg time b/w activities    0
In r/SuicideWatch          0
dtype: int64


In [20]:
# Zero-activity check: a row counts as "zero activity" when every column other
# than User ID and In r/SuicideWatch is 0, so only the numeric columns are
# inspected.
numerical_data = depression_df.select_dtypes(include=["int64", "float64"])

zero_activity_data = numerical_data.eq(0).all(axis=1).sum()

print("Number of Zero Activity:", zero_activity_data)

Number of Zero Activity: 0


In [21]:
# Check whether the number of users with 0 post word count equals the number
# of users with 0 posts.
zero_posts = depression_df["# of posts"].eq(0).sum()
zero_post_wc = depression_df["Post raw word count"].eq(0).sum()

print("Total users with 0 posts: ", zero_posts)
print("Total users with 0 post wc: ", zero_post_wc)

Total users with 0 posts:  199137
Total users with 0 post wc:  205784


In [22]:
# Some users have 1 post but 0 post raw word count. Ignore "# of posts" and
# flag the rows where every remaining numeric column is 0.
check_activity = numerical_data.drop(columns="# of posts")

zero_activity_data = check_activity.eq(0).all(axis=1)

print("Number of Zero Activity (w/o # of posts):", zero_activity_data.sum())

zero_rows = depression_df.loc[zero_activity_data]

print(zero_rows.head(3))

Number of Zero Activity (w/o # of posts): 3244
             User ID  # of posts  Post raw word count  # of comments  \
9   newdayfreshstart           1                    0              0   
35    KarmaPolice777           1                    0              0   
41            AlfaFu           1                    0              0   

    Comment raw word count  Words total  Avg time b/w activities  \
9                        0            0                      0.0   
35                       0            0                      0.0   
41                       0            0                      0.0   

   In r/SuicideWatch  
9                  n  
35                 n  
41                 n  


In [23]:
# Some users have 1 comment but 0 comment raw word count. Ignore
# "# of comments" and flag the rows where every remaining numeric column is 0.
check_activity = numerical_data.drop(columns="# of comments")

zero_activity_data = check_activity.eq(0).all(axis=1)

print("Number of Zero Activity (w/o # of comments):", zero_activity_data.sum())

zero_rows = depression_df.loc[zero_activity_data]

print(zero_rows.head(3))


Number of Zero Activity (w/o # of comments): 11
                User ID  # of posts  Post raw word count  # of comments  \
301535        Poodle448           0                    0              1   
308324     LostBreezy24           0                    0              1   
318901  itzlinknotzelda           0                    0              1   

        Comment raw word count  Words total  Avg time b/w activities  \
301535                       0            0                      0.0   
308324                       0            0                      0.0   
318901                       0            0                      0.0   

       In r/SuicideWatch  
301535                 n  
308324                 n  
318901                 n  


In [24]:
# Cleaning rule: drop a row when either
#   "# of posts"    > "Post raw word count", or
#   "# of comments" > "Comment raw word count"
# i.e. activity was recorded but there is no text to go with it.
#
# Note: filtering on "Words total" == 0 alone will not clean the data,
# e.g. Insecure_Young_Boy,1,0,1,1,1,713.0,n has a non-zero total but still
# has 1 post with 0 post words.

# Rows we want to KEEP satisfy both of these conditions.
posts_condition = depression_df["# of posts"] <= depression_df["Post raw word count"]
comments_condition = depression_df["# of comments"] <= depression_df["Comment raw word count"]

# Rows to delete are the ones that FAIL the keep mask, hence the negation.
# (Taking the index of the keep mask here would throw away the valid rows
# and retain the inconsistent ones.)
rows_to_delete = depression_df[~(posts_condition & comments_condition)].index

# Bind the cleaned data to a new name: the cells below read `filtered_df`,
# and leaving `depression_df` untouched keeps this cell safe to re-run.
filtered_df = depression_df.drop(index=rows_to_delete).reset_index(drop=True)


### Results
Based on the results above, I think it is safe to drop rows where either 1) # of posts > Post raw word count or 2) # of comments > Comment raw word count. This removes users with no real activity (and, I suspect, incorrectly recorded data) - let me know what you think.

In [25]:
# Percentage of zeros in each column after cleaning. Expected:
#   share of 0 posts    == share of 0 post raw word count
#   share of 0 comments == share of 0 comment raw word count
int_columns = [
    "# of posts",
    "Post raw word count",
    "# of comments",
    "Comment raw word count",
    "Words total",
    "Avg time b/w activities",
]

for column in int_columns:
    sum_zero_column = filtered_df[column].eq(0).sum()
    zero_col_per = (sum_zero_column / len(filtered_df)) * 100
    print(f"{column}: {zero_col_per:.2f}%")


# of posts: 46.98%
Post raw word count: 46.98%
# of comments: 19.94%
Comment raw word count: 19.94%
Words total: 0.00%
Avg time b/w activities: 42.30%


# Outliers

I don't think we should remove outliers since they are important for our models and also since our data is pretty limited.

In [26]:
def detect_outliers_iqr(df, column, multiplier=1.5):
    """Flag the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect; it is not modified.
    column : str
        Name of the column to test.
    multiplier : float, default 1.5
        Width of the fences in units of IQR. The default reproduces the
        conventional 1.5 * IQR rule.

    Returns
    -------
    tuple
        ``(outliers, lower_outliers, upper_outliers, lower_bound, upper_bound)``.
        ``outliers`` is the lower outliers followed by the upper outliers, so
        its rows are grouped by side rather than kept in ``df``'s row order.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    # Strict comparisons: values sitting exactly on a fence are not outliers.
    lower_outliers = df[values < lower_bound]
    upper_outliers = df[values > upper_bound]
    outliers = pd.concat([lower_outliers, upper_outliers], axis=0)

    return outliers, lower_outliers, upper_outliers, lower_bound, upper_bound


# Run the IQR check once per numeric column (column names come from
# `numerical_data`, rows from `filtered_df`) and remember only the columns
# that produced at least one outlier row.
outliers_dict = {}
for col in numerical_data.columns:
    outliers = detect_outliers_iqr(filtered_df, col)[0]
    if outliers.empty:
        continue
    outliers_dict[col] = outliers

# Report how many outlier rows were found in each column.
if not outliers_dict:
    print("No outliers detected in any numerical columns.")
else:
    print("Using IQR Rule we found outliers in:")
    for col, outliers in outliers_dict.items():
        print(f"  '{col}': {len(outliers)} entries")

Using IQR Rule we found outliers in:
  '# of posts': 28891 entries
  'Post raw word count': 37054 entries
  '# of comments': 49651 entries
  'Comment raw word count': 48242 entries
  'Words total': 40504 entries
  'Avg time b/w activities': 80972 entries


### Note: Used data-clearning.ipynb instead for creating csv