In [None]:
!pip install kagglehub

Collecting kagglehub
  Downloading kagglehub-0.3.11-py3-none-any.whl.metadata (32 kB)
Downloading kagglehub-0.3.11-py3-none-any.whl (63 kB)
Installing collected packages: kagglehub
Successfully installed kagglehub-0.3.11

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import os

import kagglehub

# dataset_download() fetches the dataset (if needed) and returns the local
# directory that holds the extracted files.
muhammadroshaanriaz_time_wasters_on_social_media_path = kagglehub.dataset_download('muhammadroshaanriaz/time-wasters-on-social-media')

print('Data source import complete.')

# List the downloaded files. Walk the directory kagglehub returned rather than
# '/kaggle/input', which only exists inside Kaggle's own environment and
# therefore printed nothing here.
for dirname, _, filenames in os.walk(muhammadroshaanriaz_time_wasters_on_social_media_path):
    for filename in filenames:
        print(os.path.join(dirname, filename))

Downloading from https://www.kaggle.com/api/v1/datasets/download/muhammadroshaanriaz/time-wasters-on-social-media?dataset_version_number=1...


100%|██████████| 36.0k/36.0k [00:00<00:00, 278kB/s]

Extracting files...
Data source import complete.





In [None]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset

# Build the CSV path from the directory kagglehub returned instead of
# hard-coding the /root cache location, so the cell also works when the
# cache lives elsewhere (different user, OS, or dataset version).
csv_path = os.path.join(
    muhammadroshaanriaz_time_wasters_on_social_media_path,
    'Time-Wasters on Social Media.csv',
)
df = pd.read_csv(csv_path)
print('### first 5 lines ###','\n')
df.head()



In [None]:
# helper func
def convertHours(value):
    """Convert a free-form "hours per day" answer into a single number.

    Numeric values and numeric strings are returned as ``float(value)``.
    For any other string, every number embedded in the text is extracted and
    the mean is returned, so "Between 2 and 3 hours" and "2-3 hours" both
    give 2.5 and "5+ hours" gives 5.0.

    Args:
        value: The raw answer (number, string, or anything else).

    Returns:
        The parsed number of hours, or 0.0 when no number can be found
        (including ``None`` and other non-string, non-numeric inputs).
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # TypeError covers None and other objects float() cannot handle;
        # only strings are worth scanning for embedded numbers.
        if not isinstance(value, str):
            return 0.0

    # Pull numbers out with a regex instead of whitespace tokenising, so that
    # numbers glued to punctuation or text ("2-3", "5+", "3hrs") are found.
    numbers = [float(token) for token in re.findall(r'\d*\.?\d+', value)]
    if not numbers:
        return 0.0
    return sum(numbers) / len(numbers)

def preprocess(filepath):
    """Load the survey data and turn it into scaled train/val/test tensors.

    Steps: drop rows containing any missing value, derive ``num_platforms``
    (count of comma-separated entries in the platforms column) and the binary
    ``target`` (1 when the parsed daily hours exceed 3, else 0), split the data
    roughly 70/15/15 into train/validation/test with ``random_state=42``,
    standardise the features using statistics fitted on the training split
    only, and convert everything to float32 tensors.

    NOTE(review): the column names below are survey-question headers; confirm
    that the CSV being passed actually contains them.

    Args:
        filepath: Path to the CSV file, or an already-loaded
            ``pandas.DataFrame`` with the same columns.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)`` as float32
        tensors; each label tensor has a trailing dimension of size 1.

    Raises:
        KeyError: If a required column is missing from the input data.
    """
    platforms_col = '7. What social media platforms do you commonly use?'
    hours_col = '8. What is the average time you spend on social media every day?'
    survey_feature_cols = [
        '9. How often do you find yourself using Social media without a specific purpose?',
        '12. On a scale of 1 to 5, how easily distracted are you?',
        '14. Do you find it difficult to concentrate on things?',
    ]
    feature_cols = ['num_platforms', *survey_feature_cols]

    # Accept a DataFrame as well as a path so callers that already loaded the
    # CSV do not have to read it again.
    if isinstance(filepath, pd.DataFrame):
        df = filepath
    else:
        df = pd.read_csv(filepath)

    # Fail early with a readable message instead of a KeyError deep inside pandas.
    required_cols = [platforms_col, hours_col, *survey_feature_cols]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f'Input data is missing required column(s): {missing_cols}')

    # Explicit copy so the derived columns are never written to the caller's frame.
    df = df.dropna().copy()
    df['num_platforms'] = df[platforms_col].apply(lambda x: len(str(x).split(',')))
    df['target'] = df[hours_col].apply(lambda x: 1 if convertHours(x) > 3 else 0)

    X = df[feature_cols].values
    y = df['target'].values

    # split training + validation & testing (85/15)
    X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

    # split: training & validation (85/15 from 85, 0.1765 * 85% ≈ 15% of total)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1765, random_state=42)

    # Fit the scaler on the training split only, then reuse it for val/test.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
    X_val = torch.tensor(X_val, dtype=torch.float32)
    y_val = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
class LogisticRegressionModel(nn.Module):
    """Logistic regression: input dropout, one linear unit, then a sigmoid.

    The forward pass returns values in (0, 1) with one output per input row.
    """

    def __init__(self, input_dim):
        """Create the layers for inputs with ``input_dim`` features."""
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=1)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Apply dropout to ``x``, project to one unit and squash with sigmoid."""
        logits = self.linear(self.dropout(x))
        return logits.sigmoid()

In [None]:
def trainModel(filepath, epochs=50, batch_size=32, learning_rate=0.01):
    """Train the logistic-regression model and save it to ``./models``.

    The data is loaded and split by ``preprocess(filepath)``. The held-out
    test split is written to ``./models/test_data.pth`` before training and
    the trained weights to ``./models/productivity_model.pth`` afterwards.

    Args:
        filepath: Input passed straight through to ``preprocess``.
        epochs: Number of passes over the training split.
        batch_size: Mini-batch size for the training loader.
        learning_rate: Learning rate for the Adam optimiser.

    Returns:
        The trained ``LogisticRegressionModel`` (left in eval mode).
    """
    # preprocess data
    X_train, X_val, X_test, y_train, y_val, y_test = preprocess(filepath)
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model = LogisticRegressionModel(X_train.shape[1])
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)

    # torch.save does not create missing directories.
    os.makedirs('./models', exist_ok=True)
    torch.save((X_test, y_test), './models/test_data.pth')

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight by batch size so the epoch average is per-sample.
            running_loss += loss.item() * inputs.size(0)
        train_loss = running_loss / max(len(train_dataset), 1)

        # validation
        model.eval()
        with torch.no_grad():
            val_loss = criterion(model(X_val), y_val).item()

        # Report progress every 10 epochs (and on the last one) so the
        # validation loss is actually visible without flooding the output.
        if (epoch + 1) % 10 == 0 or epoch + 1 == epochs:
            print(f'Epoch {epoch + 1}/{epochs} - train loss: {train_loss:.4f} - val loss: {val_loss:.4f}')

    torch.save(model.state_dict(), './models/productivity_model.pth')
    return model

In [None]:
def testModel(filepath):
    """Evaluate the saved model on the saved test split and report metrics.

    Loads ``./models/test_data.pth`` and ``./models/productivity_model.pth``,
    thresholds the model outputs at 0.5, prints the metrics and returns them.

    Args:
        filepath: Unused; kept so existing calls keep working.

    Returns:
        dict with keys ``mse_model``, ``mse_baseline_productivity_loss``,
        ``mse_baseline_addiction_level``, ``accuracy``, ``precision``,
        ``recall`` and ``f1``.
    """
    # torch.load unpickles its input: only load files written by this notebook.
    X_test, y_test = torch.load('./models/test_data.pth')

    # Size the model from the saved data instead of hard-coding 4 features.
    model = LogisticRegressionModel(input_dim=X_test.shape[1])
    model.load_state_dict(torch.load('./models/productivity_model.pth'))
    model.eval()

    with torch.no_grad():
        outputs = model(X_test)
    predictions = (outputs >= 0.5).float()

    # Flatten the (n, 1) tensors to 1-D arrays for the sklearn metrics.
    y_true = y_test.numpy().ravel()
    y_prob = outputs.numpy().ravel()
    y_pred = predictions.numpy().ravel()

    # baseline value predictions
    # NOTE(review): these constants are far outside the 0/1 range of the
    # labels, so the baseline MSEs below are not a like-for-like comparison
    # with the model's MSE - confirm what they are meant to represent.
    baseline_prediction_productivity_loss = 4.80
    baseline_prediction_addiction_level = 7.16

    # use the baseline for MSE comparison
    baseline_productivity_loss = np.full_like(y_true, baseline_prediction_productivity_loss)
    baseline_addiction_level = np.full_like(y_true, baseline_prediction_addiction_level)

    metrics = {
        # MSE of the predicted probabilities against the 0/1 labels
        'mse_model': mean_squared_error(y_true, y_prob),
        'mse_baseline_productivity_loss': mean_squared_error(y_true, baseline_productivity_loss),
        'mse_baseline_addiction_level': mean_squared_error(y_true, baseline_addiction_level),
        'accuracy': accuracy_score(y_true, y_pred),
        # zero_division=0 avoids warnings/undefined values when the model
        # predicts (or the split contains) no positive samples.
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
    }

    mse_model = metrics['mse_model']
    print(f'Mean Squared Error (Model): {mse_model:.4f}')
    print(f"Mean Squared Error (Baseline, productivity loss): {metrics['mse_baseline_productivity_loss']:.4f}")
    print(f"Mean Squared Error (Baseline, addiction level): {metrics['mse_baseline_addiction_level']:.4f}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1-score: {metrics['f1']:.4f}")

    return metrics

In [None]:
# Run the full pipeline
# preprocess(), trainModel() and testModel() all take the CSV path:
# preprocess() returns six tensors, trainModel() does its own preprocessing
# and saves the test split and weights under ./models, and testModel() reads
# those files back.
pipeline_csv_path = os.path.join(
    muhammadroshaanriaz_time_wasters_on_social_media_path,
    'Time-Wasters on Social Media.csv',
)

# Splits kept at notebook level for inspection.
X_train, X_val, X_test, y_train, y_val, y_test = preprocess(pipeline_csv_path)
y_test_numpy = y_test.numpy()

model = trainModel(pipeline_csv_path)
testModel(pipeline_csv_path)