## Imports

In [3]:
import torch

if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    print('Using MPS')
    device = torch.device("mps")
elif torch.backends.cuda.is_built():
    print('Using CUDA')
    device = torch.device("cuda")
else:
    print('Using CPU')
    device = torch.device("cpu")

Using MPS


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

## Main Load

In [4]:
csv = pd.read_csv('data/user_data_public.csv', low_memory=False)

In [4]:
filtered_df = csv.loc[:, ~csv.columns.str.match(r'^q\d+')]

In [None]:
# f = filtered_df.d_country.value_counts().where(filtered_df.d_country.value_counts() > 100)

## A Little Experiment - Checking bodytype v sexual orientation correlations in Men

In [None]:
import numpy as np

# Filtering NA
bodytype_orientation = filtered_df[['d_bodytype', 'd_orientation', 'gender']].dropna()

# Filtering men only
bodytype_orientation = bodytype_orientation[
    (bodytype_orientation.gender == 'Man') & (bodytype_orientation.d_bodytype != 'Rather not say')].drop(
    columns=['gender'])

bodytype_orientation.d_orientation.unique()
# Distinguishing likely homosexual group from rest of men. Identified ['Gay', 'Gay, Queer', 'Gay, Sapiosexual'] as potential homosexual groups. Others  varied

potential_homosexual_groups = ['Gay', 'Gay, Queer']
# bodytype_orientation = bodytype_orientation[bodytype_orientation.d_orientation.isin(potential_homosexual_groups + ['Straight'])]

# Create the new column with 'h' for homosexual and 'o' for others
bodytype_orientation['d_orientation'] = np.where(
    bodytype_orientation['d_orientation'].isin(potential_homosexual_groups),
    'h',
    'o'
)
bodytype_orientation.value_counts()

In [None]:
(bodytype_orientation[
     bodytype_orientation.d_bodytype == 'Jacked'].d_orientation.value_counts() / bodytype_orientation.d_orientation.value_counts())

In [None]:
bodytype_orientation.d_orientation.value_counts()
# bodytype_orientation[bodytype_orientation.d_bodytype == 'Overwight'].value_counts() / bodytype_orientation.d_orientation.value_counts()

## Mixed Matchmaking

In [None]:
csv[csv.q71 == 'Yes'].race.value_counts() / csv[csv.q71.isin(['Yes', 'No'])].race.value_counts()

In [None]:
religions_csv = csv[['q71', 'd_religion_type', 'gender', 'race']].dropna(subset=['d_religion_type', 'q71'])
religions_csv = religions_csv[religions_csv['d_religion_type'] != '-']
religions_csv.d_religion_type.value_counts()

In [None]:
bad_idea_boolean = religions_csv.q71 == 'Yes'

In [None]:
religions_csv[bad_idea_boolean].d_religion_type.value_counts() / religions_csv.d_religion_type.value_counts()

## Police/Coutnry Safety

In [None]:
# q6109
safety = csv[['q6109', 'race', 'gender', 'd_country']].dropna(subset='q6109')

In [None]:
safety

## PCA

In [31]:
import random
from sklearn.preprocessing import StandardScaler

df = filtered_df.drop(columns=['CA', 'CA_items'])  # Remove uninterpreted columns
df = df.dropna(subset=['d_gender', 'd_age', 'p_ambi'])  # Core features

# Encode categoricals
cat_cols = ['d_astrology_sign', 'd_education_type', 'd_ethnicity']
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Normalize personality traits
p_cols = [c for c in df.columns if c.startswith('p_')]
scaler = StandardScaler()
df[p_cols] = scaler.fit_transform(df[p_cols].fillna(df[p_cols].mean()))

In [33]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# PCA for visualization
pca = PCA(n_components=0.95)
pca_features = pca.fit_transform(df[p_cols])

# t-SNE for cluster separation
tsne = TSNE(n_components=2, perplexity=30)
tsne_results = tsne.fit_transform(pca_features)




In [34]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Determine optimal clusters
silhouette_scores = []
for k in range(2, 8):
    kmeans = KMeans(n_clusters=k)
    labels = kmeans.fit_predict(pca_features)
    silhouette_scores.append(silhouette_score(pca_features, labels))

# Final clustering
optimal_k = np.argmax(silhouette_scores) + 2
kmeans = KMeans(n_clusters=optimal_k)
df['cluster'] = kmeans.fit_predict(pca_features)


In [35]:
# Cluster characteristics
cluster_profiles = df.groupby
cluster_profiles

<bound method DataFrame.groupby of              p_conf        d_astrology_seriosity    p_laidback  \
3     -6.539848e-16                          NaN  9.601352e-16   
5     -6.539848e-16        but it doesn't matter  9.601352e-16   
6     -6.539848e-16                          NaN  9.601352e-16   
22    -6.539848e-16  and it's fun to think about  9.601352e-16   
23    -6.539848e-16                          NaN  9.601352e-16   
...             ...                          ...           ...   
68363 -6.539848e-16  and it's fun to think about -4.200165e-02   
68364 -6.539848e-16  and it's fun to think about  9.601352e-16   
68365 -6.539848e-16                          NaN -2.879669e+00   
68369 -6.539848e-16                          NaN  9.601352e-16   
68370 -6.539848e-16                          NaN  9.601352e-16   

      d_education_phase  p_drug  \
3            Working on     0.0   
5        Dropped out of     0.0   
6            Working on     0.0   
22                  NaN     0.0 

## Top Questions

In [5]:
import pandas as pd

# Read the data
question_csv = pd.read_csv('data/question_data.csv', delimiter=';', low_memory=False)


In [6]:
# Get high-response questions
question_cols = list(
    filter(lambda st: st[0] == 'q', question_csv.loc[question_csv.N > 45000, 'question'].dropna().tolist()))

# Drop rows in csv with any NaNs in selected question columns
answerers = csv.dropna(subset=question_cols)

# Subset the answer columns
answers_raw = answerers[question_cols]

# One-hot encode categorical answers
answers_encoded = pd.get_dummies(answers_raw, columns=question_cols)

# Add age as target
answers_encoded['d_age'] = answerers.loc[answers_encoded.index, 'd_age']

In [7]:
# question_cols = list(filter(lambda st: st[0] == 'q', question_csv.loc[question_csv.N > 45000, 'question'].dropna().tolist()))
# question_cols

In [8]:
# answerers = csv.dropna(subset=question_cols)
# answers = answerers[question_cols + ['d_age']]
# answerers

In [17]:
from sklearn.model_selection import train_test_split
# Drop rows with NaNs in features or target before splitting
clean_data = answers_encoded.dropna(subset=answers_encoded.columns.tolist() + ['d_age'])

X = clean_data.drop(columns='d_age')
y = clean_data['d_age']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np


# Custom dataset class
class QuestionDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.FloatTensor(X.values)
        self.y = torch.FloatTensor(y.values)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


# Neural network model
class AgePredictor(nn.Module):
    def __init__(self, input_size):
        super(AgePredictor, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        return torch.clamp(self.layers(x), min=1, max=100)


# Create datasets
train_dataset = QuestionDataset(X_train, y_train)
test_dataset = QuestionDataset(X_test, y_test)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

# Initialize model, loss and optimizer
model = AgePredictor(X_train.shape[1]).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

# Training loop

num_epochs = 10
# Compute total steps
total_steps = num_epochs * (len(train_loader) + len(test_loader))
progress_bar = tqdm(total=total_steps, desc="Training Progress", leave=False)
for epoch in range(num_epochs):
    model.train()
    train_loss_total = 0.0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), targets)
        loss.backward()
        optimizer.step()

        train_loss_total += loss.item()
        progress_bar.update(1)
        progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs} [Train]")
        progress_bar.set_postfix(loss=loss.item())

    model.eval()
    val_loss_total = 0.0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets).item()
            val_loss_total += loss
            progress_bar.update(1)
            progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs} [Val]")
            progress_bar.set_postfix(loss=loss)

    # Print epoch summary
    print(f"Epoch {epoch+1}/{num_epochs} - "
          f"Train Loss: {train_loss_total / len(train_loader):.4f} | "
          f"Val Loss: {val_loss_total / len(test_loader):.4f}")

progress_bar.close()



Training Progress:   0%|          | 0/2590 [00:00<?, ?it/s]

Epoch 1/10 - Train Loss: 1149.3093 | Val Loss: 1143.2173
Epoch 2/10 - Train Loss: 1149.4158 | Val Loss: 1143.2173
Epoch 3/10 - Train Loss: 1149.2404 | Val Loss: 1143.2173
Epoch 4/10 - Train Loss: 1149.3892 | Val Loss: 1143.2173
Epoch 5/10 - Train Loss: 1149.2692 | Val Loss: 1143.2173
Epoch 6/10 - Train Loss: 1149.4427 | Val Loss: 1143.2173
Epoch 7/10 - Train Loss: 1149.4207 | Val Loss: 1143.2173
Epoch 8/10 - Train Loss: 1149.3983 | Val Loss: 1143.2173
Epoch 9/10 - Train Loss: 1149.3244 | Val Loss: 1143.2173
Epoch 10/10 - Train Loss: 1149.2060 | Val Loss: 1143.2173


In [19]:
print(np.isnan(X_train.values).sum(), np.isinf(X_train.values).sum())
print(np.isnan(y_train.values).sum(), np.isinf(y_train.values).sum())

0 0
0 0


In [20]:
print(y_train.describe())

count    13236.000000
mean        33.964037
std          7.919881
min         18.000000
25%         28.000000
50%         33.000000
75%         38.000000
max        100.000000
Name: d_age, dtype: float64


In [22]:
model.eval()
model(answers_encoded[0::])

TypeError: linear(): argument 'input' (position 1) must be Tensor, not DataFrame

In [26]:
train_loader.sampler

<torch.utils.data.sampler.RandomSampler at 0x7feb903f7520>