# Replication of Table 2

This notebook is trying to replicate Table 2 in Kosinksi's paper, i.e. train 

In [27]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier

In [26]:
from data.load_data import load_faces, load_features

features = load_features()
faces = load_faces()

In [28]:
np.random.seed(67)

def get_sample_and_label(row):
    for i in ground_truth:
        if ~np.isnan(row[i]):
            return i, row[i]
    print(f"No label found for userid: {row['userid']}")

def get_not_nan_labels(colname):
    col = faces[colname]
    return col[col.notna()]

def get_binary_accuracy(preds, actual):
    num_of_correct_predictions = (preds == actual).sum()
    accuracy = num_of_correct_predictions / len(actual)
    return accuracy

ground_truth = ['pol_dat_us', 'pol_dat_ca', 'pol_dat_uk', 'pol_fb_us']
samples = {k: get_not_nan_labels(k) for k in ground_truth}

In [42]:
from tqdm import tqdm, trange

max_chunksize = len(features) // 4

def create_model():
    # alpha 0.0001 penalty  seems to be optimal
    params = {
        "alpha": 0.0001, "penalty": 'l2',"loss": 'log'
    }
    
    
    return SGDClassifier(**params)    


models = {k: create_model() for k in ground_truth}

with tqdm(ground_truth) as t:
    for sample_name in t:
        t.set_description(f'Sample {sample_name}')
        model = models[sample_name]

        # get indexes and values for each sample
        sample = samples[sample_name]
        indexes = np.array(sample.index) - 1
        values = np.array(sample.values)
        
        
        num_of_batches = len(indexes) // max_chunksize
        
        if num_of_batches == 0:
            y = values
            X = features[indexes]
            model.fit(X, y)
        else:
            for i in range(num_of_batches):
                batch_idx = indexes[max_chunksize * i : max_chunksize * (i + 1)]
                y = values[max_chunksize * i : max_chunksize * (i + 1)]
                X = features[batch_idx]
                model.partial_fit(X, y, classes=[0,1])
        
        models[sample_name] = model

Sample pol_fb_us: 100%|███████████████████████████| 4/4 [04:59<00:00, 74.89s/it]


In [43]:
models

{'pol_dat_us': SGDClassifier(loss='log'),
 'pol_dat_ca': SGDClassifier(loss='log'),
 'pol_dat_uk': SGDClassifier(loss='log'),
 'pol_fb_us': SGDClassifier(loss='log')}

In [44]:
acc_matrix = np.zeros([4,4])

with tqdm(enumerate(ground_truth)) as t:
    for i, sample_name in enumerate(ground_truth):
        t.set_description(f'Sample {sample_name}')
        model = models[sample_name]
        
        for j, test_sample in enumerate(ground_truth):
            t.set_postfix({'imputing':test_sample})
            
            sample = samples[test_sample]
            indexes = np.array(sample.index) - 1
            values = np.array(sample.values)
            
            X = features[indexes]
            y_pred = model.predict(X)
            
            acc = get_binary_accuracy(y_pred, values)
            
            acc_matrix[i,j] = acc


Sample pol_fb_us: : 0it [01:50, ?it/s, imputing=pol_fb_us]  


In [45]:
acc_matrix

array([[0.57808686, 0.54652722, 0.54647593, 0.6027051 ],
       [0.58666388, 0.68540866, 0.62155163, 0.64508693],
       [0.56384552, 0.60044397, 0.6419338 , 0.61759151],
       [0.60371478, 0.5866035 , 0.58381539, 0.66443556]])

In [22]:
df = pd.DataFrame(acc_matrix * 100, columns=[f'accuracy on {i}' for i in ground_truth], index=[f'model trained on {i}' for i in ground_truth])

In [24]:
df

Unnamed: 0,accuracy on pol_dat_us,accuracy on pol_dat_ca,accuracy on pol_dat_uk,accuracy on pol_fb_us
model trained on pol_dat_us,60.295444,54.363696,54.967109,62.12946
model trained on pol_dat_ca,56.243147,70.100712,60.649473,63.841212
model trained on pol_dat_uk,58.805707,62.454933,67.188055,63.731971
model trained on pol_fb_us,61.386233,59.408242,59.876788,68.001629


In [46]:
df.to_csv("./results/table_2_reproduced.csv")

In [47]:
import joblib

def save_model(model, filepath):
    joblib.dump(model, filepath)

for sample_name in ground_truth:
    save_model(models[sample_name], f'./saved_model/lasso_{sample_name}.joblib')