# Replication of Table 2

This notebook is trying to replicate Table 2 in Kosinksi's paper, i.e. train 

In [1]:
import pandas as pd
import numpy as np

In [2]:
from data.load_data import load_faces, load_features

features = load_features()
faces = load_faces()

In [4]:
def get_sample_and_label(row):
    for i in ground_truth:
        if ~np.isnan(row[i]):
            return i, row[i]
    print(f"No label found for userid: {row['userid']}")

In [5]:
ground_truth = ['pol_dat_us', 'pol_dat_ca', 'pol_dat_uk', 'pol_fb_us']

In [6]:
def get_not_nan_labels(colname):
    col = faces[colname]
    return col[col.notna()]

In [7]:
samples = {k: get_not_nan_labels(k) for k in ground_truth}

In [8]:
np.array(samples[ground_truth[0]].values)

array([1., 1., 0., ..., 0., 0., 1.])

In [9]:
np.array(samples[ground_truth[0]].index)

array([    759,     790,     943, ..., 1085790, 1085793, 1085795])

In [10]:
np.random.seed(67)

In [11]:
# num_of_training_samples = len(train_idx)

In [12]:
max_chunksize = 271448 # len(features) // 4

In [13]:
# number_of_minibatches = 4
# minibatch_chunk_size = num_of_training_samples // number_of_minibatches
# minibatch_chunk_size

In [14]:
from sklearn.linear_model import SGDClassifier, Lasso

model = SGDClassifier(alpha=0.01, penalty='elasticnet',loss='log')
# model = Lasso(alpha=0.1)

In [15]:
def get_binary_accuracy(preds, actual):
    num_of_correct_predictions = (preds == actual).sum()
    accuracy = num_of_correct_predictions / len(actual)
    return accuracy

In [22]:
from tqdm import tqdm, trange

max_chunksize = 271448 # len(features) // 4

params = {
    "alpha":0.01, "penalty":'elasticnet',"loss":'log'
}
models = {}

with tqdm(ground_truth) as t:
    for sample_name in t:
        t.set_description(f'Sample {sample_name}')
        model = SGDClassifier(**params)
        sample = samples[sample_name]
        indexes = np.array(sample.index) - 1
        values = np.array(sample.values)
        
        
        num_of_batches = len(indexes) // max_chunksize
        
        if num_of_batches == 0:
            y = values
            X = features[indexes]
            model.fit(X, y)
        else:
            for i in range(num_of_batches):
                batch_idx = indexes[max_chunksize * i : max_chunksize * (i + 1)]
                y = values[max_chunksize * i : max_chunksize * (i + 1)]
                X = features[batch_idx]
                model.partial_fit(X, y, classes=[0,1])
        
        models[sample_name] = model

Sample pol_fb_us: 100%|███████████████████████████| 4/4 [01:18<00:00, 19.57s/it]


In [23]:
import joblib

def save_model(model, filepath):
    joblib.dump(model, filepath)

In [24]:
models

{'pol_dat_us': SGDClassifier(alpha=0.01, loss='log', penalty='elasticnet'),
 'pol_dat_ca': SGDClassifier(alpha=0.01, loss='log', penalty='elasticnet'),
 'pol_dat_uk': SGDClassifier(alpha=0.01, loss='log', penalty='elasticnet'),
 'pol_fb_us': SGDClassifier(alpha=0.01, loss='log', penalty='elasticnet')}

In [25]:
for sample_name in ground_truth:
    save_model(models[sample_name], f'./saved_model/lasso_{sample_name}.joblib')

In [27]:
acc_matrix = np.zeros([4,4])

with tqdm(enumerate(ground_truth)) as t:
    for i, sample_name in enumerate(ground_truth):
        t.set_description(f'Sample {sample_name}')
        model = models[sample_name]
        
        for j, test_sample in enumerate(ground_truth):
            t.set_postfix({'imputing':test_sample})
            
            sample = samples[test_sample]
            indexes = np.array(sample.index) - 1
            values = np.array(sample.values)
            
            X = features[indexes]
            y_pred = model.predict(X)
            
            acc = get_binary_accuracy(y_pred, values)
            
            acc_matrix[i,j] = acc


Sample pol_fb_us: : 0it [01:45, ?it/s, imputing=pol_fb_us]  


In [28]:
acc_matrix

array([[0.65417435, 0.61575936, 0.61173645, 0.66468552],
       [0.55716935, 0.69984506, 0.62336849, 0.64930845],
       [0.58892521, 0.64507911, 0.66893599, 0.64733656],
       [0.62280098, 0.62462382, 0.61390832, 0.68395082]])

In [37]:
df = pd.DataFrame(acc_matrix * 100, columns=[f'accuracy on {i}' for i in ground_truth], index=[f'model trained on {i}' for i in ground_truth])

In [39]:
df

Unnamed: 0,accuracy on pol_dat_us,accuracy on pol_dat_ca,accuracy on pol_dat_uk,accuracy on pol_fb_us
model trained on pol_dat_us,65.417435,61.575936,61.173645,66.468552
model trained on pol_dat_ca,55.716935,69.984506,62.336849,64.930845
model trained on pol_dat_uk,58.892521,64.507911,66.893599,64.733656
model trained on pol_fb_us,62.280098,62.462382,61.390832,68.395082


In [40]:
df.to_csv("./results/table_2_reproduced.csv")