**Libraries**

In [1]:
import pandas as pd
import json
import numpy as np

# Radar chart - Data generation

> The main goal of this notebook is to preprocess data so that it fits the required format when uploading to `scriptRadar.js` and `radarChart.js`.

**Functions**

In [2]:
def attributes_by_gender(df, attributes):
    ''' Average each requested attribute per gender, counting every candidate once.

        Parameters
        -----------
            df : [DataFrame] : Source frame; must contain 'iid', 'wave', 'gender' and every column named in "attributes"
            attributes : [List] : Column names of the attributes to average

        Output
        -----------
            DataFrame with one row per gender : a 'gender' column followed by the mean of each column in "attributes" '''

    selected_columns = ['iid', 'wave', 'gender'] + attributes

    # The source frame repeats a candidate's answers on several rows: collapse identical rows,
    # then discard any row that still has a missing value in one of the selected columns.
    per_candidate = df[selected_columns].drop_duplicates().dropna()

    return (
        per_candidate[['gender'] + attributes]
        .groupby('gender')
        .mean()
        .reset_index()
    )

### 1. Load Data

In [3]:
# Directory containing the raw input CSV (relative path, used when loading SpeedDating.csv)
path = '../data'

In [4]:
# Load the speed-dating dataset (file is decoded as latin-1) and preview its first rows
df = pd.read_csv(f'{path}/SpeedDating.csv', encoding='latin1')
df.head()

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,1,1.0,0,1,1,1,10,7,,4,...,5.0,7.0,7.0,7.0,7.0,,,,,
1,1,1.0,0,1,1,1,10,7,,3,...,5.0,7.0,7.0,7.0,7.0,,,,,
2,1,1.0,0,1,1,1,10,7,,10,...,5.0,7.0,7.0,7.0,7.0,,,,,
3,1,1.0,0,1,1,1,10,7,,5,...,5.0,7.0,7.0,7.0,7.0,,,,,
4,1,1.0,0,1,1,1,10,7,,7,...,5.0,7.0,7.0,7.0,7.0,,,,,


### 2. Preprocessing

Data must respect a specific shape to be processed into **radar chart** in javascript.  
Below, we compute the mean values of the following attributes:  
* `attr` : Attractiveness  
* `sinc` : Sincerity  
* `intel` : Intelligence  
* `fun` : Fun  
* `amb` : Ambition  

These values are computed at 3 different steps in the Speed Dating process : **before**, **during** and **after** it.  

Candidates answer four questions; for each question, they assign a value to every attribute. Refer to the [official documentation](https://perso.telecom-paristech.fr/eagan/class/igr204/data/SpeedDatingKey.pdf) for more information about these questions.  
In the table below, these questions are encoded from 1 to 4.  

**Note**: The `question` column in the result table is encoded as a two-digit value made of:  
* first digit : number of question (from 1 to 4)  
* second digit : time at which question was asked (from 1 to 3)

In [14]:
key = ['iid', 'wave', 'gender']  # Candidate identifier columns (not referenced by the visible cells)
name_attributes = ['attr', 'sinc', 'intel', 'fun', 'amb'] # Names of attributes to deal with
num_times = 3      # Time steps at which the questions were asked
num_questions = 4  # Number of questions

# Partial results are collected in a list and concatenated once after the loop,
# instead of re-copying an ever-growing DataFrame on every iteration.
radar_frames = []

for time in range(num_times):

    for question in range(num_questions):

        # Source columns are named "<attribute><question>_<time>", e.g. "attr1_1"
        list_attributes = [f'{attr}{question+1}_{time+1}' for attr in name_attributes]

        # Two-digit code: question number (first digit) followed by time step (second digit)
        question_code = int(f'{question+1}{time+1}')

        # Preprocess data : computes mean of attributes by gender
        df_res = attributes_by_gender(df, list_attributes)

        # Preprocess data : "total" row = plain average of the per-gender rows.
        # The gender column is averaged as well, so with genders 0 and 1 the total row carries gender 0.5.
        df_res_tot = pd.DataFrame(df_res.mean(axis=0)).T

        # Add information about question and time in dataframes
        df_res['question'] = question_code
        df_res_tot['question'] = question_code

        # Replace the question/time specific column names by the generic attribute names
        df_res.columns = ['gender'] + name_attributes + ['question']
        df_res_tot.columns = ['gender'] + name_attributes + ['question']

        radar_frames.extend([df_res, df_res_tot])

# Single concatenation; each partial frame keeps its own index (0, 1 for genders, 0 for the total row)
df_radar = pd.concat(radar_frames, axis=0)

In [15]:
# Preview the first rows of the assembled radar table
df_radar.head()

Unnamed: 0,gender,attr,sinc,intel,fun,amb,question
0,0.0,18.020372,18.22223,20.971004,17.299108,12.818476,11
1,1.0,27.008864,16.389707,19.41956,17.592051,8.823956,11
0,0.5,22.514618,17.305969,20.195282,17.44558,10.821216,11
0,0.0,35.600632,11.284535,12.478439,19.051636,9.114387,21
1,1.0,24.884526,15.108467,16.35427,18.044416,14.357482,21


### 3. Save data

In [None]:
# Output directory for the radar chart CSV.
# NOTE(review): the flower chart cells further down write to '../webapp/static/preprocessed_data' instead;
# confirm which directory the web app actually reads from.
preprocessed_path = '../webapp/preprocessed_data'

In [None]:
# Export the radar table. index=False is not passed, so the DataFrame index is written as an extra first column.
# NOTE(review): the flower exports below use index=False; confirm the radar scripts expect this extra column.
df_radar.to_csv(f'{preprocessed_path}/radar_all.csv')

# Flower Chart - Data generation

In [5]:
# Columns used by the flower chart: gender, age, and the "1_1" and "4_1" answers for six attributes
flower_columns = [
    'gender', 'age',
    'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1',
    'attr4_1', 'sinc4_1', 'intel4_1', 'fun4_1', 'amb4_1', 'shar4_1',
]
df_temp_flower = df[flower_columns]

# Keep only the rows that have a value in every selected column, renumbered from 0
df_flower = df_temp_flower.dropna(axis=0).reset_index(drop=True)

# Report the share of rows that survived the filtering
kept_ratio = len(df_flower) / len(df_temp_flower)
print(f'Nous gardons les lignes qui ont des données sur l\'ensemble des colonnes listées: {kept_ratio:.0%}')

Nous gardons les lignes qui ont des données sur l'ensemble des colonnes listées: 77%


In [6]:
# Display the filtered flower table (before the attr4 normalisation)
df_flower

Unnamed: 0,gender,age,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr4_1,sinc4_1,intel4_1,fun4_1,amb4_1,shar4_1
0,0,27.0,16.67,16.67,16.67,16.67,16.67,16.67,10.0,7.0,7.0,7.0,5.0,7.0
1,0,27.0,16.67,16.67,16.67,16.67,16.67,16.67,10.0,7.0,7.0,7.0,5.0,7.0
2,0,27.0,16.67,16.67,16.67,16.67,16.67,16.67,10.0,7.0,7.0,7.0,5.0,7.0
3,0,27.0,16.67,16.67,16.67,16.67,16.67,16.67,10.0,7.0,7.0,7.0,5.0,7.0
4,0,27.0,16.67,16.67,16.67,16.67,16.67,16.67,10.0,7.0,7.0,7.0,5.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6440,1,25.0,70.00,0.00,15.00,15.00,0.00,0.00,90.0,0.0,0.0,0.0,0.0,10.0
6441,1,25.0,70.00,0.00,15.00,15.00,0.00,0.00,90.0,0.0,0.0,0.0,0.0,10.0
6442,1,25.0,70.00,0.00,15.00,15.00,0.00,0.00,90.0,0.0,0.0,0.0,0.0,10.0
6443,1,25.0,70.00,0.00,15.00,15.00,0.00,0.00,90.0,0.0,0.0,0.0,0.0,10.0


In [7]:
# Rescale the "4_1" answers of every row so that they add up to 100
cols_attr4 = ['attr4_1','sinc4_1', 'intel4_1', 'fun4_1', 'amb4_1', 'shar4_1']

# Row totals, computed once before any value is modified
sum_attr4 = df_flower[cols_attr4].sum(axis="columns")

# Rows already summing to 100 are left untouched. Rows summing to 0 cannot be rescaled
# (0/0 would produce NaN), so they are left unchanged as well.
rows_to_rescale = (sum_attr4 != 100) & (sum_attr4 != 0)

# Vectorised equivalent of round(value / row_total * 100, 2) applied to each selected cell
df_flower.loc[rows_to_rescale, cols_attr4] = (
    df_flower.loc[rows_to_rescale, cols_attr4]
    .div(sum_attr4[rows_to_rescale], axis="index")
    .mul(100)
    .round(2)
)

In [8]:
# Sanity check: average of the per-row totals of the attr4 columns (should be close to 100 after rescaling;
# small deviations come from rounding each value to 2 decimals)
df_flower[cols_attr4].sum(axis="columns").sum()/len(df_flower)

100.0009449185415

In [9]:
# Display the flower table again, now with the rescaled attr4 columns
df_flower

Unnamed: 0,gender,age,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr4_1,sinc4_1,intel4_1,fun4_1,amb4_1,shar4_1
0,0,27.0,16.67,16.67,16.67,16.67,16.67,16.67,23.26,16.28,16.28,16.28,11.63,16.28
1,0,27.0,16.67,16.67,16.67,16.67,16.67,16.67,23.26,16.28,16.28,16.28,11.63,16.28
2,0,27.0,16.67,16.67,16.67,16.67,16.67,16.67,23.26,16.28,16.28,16.28,11.63,16.28
3,0,27.0,16.67,16.67,16.67,16.67,16.67,16.67,23.26,16.28,16.28,16.28,11.63,16.28
4,0,27.0,16.67,16.67,16.67,16.67,16.67,16.67,23.26,16.28,16.28,16.28,11.63,16.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6440,1,25.0,70.00,0.00,15.00,15.00,0.00,0.00,90.00,0.00,0.00,0.00,0.00,10.00
6441,1,25.0,70.00,0.00,15.00,15.00,0.00,0.00,90.00,0.00,0.00,0.00,0.00,10.00
6442,1,25.0,70.00,0.00,15.00,15.00,0.00,0.00,90.00,0.00,0.00,0.00,0.00,10.00
6443,1,25.0,70.00,0.00,15.00,15.00,0.00,0.00,90.00,0.00,0.00,0.00,0.00,10.00


In [10]:
# Aggregate the flower table per (gender, age) pair
flower_groups = df_flower.groupby(["gender", "age"])

# Mean of every remaining column within each group
df_flower_ag = flower_groups.mean()

# Number of non-null 'attr1_1' values in each group
df_flower_ag["nb_instances"] = flower_groups["attr1_1"].count()

# Turn the (gender, age) index back into regular columns
df_flower_ag = df_flower_ag.reset_index()
df_flower_ag

Unnamed: 0,gender,age,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr4_1,sinc4_1,intel4_1,fun4_1,amb4_1,shar4_1,nb_instances
0,0,21.0,26.153846,22.153846,17.538462,15.384615,7.384615,11.384615,46.307692,7.384615,11.384615,12.846154,11.769231,10.307692,65
1,0,22.0,15.431159,18.359783,19.624275,18.116304,13.789855,14.677899,21.822464,16.446667,16.491594,17.98058,13.901232,13.358551,276
2,0,23.0,18.132997,17.314409,19.695389,19.606916,13.112104,12.772622,27.907839,12.617896,15.123026,17.733689,12.366945,14.255159,347
3,0,24.0,19.925498,19.095448,19.576517,16.57005,13.904502,10.927985,29.987662,13.486468,14.547413,16.847463,10.756244,14.374279,402
4,0,25.0,15.950954,18.80545,21.084469,18.722071,12.784741,12.653406,23.718801,13.340054,15.531335,19.182016,15.010899,13.216894,367
5,0,26.0,18.680024,17.207573,20.427112,17.480752,12.719102,13.220558,28.412063,11.826626,16.001869,16.240218,16.115947,11.403811,412
6,0,27.0,18.48855,18.159849,18.310906,17.290363,13.294592,14.456647,27.06435,13.796858,14.776073,18.000846,13.5729,12.793233,331
7,0,28.0,15.406806,19.204861,23.586181,16.564514,13.066111,12.170833,32.423958,10.145764,13.580625,16.260903,15.714167,11.878472,288
8,0,29.0,20.913208,21.781761,21.645912,14.420755,8.82956,12.410692,23.12327,16.358491,16.200629,15.119497,14.111321,15.085535,159
9,0,30.0,15.387006,18.620339,20.741243,15.972881,11.862712,16.173446,22.690508,15.782147,16.672542,14.956045,17.358983,12.54,177


In [15]:
# Output directory for the flower chart CSV files (rebinds the name used earlier for the radar export)
preprocessed_path = '../webapp/static/preprocessed_data'

In [16]:
# Export the row-level flower table; index=False leaves the DataFrame index out of the file
df_flower.to_csv(f'{preprocessed_path}/flower_data.csv', index=False)

In [17]:
# Export the table aggregated by gender and age; index=False leaves the DataFrame index out of the file
df_flower_ag.to_csv(f'{preprocessed_path}/flower_data_aggregated.csv', index=False)