# Test de segmentation des cartes selon le profil du répondant

**Important** nécessite le fichier des données personnelles qui n'est _pas_ versionné dans le dépôt.

In [1]:
from IPython.display import display
from anonymizer import get_data, PERSONAL_DATA_FILE

# les data
df = get_data(PERSONAL_DATA_FILE, drop=True) 

In [2]:
df.columns

Index(['date', 'commune_enquete', 'travail_mine', 'famille_mine',
       'habitant_nc', 'commune_residence', 'commune_miniere',
       'habite_ailleurs_nc', 'genre', 'age_interval',
       'duree_residence_interval', 'duree_presence_nc_interval',
       'duree_travail_mine_interval'],
      dtype='object')

In [3]:
# df.groupby(["travail_mine", "famille_mine"]).count()
df.value_counts(subset=["travail_mine", "famille_mine"], sort = False)

travail_mine  famille_mine
False         False           115
              True            232
True          False            11
              True             46
dtype: int64

In [4]:
# df.groupby(["age_interval"]).count()
df.value_counts(subset=["age_interval"], sort = False)

age_interval
[10.0, 20.0)    158
[20.0, 30.0)    152
[30.0, 40.0)     20
[40.0, 50.0)     23
[50.0, 60.0)     34
[60.0, 70.0)     10
[70.0, 80.0)      4
[80.0, 90.0)      2
dtype: int64

In [5]:
partition = { True : list(df[df.travail_mine].index), False: list(df[~ df.travail_mine].index) }
print(partition)
for key, vals in partition.items():
    print(key, len(vals))

{True: [5, 8, 17, 19, 20, 23, 26, 27, 29, 30, 31, 32, 33, 34, 37, 38, 39, 42, 46, 47, 53, 55, 57, 58, 61, 74, 103, 106, 150, 184, 191, 230, 246, 256, 283, 289, 292, 294, 301, 302, 333, 336, 356, 357, 369, 371, 372, 374, 385, 389, 390, 391, 392, 393, 399, 401, 408], False: [1, 2, 4, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 24, 25, 28, 35, 36, 41, 43, 44, 48, 49, 50, 51, 52, 54, 56, 59, 60, 62, 63, 65, 67, 68, 69, 70, 71, 72, 73, 76, 77, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 104, 105, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 185, 186, 187, 188, 189, 190, 192, 193, 194, 195, 196, 197, 198, 199, 200, 

In [6]:
from cog_maps import CogMaps, THESAURUS_FILENAME, WEIGHTS_MAP_FILENAME, CM_LA_MINE_FILENAME

thesaurus = CogMaps.load_thesaurus_map(THESAURUS_FILENAME)
weights = CogMaps.load_weights(WEIGHTS_MAP_FILENAME)

mine_map = CogMaps(CM_LA_MINE_FILENAME, predicate=lambda i : i in partition[True])
print(len(mine_map))

non_mine_map = CogMaps(CM_LA_MINE_FILENAME, predicate=lambda i : i in partition[False])
print(len(non_mine_map))

57
347


In [7]:
display(weights.keys())

dict_keys(['arithmetique', 'inverse', 'pos_1', 'pos_3', 'pos_6', 'pos_3_arith', 'pos_6_arith', 'exponentielle'])

In [8]:
all_mine_maps, _ = mine_map.apply_many(thesaurus, with_unknown=False)
all_non_mine_maps, _ = non_mine_map.apply_many(thesaurus, with_unknown=False)
print(all_mine_maps.keys())
print(all_non_mine_maps.keys())

CHOSEN_WEIGHTS = "inverse"
for name, a_map in all_mine_maps.items():
    a_map.weights = weights[CHOSEN_WEIGHTS]
for name, a_map in all_non_mine_maps.items():
    a_map.weights = weights[CHOSEN_WEIGHTS]


dict_keys(['base', 'concept', 'mother', 'gd_mother'])
dict_keys(['base', 'concept', 'mother', 'gd_mother'])


In [13]:
from operator import itemgetter 

CHOSEN_LEVEL = "mother"
mine_mother = all_mine_maps[CHOSEN_LEVEL]
non_mine_mother = all_non_mine_maps[CHOSEN_LEVEL]

mine_frequence = [(w, round(100*n/len(mine_mother))) for w,n in mine_mother.occurrences.items()]
non_mine_frequence = [(w, round(100*n/len(non_mine_mother))) for w,n in non_mine_mother.occurrences.items()]


display("Pour ceux qui travaillent à la mine (en % de citation pondéré par la position)")
display(sorted(mine_frequence, key=itemgetter(1), reverse = True)[:10])
display("Pour ceux qui NE travaillent PAS à la mine (en % de citation pondéré par la position)")
display(sorted(non_mine_frequence, key=itemgetter(1), reverse = True)[:10])

'Pour ceux qui travaillent à la mine (en % de citation pondéré par la position)'

[('impact environnemental', 68),
 ('pollution', 53),
 ('emploi', 51),
 ('nickel', 44),
 ('économie du pays', 37),
 ('revenu', 26),
 ('engin', 25),
 ('développement', 23),
 ('terre', 23),
 ('richesse', 19)]

'Pour ceux qui NE travaillent PAS à la mine (en % de citation pondéré par la position)'

[('impact environnemental', 62),
 ('emploi', 50),
 ('nickel', 49),
 ('pollution', 42),
 ('minerai', 35),
 ('usine', 31),
 ('économie du pays', 30),
 ('exploitation', 29),
 ('engin', 25),
 ('opérateur minier', 23)]

In [10]:
from functools import partial
from graphize import cog_map_to_graph
from draw_graphviz import draw_graphviz
import networkx as nx

draw = partial(
    draw_graphviz,
    algorithm="sfdp",
    sep=0.01,
    fontsize=12,  # "proportional",
    node_color="weight",
    min_edge_penwidths=2,
    max_edge_penwidths=12,
    min_node_size=0.02,
    max_node_size=1,
)



In [11]:
a_mine_graph = cog_map_to_graph(all_mine_maps["mother"], threshold=2.0)
draw(a_mine_graph, "mine_test.svg")

a_non_mine_graph = cog_map_to_graph(all_non_mine_maps["mother"], threshold=2.0)
draw(a_non_mine_graph, "non_mine_test.svg")


