In [1]:
import pandas as pd

from scipy.sparse import lil_matrix
from collections import Counter
from voyager import Index, Space

import sklearn
import networkx as nx

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
url_census = 'https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/census.csv'
url_cis = 'https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/cis.csv'

In [3]:
#wczytanie danych
census = pd.read_csv(url_census)
cis = pd.read_csv(url_cis)

In [4]:
census.head()

Unnamed: 0,person_id,pername1,pername2,sex,dob_day,dob_mon,dob_year,enumcap,enumpc
0,DE03US001001,COUIE,PRICE,M,1.0,6,1960.0,1 WINDSOR ROAD,DE03US
1,DE03US001002,ABBIE,PVICE,F,9.0,11,1961.0,1 WINDSOR ROAD,DE03US
2,DE03US001003,LACEY,PRICE,F,7.0,2,1999.0,1 WINDSOR ROAD,DE03US
3,DE03US001004,SAMUEL,PRICE,M,13.0,4,1990.0,1 WINDSOR ROAD,DE03US
4,DE03US001005,JOSEPH,PRICE,M,20.0,4,1986.0,1 WINDSOR ROAD,DE03US


In [5]:
#Utworzenie pliku z parami za pomocą person_id
cis.reset_index(inplace=True)
census.reset_index(inplace=True)
merged_df = pd.merge(census, cis, on='person_id', suffixes=('_census', '_cis'))
tp = merged_df[['index_census', 'index_cis']]
print(tp)

     index_census  index_cis
0              19      21255
1              34       9523
2              43       6753
3              80      17311
4              97      12385
..            ...        ...
966         25277      16237
967         25283      10250
968         25308      15945
969         25318       6675
970         25322      14776

[971 rows x 2 columns]


In [6]:
#definiujemy funcję do złączenia wartości ze wszystkich kolumn
def concatenate_columns(row, dataframe):
    concatenated_string = ""
    for i in range(1, len(dataframe.columns)):
        value = row[i]
        if isinstance(value, str):
            value = value.lower()
            value = value.replace(" ", "")
        elif pd.isnull(value) or value == "null" or value == "nan":
            continue
        concatenated_string += str(value)
    return concatenated_string

In [7]:
#tworzymy puste kolumny
census['allInfo'] = None
cis['allInfo'] = None

In [8]:
#wypełniamy kolumny danymi
for i in range (0, len(census)):
    census.at[i, 'allInfo'] = concatenate_columns(census.iloc[i], census)

In [9]:
for i in range (0, len(cis)):
    cis.at[i, 'allInfo'] = concatenate_columns(cis.iloc[i], cis)

In [10]:
#definiujemy funkcję do generowania shingli
def generate_shingles(text, length):
    shingles = set()
    for i in range(len(text) - length + 1):
        shingle = text[i:i+length]
        shingles.add(shingle)
    return shingles

In [11]:
#ilość wierszy w pliku
n = census.shape[0]
n

25343

In [12]:
#generujemy shingle dla pliku census
all_shingles_census_set = set()
for i, row in census.iterrows():
    all_shingles_census_set |= generate_shingles(row['allInfo'], 2)

print(len(all_shingles_census_set))

1161


In [13]:
#rzutujemy zbiór na listę
all_shingles_census = list(all_shingles_census_set)
all_shingles_census_len = len(all_shingles_census)
print(all_shingles_census_len)

1161


In [14]:
all_shingles_census

['5i',
 'sx',
 'om',
 'ov',
 '05',
 'za',
 'tn',
 'hv',
 'kt',
 'lf',
 '1d',
 '3z',
 '9d',
 'w4',
 '6p',
 'hi',
 '1h',
 'ap',
 'wn',
 'b8',
 'fm',
 '2b',
 '56',
 '9a',
 'jt',
 '7z',
 'ol',
 'ox',
 'eh',
 'nh',
 '9u',
 'e-',
 're',
 'sp',
 'ml',
 'w7',
 'xo',
 'rx',
 '1r',
 '3s',
 'bs',
 'n1',
 'mj',
 'hl',
 '4v',
 'ju',
 'sz',
 '5s',
 '94',
 '1e',
 't2',
 'g9',
 'm8',
 '8r',
 'ku',
 'rm',
 'y2',
 'cc',
 'bb',
 'yq',
 'rb',
 '0m',
 '6z',
 '8t',
 'fp',
 'lv',
 'rp',
 '4m',
 'x0',
 'ez',
 'wi',
 '23',
 'tj',
 'l2',
 'zb',
 'ly',
 'y0',
 '01',
 'x3',
 'lk',
 'mi',
 'rl',
 'ug',
 'qq',
 'ot',
 'jx',
 'ze',
 'g2',
 'iw',
 'c1',
 'n8',
 'no',
 'pq',
 'mv',
 '30',
 '91',
 '74',
 '8n',
 'dp',
 'pe',
 'mr',
 '9y',
 'jj',
 '1b',
 '17',
 's0',
 '9.',
 'i-',
 'ao',
 'f7',
 '3p',
 'ig',
 'fk',
 'xq',
 'xr',
 'bn',
 'cu',
 '8f',
 '2q',
 '3q',
 '8q',
 '8g',
 'w6',
 'k2',
 'k1',
 'gm',
 'us',
 'fu',
 'pl',
 'ya',
 'dr',
 'xx',
 '5o',
 '8w',
 'q0',
 '87',
 '1v',
 '0l',
 'ru',
 'ei',
 'm9',
 '1c',
 '3h',

In [15]:
#tworzymy rzadką macierz wystąpień dla filtered_census
occurrence_matrix_census = lil_matrix((n, all_shingles_census_len), dtype=int)


for i, row in census.iterrows():
    shingles_in_text = generate_shingles(row['allInfo'], 2)
    shingle_counts = Counter(shingles_in_text)
    for j, shingle in enumerate(all_shingles_census):
        if shingle in shingle_counts:
            occurrence_matrix_census[i, j] = shingle_counts[shingle]

sparse_matrix_census = occurrence_matrix_census.tocsr()

In [16]:
#tworzymy rzadką macierz wystąpień dla cis
occurrence_matrix_cis = lil_matrix((len(cis), all_shingles_census_len), dtype=int)
all_shingles_list = list(all_shingles_census)
for i, row in cis.iterrows():
    shingles_in_text = generate_shingles(row['allInfo'], 2)
    shingle_counts = Counter(shingles_in_text)
    for j, shingle in enumerate(all_shingles_census[:all_shingles_census_len]):
        if shingle in shingles_in_text:

            occurrence_count = shingle_counts.get(shingle, 0)

            occurrence_matrix_cis[i, j] = occurrence_count

sparse_matrix_cis = occurrence_matrix_cis.tocsr()

In [17]:
sparse_matrix_census.shape, sparse_matrix_cis.shape

((25343, 1161), (24613, 1161))

In [19]:
#tworzymy index voyagera dla cis
index_cis = Index(Space.Cosine, num_dimensions=all_shingles_census_len)

for i in range(len(cis)):
    index_cis.add_item(sparse_matrix_cis[i].toarray().flatten())

In [21]:
#sprawdzenie czy działa
neighbors, distances = index_cis.query(sparse_matrix_census[1].toarray().flatten(), k=3)

print(neighbors)
print(distances)

[ 8583  8151 20589]
[0.11501491 0.30677998 0.32324684]


In [22]:
print(cis.iloc[8583])
print(census.iloc[1])

index                                            8583
person_id                                         NaN
pername1                                        ABBIE
pername2                                        PRICE
sex                                                 F
dob_day                                           9.0
dob_mon                                            11
dob_year                                       1961.0
enumcap                                1 WINDSOR ROAD
enumpc                                         DE03US
allInfo      abbiepricef9.0111961.01windsorroadde03us
Name: 8583, dtype: object
index                                                        1
person_id                                         DE03US001002
pername1                                                 ABBIE
pername2                                                 PVICE
sex                                                          F
dob_day                                                    9.0
do

In [23]:
#tworzymy csv z parami rekord - najbliższy sąsiad
pairs = []

for i in range(sparse_matrix_census.shape[0]):
    query_vector_census = sparse_matrix_census[i].toarray().flatten()
    neighbor, distance = index_cis.query(query_vector_census, k=50)
    index_neighbor = neighbor[0]
    index = f'A{i}'
    nearest_neighbor = f'B{str(index_neighbor).replace("[", "").replace("]", "")}'
    distance = str(distance[0]).replace("[", "").replace("]", "")
    fdistance = float(distance)
    pairs.append((index, nearest_neighbor, fdistance))

# Konwertuj listę tupli na DataFrame
df = pd.DataFrame(pairs, columns=['index_census', 'index_cis', 'Distance'])
df.head()


Unnamed: 0,index_census,index_cis,Distance
0,A0,B8151,0.075707
1,A1,B8583,0.115015
2,A2,B20589,0.065919
3,A3,B18455,0.110425
4,A4,B17256,0.095136


In [24]:
Gr = nx.Graph()

for i in range(len(census)):
    Gr.add_node(f'A{i}', record=census.iloc[[i]])

for j in range(len(cis)):
    Gr.add_node(f'B{j}', record=cis.iloc[[j]])


for _, row in df.iterrows():
    if row['Distance'] < 0.4:
        Gr.add_edges_from([(row["index_census"], row["index_cis"])])

In [25]:
found_pairs = [component for component in nx.connected_components(Gr) if len(component) > 1 ]
len(found_pairs)

23789

In [26]:
found_pairs

[{'A0', 'B8151'},
 {'A1', 'B8583'},
 {'A2', 'B20589'},
 {'A3', 'B18455'},
 {'A4', 'B17256'},
 {'A5', 'B19867'},
 {'A6', 'B11182'},
 {'A34', 'A7', 'B9523'},
 {'A9', 'B9369'},
 {'A10', 'B7246'},
 {'A11', 'B10621'},
 {'A12', 'B5851'},
 {'A13', 'B3911'},
 {'A14', 'B20886'},
 {'A15', 'B6410'},
 {'A16', 'B17805'},
 {'A17', 'B13508'},
 {'A18', 'A21', 'B13588'},
 {'A19', 'B21255'},
 {'A20', 'B20764'},
 {'A22', 'B5848'},
 {'A23', 'B6038'},
 {'A24', 'B12662'},
 {'A25', 'B23168'},
 {'A26', 'B13402'},
 {'A27', 'B8172'},
 {'A28', 'B14057'},
 {'A29', 'B11612'},
 {'A30', 'B18890'},
 {'A31', 'B6089'},
 {'A32', 'B20850'},
 {'A33', 'B9826'},
 {'A35', 'B21665'},
 {'A36', 'B16805'},
 {'A37', 'B16537'},
 {'A38', 'B24166'},
 {'A39', 'B24085'},
 {'A40', 'B1722'},
 {'A41', 'B6607'},
 {'A42', 'B833'},
 {'A43', 'B6753'},
 {'A44', 'B2955'},
 {'A45', 'B2836'},
 {'A46', 'B4242'},
 {'A47', 'B9221'},
 {'A48', 'B22329'},
 {'A49', 'B5323'},
 {'A50', 'B5128'},
 {'A51', 'B12941'},
 {'A53', 'A54', 'B2718'},
 {'A55', 'B17

In [29]:
#modyfikujemy wartości w pliku tp do formatu A1, A2, B1, B2 itd
tp['index_census'] = tp['index_census'].apply(lambda x: f'A{x}')
tp['index_cis'] = tp['index_cis'].apply(lambda y: f'B{y}')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tp['index_census'] = tp['index_census'].apply(lambda x: f'A{x}')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tp['index_cis'] = tp['index_cis'].apply(lambda y: f'B{y}')


In [30]:
tp

Unnamed: 0,index_census,index_cis,Distance
0,A19,B21255,3.576279e-07
1,A34,B9523,1.489362e-01
2,A43,B6753,8.228964e-02
3,A80,B17311,6.794083e-02
4,A97,B12385,1.043311e-01
...,...,...,...
966,A25277,B16237,0.000000e+00
967,A25283,B10250,4.444444e-02
968,A25308,B15945,4.878080e-02
969,A25318,B6675,8.888888e-02


In [31]:
#filtrujemy zbiór df w celu zachowania jedynie par, których indeksy dla pliku census pojawiają się w pliku tp (na potrzeby testu jakości)
tp_ids = tp['index_census'].unique()

# Filtruj DataFrame census, aby zachować tylko te rekordy, których 'id' znajduje się w tp_ids
filtered_pairs = df[df['index_census'].isin(tp_ids)]



tp_ids

array(['A19', 'A34', 'A43', 'A80', 'A97', 'A125', 'A133', 'A185', 'A203',
       'A207', 'A232', 'A252', 'A289', 'A337', 'A364', 'A372', 'A373',
       'A391', 'A402', 'A438', 'A451', 'A514', 'A515', 'A548', 'A553',
       'A554', 'A598', 'A703', 'A714', 'A757', 'A786', 'A794', 'A812',
       'A858', 'A866', 'A869', 'A881', 'A961', 'A978', 'A994', 'A1026',
       'A1034', 'A1056', 'A1075', 'A1076', 'A1091', 'A1130', 'A1137',
       'A1198', 'A1260', 'A1340', 'A1380', 'A1400', 'A1415', 'A1446',
       'A1515', 'A1541', 'A1555', 'A1577', 'A1597', 'A1601', 'A1626',
       'A1629', 'A1649', 'A1673', 'A1721', 'A1741', 'A1742', 'A1744',
       'A1787', 'A1813', 'A1842', 'A1862', 'A1903', 'A1910', 'A1962',
       'A1969', 'A1978', 'A1980', 'A2018', 'A2031', 'A2061', 'A2071',
       'A2177', 'A2206', 'A2287', 'A2298', 'A2343', 'A2360', 'A2409',
       'A2439', 'A2463', 'A2483', 'A2496', 'A2564', 'A2575', 'A2587',
       'A2621', 'A2632', 'A2652', 'A2667', 'A2719', 'A2739', 'A2741',
       'A28

In [90]:
len(tp_ids)

971

In [39]:
#tworzymy graf zgodnie z dopasowaniami
G = nx.Graph()

# Dodaj węzły z obu zbiorów danych
for i in range(len(census)):
    G.add_node(f'A{i}', record=census.iloc[i])

for j in range(len(cis)):
    G.add_node(f'B{j}', record=cis.iloc[j])

# Dodaj krawędzie na podstawie najbliższych sąsiadów
for _, row in filtered_pairs.iterrows():
    G.add_edges_from([(row["index_census"], row["index_cis"])])



In [40]:
#tworzymy i wyświetlamy listę klastrów
clusterspp = list(nx.connected_components(G))
clusterspp = [component for component in nx.connected_components(G) if len(component) > 1]



# Wyświetlenie informacji o klastrach
#print(f"Liczba klastrów: {len(clusterspp)}")
#for i, cluster in enumerate(clusterspp):
#    print(f"Klaster {i+1}: {cluster}")

In [41]:
clusterspp

[{'A19', 'B21255'},
 {'A34', 'B9523'},
 {'A43', 'B6753'},
 {'A80', 'B17311'},
 {'A97', 'B12385'},
 {'A125', 'B11308'},
 {'A133', 'B8237'},
 {'A185', 'B16414'},
 {'A203', 'B404'},
 {'A207', 'B1982'},
 {'A232', 'B14635'},
 {'A252', 'B7606'},
 {'A289', 'B11063'},
 {'A337', 'B9911'},
 {'A364', 'B4000'},
 {'A372', 'B341'},
 {'A373', 'B14328'},
 {'A391', 'B18684'},
 {'A402', 'B17029'},
 {'A438', 'B8034'},
 {'A451', 'B20852'},
 {'A514', 'B21348'},
 {'A515', 'B22078'},
 {'A548', 'B155'},
 {'A553', 'B4313'},
 {'A554', 'B899'},
 {'A598', 'B4277'},
 {'A703', 'B24040'},
 {'A714', 'B23312'},
 {'A757', 'B18973'},
 {'A786', 'B16168'},
 {'A794', 'B6417'},
 {'A812', 'B13800'},
 {'A858', 'B24413'},
 {'A866', 'B15622'},
 {'A869', 'B23062'},
 {'A881', 'B17571'},
 {'A961', 'B16288'},
 {'A978', 'B9459'},
 {'A994', 'B23788'},
 {'A1026', 'B14943'},
 {'A1034', 'B23311'},
 {'A1056', 'B24149'},
 {'A1075', 'B69'},
 {'A1076', 'B7609'},
 {'A1091', 'B12459'},
 {'A1130', 'B11070'},
 {'A1137', 'B984'},
 {'A1198', 'B13

In [42]:
#tworzymy graf zgodnie z dopasowaniami
G2 = nx.Graph()

# Dodaj węzły z obu zbiorów danych
for i in range(len(census)):
    G2.add_node(f'A{i}', record=census.iloc[i])

for j in range(len(cis)):
    G2.add_node(f'B{j}', record=cis.iloc[j])

# Dodaj krawędzie na podstawie najbliższych sąsiadów
for _, row in tp.iterrows():
    G2.add_edges_from([(row["index_census"], row["index_cis"])])



In [43]:
#tworzymy graf zgodnie z dopasowaniami
clusterstp = list(nx.connected_components(G2))
clusterstp = [component for component in nx.connected_components(G2) if len(component) > 1]

## Wyświetlenie informacji o klastrach
#print(f"Liczba klastrów: {len(clusterstp)}")
#for i, cluster in enumerate(clusterstp):
#    print(f"Klaster {i+1}: {cluster}")

In [85]:
#definiujemy funkcję zamieniającą wartości klastrów na identyfikatory
def clusters_to_labels(clusters, node_list, start_label=1):
    labels = [0] * len(node_list)
    for cluster_id, cluster in enumerate(clusters, start=start_label):
        for node in cluster:
            labels[node_list.index(node)] = cluster_id
    return labels

In [46]:
#sortujemy
all_nodes = sorted({node for cluster in clusterstp + clusterspp for node in cluster})

In [86]:
#tworzymy zbiory dla miar jakości
labels_true = clusters_to_labels(clusterstp, all_nodes)
labels_pred = clusters_to_labels(clusterspp, all_nodes)

In [71]:
labels_true

[386,
 387,
 388,
 389,
 390,
 391,
 392,
 393,
 394,
 395,
 41,
 396,
 42,
 397,
 398,
 399,
 400,
 401,
 402,
 403,
 404,
 43,
 405,
 406,
 407,
 408,
 409,
 410,
 44,
 45,
 411,
 412,
 413,
 414,
 46,
 415,
 416,
 417,
 418,
 419,
 420,
 421,
 422,
 423,
 424,
 425,
 426,
 47,
 427,
 428,
 429,
 430,
 48,
 431,
 432,
 433,
 434,
 435,
 436,
 437,
 438,
 439,
 440,
 441,
 442,
 443,
 444,
 445,
 446,
 447,
 448,
 449,
 450,
 451,
 452,
 453,
 454,
 49,
 455,
 456,
 457,
 458,
 459,
 460,
 461,
 462,
 463,
 464,
 465,
 466,
 467,
 468,
 469,
 470,
 471,
 472,
 473,
 474,
 6,
 475,
 476,
 477,
 478,
 479,
 50,
 480,
 481,
 482,
 483,
 484,
 485,
 486,
 487,
 488,
 489,
 490,
 491,
 492,
 493,
 494,
 495,
 496,
 497,
 498,
 499,
 500,
 501,
 502,
 503,
 504,
 505,
 506,
 507,
 508,
 7,
 509,
 510,
 511,
 512,
 513,
 51,
 514,
 515,
 516,
 517,
 518,
 519,
 520,
 52,
 521,
 522,
 523,
 524,
 525,
 526,
 53,
 527,
 528,
 529,
 530,
 531,
 532,
 533,
 54,
 534,
 535,
 536,
 537,
 538,
 539

In [87]:
#przykład działania funkcji clusters_to_labels na prostszych danych

data = [{1, 3}, {5, 4}, {2, 6}]
data2 = [{1, 3}, {2, 4}, {7, 5}]

all_data = sorted({node for cluster in data + data2 for node in cluster})
print(clusters_to_labels(data, all_data))
print(clusters_to_labels(data2, all_data))

[1, 3, 1, 2, 2, 3, 0]
[1, 2, 1, 2, 3, 0, 3]


In [48]:
#obliczamy różne miary jakości

sklearn.metrics.f1_score(labels_true, labels_pred, average='weighted')


0.9938398357289527

In [49]:
sklearn.metrics.precision_score(labels_true, labels_pred, average='weighted')

0.9938398357289527

In [50]:
sklearn.metrics.recall_score(labels_true, labels_pred, average='weighted')

0.9938398357289527

In [51]:

sklearn.metrics.homogeneity_score(labels_true, labels_pred)


0.9985768206297091

In [52]:

sklearn.metrics.completeness_score(labels_true, labels_pred)



0.9985768206297091

In [53]:

sklearn.metrics.v_measure_score(labels_true, labels_pred)


0.9985768206297091

In [54]:

sklearn.metrics.adjusted_rand_score(labels_true, labels_pred)



0.9786907460555216

In [55]:
sklearn.metrics.mutual_info_score(labels_true, labels_pred)



6.868238820729755

In [56]:
sklearn.metrics.normalized_mutual_info_score(labels_true, labels_pred)


0.9985768206297091

In [60]:
#przygotowanie zbiorów dopasowań unikalnych dla clusterspp i clusterstp
set_pred = (set(tuple(sorted(pair)) for pair in clusterspp))
set_true = (set(tuple(sorted(pair)) for pair in clusterstp))

print(set_pred - set_true) #pary dopasowane wg modelu, a nie w rzeczywistości
print(set_true - set_pred) #pary dopasowane w rzeczywistości, jednak nie wg modelu

{('A23474', 'B17032'), ('A15495', 'B18122'), ('A22711', 'B20620'), ('A7934', 'B23322'), ('A11100', 'B7037'), ('A8067', 'B13871')}
{('A7934', 'B21171'), ('A22711', 'B13752'), ('A15495', 'B18117'), ('A11100', 'B2727'), ('A8067', 'B4209'), ('A23474', 'B1603')}


In [92]:
set_pred - set_true

{('A11100', 'B7037'),
 ('A15495', 'B18122'),
 ('A22711', 'B20620'),
 ('A23474', 'B17032'),
 ('A7934', 'B23322'),
 ('A8067', 'B13871')}

In [93]:
census.iloc[['11100', '15495', '22711', '23474', '7934', '8067']]

Unnamed: 0,index,person_id,pername1,pername2,sex,dob_day,dob_mon,dob_year,enumcap,enumpc,allInfo
11100,11100,M660XT013001,EMILY,WEBB,F,26.0,5,1935.0,13 SPRINGFIELD ROAD,M660XT,m660xt013001emilywebbf26.051935.013springfield...
15495,15495,PO272DD003001,AMBEV,HING,,23.0,1,1989.0,3 THE CRESCENT,PO272DD,po272dd003001ambevhing23.011989.03thecrescentp...
22711,22711,SW207UU064002,MIA,MORRIS,F,26.0,6,1977.0,64 CLARENCE ROAD,SW207UU,sw207uu064002miamorrisf26.061977.064clarencero...
23474,23474,SW433XS013002,JOSH,STEWART,M,16.0,9,1925.0,13 STANLEY ROAD,SW433XS,sw433xs013002joshstewartm16.091925.013stanleyr...
7934,7934,LS451GL017003,MOLLY,MABSHALL,F,20.0,3,2001.0,17 WOODLANDS ROAD,LS451GL,ls451gl017003mollymabshallf20.032001.017woodla...
8067,8067,LS451GR084002,HARRY,GIBSON,M,18.0,10,1945.0,84 WOODLANDS ROAD,LS451GR,ls451gr084002harrygibsonm18.0101945.084woodlan...


In [94]:
census.iloc[[11100, 15495, 22711, 23474, 7934, 8067]]


Unnamed: 0,index,person_id,pername1,pername2,sex,dob_day,dob_mon,dob_year,enumcap,enumpc,allInfo
2727,2727,M660XT013001,EMILY,WEBB,F,26.0,7,1935.0,,M660XT,m660xt013001emilywebbf26.071935.0m660xt
18117,18117,PO272DD003001,AMBER,KING,,23.0,1,1989.0,,PO272DD,po272dd003001amberking23.011989.0po272dd
13752,13752,SW207UU064002,MIR,MORRISON,F,26.0,6,1977.0,,SW207UU,sw207uu064002mirmorrisonf26.061977.0sw207uu
1603,1603,SW433XS013002,JOSHUA,STEWART,M,16.0,9,1925.0,,SW433XS,sw433xs013002joshuastewartm16.091925.0sw433xs
21171,21171,LS451GL017003,NOLLY,NARSHALL,F,20.0,3,2001.0,,LS451GJ,ls451gl017003nollynarshallf20.032001.0ls451gj
4209,4209,LS451GR084002,HARRY,GIBSON,M,18.0,10,1945.0,,LS451QR,ls451gr084002harrygibsonm18.0101945.0ls451qr


In [None]:
cis.iloc[[2727, 18117, 13752, 1603, 21171, 4209]]

In [None]:
cis.iloc[[7037, 18122, 20620, 17032, 23322, 13871]]