# Dissmiliarity (Tugas 1)

Mengukur jarak (dissimilarity)
1. Ambil data dari Kaggle atau GitHub
2. Ukur jarak 
d(1,2), d(1,3), d(1,4) dari data tersebut

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Create dataset from Google Drive
dataset_url = "https://raw.githubusercontent.com/prasertcbs/basic-dataset/master/Employee%20data.csv"
df = pd.read_csv(dataset_url)

In [3]:
df

Unnamed: 0,id,gender,bdate,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,1.0,Male,1952-02-03,15,Manager,57000.0,27000.0,98.0,144.0,No
1,2.0,Male,1958-05-23,16,Clerical,40200.0,18750.0,98.0,36.0,No
2,3.0,Female,1929-07-26,12,Clerical,21450.0,12000.0,98.0,381.0,No
3,4.0,Female,1947-04-15,8,Clerical,21900.0,13200.0,98.0,190.0,No
4,5.0,Male,1955-02-09,15,Clerical,45000.0,21000.0,98.0,138.0,No
...,...,...,...,...,...,...,...,...,...,...
469,470.0,Male,1964-01-22,12,Clerical,26250.0,15750.0,64.0,69.0,Yes
470,471.0,Male,1966-08-03,15,Clerical,26400.0,15750.0,64.0,32.0,Yes
471,472.0,Male,1966-02-21,15,Clerical,39150.0,15750.0,63.0,46.0,No
472,473.0,Female,1937-11-25,12,Clerical,21450.0,12750.0,63.0,139.0,No


In [4]:
# Show dataset shape
number_of_columns = df.shape[1]

In [5]:
# Show all columns for dataset
pd.set_option('display.max_columns', number_of_columns)
pd.set_option('display.max_rows', number_of_columns)

In [6]:
df[["id","gender", "jobcat"]].head(5)

Unnamed: 0,id,gender,jobcat
0,1.0,Male,Manager
1,2.0,Male,Clerical
2,3.0,Female,Clerical
3,4.0,Female,Clerical
4,5.0,Male,Clerical


In [7]:
# jobcat code
code_jobcat_for_manager = "Manager"
code_jobcat_for_clerical = "Clerical"

# gender code
code_gender_for_male = "Male"
code_gender_for_female = "Female"

# binary value
value_of_one = 1
value_of_zero = 0

def change_code_jobcat_to_biner(jobcat):
    return value_of_one if jobcat == code_jobcat_for_manager else value_of_zero
def change_code_gender_to_biner(gender):
    return value_of_one if gender == code_gender_for_male else value_of_zero

In [8]:
# Update all values of 'jobcat' series
df["jobcat"] = df["jobcat"].apply(change_code_jobcat_to_biner)

In [9]:
# Update all values of 'minority' series
df["gender"] = df["gender"].apply(change_code_gender_to_biner)

In [10]:
df[["id","gender", "minority"]].head(5)

Unnamed: 0,id,gender,minority
0,1.0,1,No
1,2.0,1,No
2,3.0,0,No
3,4.0,0,No
4,5.0,1,No


In [11]:
# CONSTAN VARIABLE
DECREMENT_BY_ONE = 1
INCREMENT_BY_ONE = 1

CONTINGENCY_TABLE_VALUE = {
    "q" : (1,1),
    "r" : (1,0),
    "s" : (0,1),
    "t" : (0,0),
}

In [12]:
def get_series(df, idx, series):
    return df.loc[(idx), series]

In [13]:
def get_dissimilarity_dataset(df, series_index = [], series = []):
    first_series = get_series(df, series_index[0], series)
    second_series = get_series(df, series_index[1], series)
    dataset = pd.concat([first_series,second_series],axis=1)
    return dataset.T

In [14]:
get_dissimilarity_dataset(df, [1,2], ["gender", "minority"]).T

Unnamed: 0,1,2
gender,1,0
minority,No,No


In [15]:
df.loc[0:5, ["gender", "minority"]]

Unnamed: 0,gender,minority
0,1,No
1,1,No
2,0,No
3,0,No
4,1,No
5,1,No


In [16]:
def count_contingency_value(df, start_index = 0, last_index = 1):

    CONTINGENCY_VALUE = {
        "q" : 0,
        "r" : 0,
        "s" : 0,
        "t" : 0,
    }

    column_range = df.shape[1]

    for column in range(column_range):
        for value in CONTINGENCY_TABLE_VALUE:
            item = list((tuple(df.loc[(start_index):(last_index), df.columns[column]]) == CONTINGENCY_TABLE_VALUE[value], value))
            if item[0] == True:
                if item[1] == "q":
                    CONTINGENCY_VALUE["q"] += 1
                if item[1] == "r":
                    CONTINGENCY_VALUE["r"] += 1
                if item[1] == "s":
                    CONTINGENCY_VALUE["s"] += 1
                if item[1] == "t":
                    CONTINGENCY_VALUE["t"] += 1

    return CONTINGENCY_VALUE

In [17]:
# d(1,2)
df_1_2 = get_dissimilarity_dataset(df, [1,2], ["gender", "minority"])

In [18]:
c_d_1_2 = count_contingency_value(df_1_2, 1, 2)

In [19]:
# d(1,3)
df_1_3 = get_dissimilarity_dataset(df, [1,3], ["gender", "minority"])

In [20]:
c_d_1_3 = count_contingency_value(df_1_3, 1, 3)

In [21]:
# d(1,4)
df_1_4 = get_dissimilarity_dataset(df, [1,4], ["gender", "minority"])

In [22]:
c_d_1_4 = count_contingency_value(df_1_4, 1, 4)

In [23]:
def measure_dissimilarity_binary_value_assymetric_distance(contingency_value):

    return (contingency_value["r"] + contingency_value["s"]) / (contingency_value["q"] + contingency_value["r"] + contingency_value["s"])

In [24]:
d_1_2 = measure_dissimilarity_binary_value_assymetric_distance(c_d_1_2)
d_1_3 = measure_dissimilarity_binary_value_assymetric_distance(c_d_1_3)
d_1_4 = measure_dissimilarity_binary_value_assymetric_distance(c_d_1_4)

In [25]:
d_1_2

1.0

In [26]:
d_1_3

1.0

In [27]:
d_1_4

0.0