In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.cluster import KMeans
import sklearn.preprocessing as preprocessing


In [238]:
# Load the indicator table; the cells below filter it by its 'IndicatorName' column.
data = pd.read_csv('Indicators.csv')
# Show (rows, columns) as a quick sanity check on the load.
data.shape

(5656458, 6)

In [3]:
# Indicator names that merge_8 joins onto the target rows as feature columns.
# The first entry is the same string as `target_id`, so the target indicator's own
# values are also joined in as a feature column.
# NOTE(review): the "_8" suffix presumably marks a selection threshold (e.g. a
# correlation cut-off of 0.8) — confirm against the cell that produced this list.
rel_columns_8 = ['Adolescent fertility rate (births per 1,000 women ages 15-19)',
 'Birth rate, crude (per 1,000 people)',
 'Net enrolment rate, secondary, both sexes (%)',
 'Net enrolment rate, secondary, female (%)',
 'Net enrolment rate, secondary, male (%)',
 'Teenage mothers (% of women ages 15-19 who have had children or are currently pregnant)',
 'Mortality rate, under-5, female (per 1,000 live births)',
 'Mortality rate, under-5, male (per 1,000 live births)',
 'Prevalence of anemia among children (% of children under 5)',
 'Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)',
 'Cause of death, by non-communicable diseases (% of total)',
 'Survey mean consumption or income per capita, bottom 40% of population (2005 PPP $ per day)',
 'Survey mean consumption or income per capita, total population (2005 PPP $ per day)']

In [24]:
# Indicator names that merge_7 joins onto the target rows as feature columns.
# This list contains every name in rel_columns_8 plus additional indicators, and its
# first entry is likewise the same string as `target_id`.
# NOTE(review): the "_7" suffix presumably marks a looser selection threshold (e.g. a
# correlation cut-off of 0.7) — confirm against the cell that produced this list.
rel_columns_7 = ['Adolescent fertility rate (births per 1,000 women ages 15-19)',
 'Age dependency ratio (% of working-age population)',
 'Age dependency ratio, young (% of working-age population)',
 'Birth rate, crude (per 1,000 people)',
 'Fertility rate, total (births per woman)',
 'Life expectancy at birth, female (years)',
 'Life expectancy at birth, male (years)',
 'Life expectancy at birth, total (years)',
 'Mortality rate, adult, female (per 1,000 female adults)',
 'Mortality rate, infant (per 1,000 live births)',
 'Mortality rate, under-5 (per 1,000)',
 'Population, ages 0-14 (% of total)',
 'Population, ages 15-64 (% of total)',
 'Survival to age 65, female (% of cohort)',
 'Survival to age 65, male (% of cohort)',
 'Adjusted net enrolment rate, primary, both sexes (%)',
 'Adjusted net enrolment rate, primary, female (%)',
 'Gross enrolment ratio, secondary, both sexes (%)',
 'Gross enrolment ratio, secondary, female (%)',
 'Gross enrolment ratio, secondary, male (%)',
 'Lower secondary completion rate, both sexes (%)',
 'Net enrolment rate, secondary, both sexes (%)',
 'Net enrolment rate, secondary, female (%)',
 'Net enrolment rate, secondary, male (%)',
 'Primary completion rate, both sexes (%)',
 'Primary completion rate, female (%)',
 'Survival rate to the last grade of primary education, both sexes (%)',
 'Lower secondary completion rate, female (%)',
 'Lower secondary completion rate, male (%)',
 'Survival rate to the last grade of primary education, female (%)',
 'Survival rate to the last grade of primary education, male (%)',
 'Survival rate to Grade 5 of primary education, female (%)',
 'Youth literacy rate, population 15-24 years, both sexes (%)',
 'Youth literacy rate, population 15-24 years, male (%)',
 'Teenage mothers (% of women ages 15-19 who have had children or are currently pregnant)',
 'Wanted fertility rate (births per woman)',
 'Access to electricity (% of population)',
 'Access to electricity, rural (% of rural population)',
 'Improved sanitation facilities (% of population with access)',
 'Improved sanitation facilities, rural (% of rural population with access)',
 'Improved sanitation facilities, urban (% of urban population with access)',
 'Improved water source (% of population with access)',
 'Improved water source, rural (% of rural population with access)',
 'Lifetime risk of maternal death (%)',
 'Maternal mortality ratio (modeled estimate, per 100,000 live births)',
 'Mortality rate, infant, female (per 1,000 live births)',
 'Mortality rate, infant, male (per 1,000 live births)',
 'Mortality rate, neonatal (per 1,000 live births)',
 'Mortality rate, under-5, female (per 1,000 live births)',
 'Mortality rate, under-5, male (per 1,000 live births)',
 'Prevalence of anemia among children (% of children under 5)',
 'Prevalence of anemia among pregnant women (%)',
 'Renewable energy consumption (% of total final energy consumption)',
 'Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)',
 'Cause of death, by non-communicable diseases (% of total)',
 'Survey mean consumption or income per capita, bottom 40% of population (2005 PPP $ per day)',
 'Survey mean consumption or income per capita, total population (2005 PPP $ per day)']

In [4]:
# From here on: pick out the features that are highly related to the target.
# Name of the indicator treated as the prediction target.
target_id = 'Adolescent fertility rate (births per 1,000 women ages 15-19)';

# Rows of `data` whose 'IndicatorName' equals the target indicator.
target = data[data['IndicatorName']==target_id]

In [180]:
def merge_8(merged, columns=None, min_rows=300, source=None):
    """Join indicator columns onto ``merged``, keeping only joins that leave enough rows.

    For every indicator name, the matching rows of ``source`` are selected, their
    ``Value`` column is renamed to the indicator name, the identifier columns
    ``IndicatorCode``, ``IndicatorName`` and ``CountryCode`` are dropped, and the
    result is merged onto the running frame. A merge that would leave fewer than
    ``min_rows`` rows is discarded and the previous frame is kept.

    Parameters
    ----------
    merged : pandas.DataFrame
        Frame to extend; it is not modified.
    columns : iterable of str, optional
        Indicator names to join. Defaults to the module-level ``rel_columns_8``.
    min_rows : int, default 300
        Minimum number of rows a merge result must have to be accepted.
    source : pandas.DataFrame, optional
        Long-format indicator table to read from. Defaults to the module-level
        ``data``.

    Returns
    -------
    pandas.DataFrame
        ``merged`` extended with one column per accepted indicator.
    """
    if columns is None:
        columns = rel_columns_8
    if source is None:
        source = data

    for indicator in columns:
        # rename/drop return new frames, so the filtered slice of `source` is never
        # modified in place (the in-place rename triggered chained-assignment warnings).
        feature = (
            source[source['IndicatorName'] == indicator]
            .rename(columns={'Value': indicator})
            .drop(columns=['IndicatorCode', 'IndicatorName', 'CountryCode'])
        )
        # No explicit keys: pandas joins on every column name the two frames share.
        # NOTE(review): presumably that is the country and year columns — confirm.
        candidate = pd.merge(merged, feature)
        if len(candidate) >= min_rows:
            merged = candidate
    return merged


In [239]:
def merge_7(merged, columns=None, min_rows=300, source=None):
    """Join indicator columns onto ``merged``, keeping only joins that leave enough rows.

    For every indicator name, the matching rows of ``source`` are selected, their
    ``Value`` column is renamed to the indicator name, the identifier columns
    ``IndicatorCode``, ``IndicatorName`` and ``CountryCode`` are dropped, and the
    result is merged onto the running frame. A merge that would leave fewer than
    ``min_rows`` rows is discarded and the previous frame is kept.

    Parameters
    ----------
    merged : pandas.DataFrame
        Frame to extend; it is not modified.
    columns : iterable of str, optional
        Indicator names to join. Defaults to the module-level ``rel_columns_7``.
    min_rows : int, default 300
        Minimum number of rows a merge result must have to be accepted.
    source : pandas.DataFrame, optional
        Long-format indicator table to read from. Defaults to the module-level
        ``data``.

    Returns
    -------
    pandas.DataFrame
        ``merged`` extended with one column per accepted indicator.
    """
    if columns is None:
        columns = rel_columns_7
    if source is None:
        source = data

    for indicator in columns:
        # rename/drop return new frames, so the filtered slice of `source` is never
        # modified in place (the in-place rename triggered chained-assignment warnings).
        feature = (
            source[source['IndicatorName'] == indicator]
            .rename(columns={'Value': indicator})
            .drop(columns=['IndicatorCode', 'IndicatorName', 'CountryCode'])
        )
        # No explicit keys: pandas joins on every column name the two frames share.
        # NOTE(review): presumably that is the country and year columns — confirm.
        candidate = pd.merge(merged, feature)
        if len(candidate) >= min_rows:
            merged = candidate
    return merged

In [219]:
def make_grade(merged, k):
    """Add a ``grade`` column that bins ``Value`` into ``k`` equal-width intervals.

    The range ``[min(Value), max(Value)]`` is split into ``k`` intervals of equal
    width. Each row gets the zero-based number of the interval its value falls in:
    intervals are closed on the left and open on the right, except the last one,
    which also contains the maximum. When every value is identical, all rows get
    grade 0. Rows whose ``Value`` is NaN keep a NaN grade.

    Parameters
    ----------
    merged : pandas.DataFrame
        Frame with a numeric ``Value`` column. It is modified in place (callers
        rely on this) and may have any index.
    k : int
        Number of grades; must be positive.

    Returns
    -------
    pandas.DataFrame
        The same ``merged`` object, now carrying the ``grade`` column.

    Raises
    ------
    ValueError
        If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive number of grades")

    values = merged['Value']
    min_value = values.min()
    max_value = values.max()
    width = (max_value - min_value) / k

    if width > 0:
        # floor() maps the maximum to k, so clip it back into the last grade.
        grades = np.floor((values - min_value) / width).clip(upper=k - 1)
    else:
        # Zero-width range (or no rows): one grade for everything, NaN stays NaN.
        grades = values * 0.0

    merged['grade'] = grades
    return merged

In [53]:
# label encode categorical data (array)
def labelEncode_arr(y):
    '''
    Label-encode an array of categorical values.

    y: array-like of labels to encode
    return value: array of the encoded labels produced by a LabelEncoder fitted on y
    '''
    encoder = preprocessing.LabelEncoder()
    # Fit on y and encode y in a single call.
    return encoder.fit_transform(y)

In [54]:
# Replace the text columns of `merged` with their label-encoded values.
# NOTE(review): `merged` is not assigned in any cell visible above this one, so this
# cell presumably relied on kernel state from an earlier run — confirm it still
# works after Restart & Run All.
merged['CountryName']= labelEncode_arr(merged['CountryName'])
merged['IndicatorName']= labelEncode_arr(merged['IndicatorName'])

In [55]:
def calcAccuracy(x, y, model=None):
    '''
    Print and return the share of rows whose predicted cluster equals the label.

    x: 2-D array-like of feature rows
    y: labels, one per row of x; compared by position, so a pandas Series with any
       index is accepted
    model: fitted estimator exposing predict(); defaults to the module-level
           `kmeans` so existing calcAccuracy(x, y) calls keep working
    return value: accuracy as a float (nan when x has no rows)
    '''
    if model is None:
        model = kmeans

    features = np.asarray(x, dtype=float)
    labels = np.asarray(y)

    if len(features) == 0:
        # Nothing to score; avoid dividing by zero.
        accuracy = float('nan')
    else:
        # One batched predict call instead of one call per row.
        predictions = np.asarray(model.predict(features))
        accuracy = float(np.mean(predictions == labels))

    print(accuracy)
    return accuracy

In [187]:
def preprocess_df_8(merged, k):
    """Build the clustering frame: merge features, add grades, encode countries.

    merged: starting frame handed to merge_8
    k: number of grades handed to make_grade
    return value: the merged frame with a 'grade' column and a label-encoded
                  'CountryName' column
    """
    frame = merge_8(merged)
    # make_grade writes the 'grade' column onto `frame` itself; its return value
    # is not used here.
    make_grade(frame, k)
    encoded_names = labelEncode_arr(frame['CountryName'])
    frame['CountryName'] = encoded_names
    return frame

In [228]:
def preprocess_df_7(merged, k):
    """Build the clustering frame: merge features, add grades, encode countries.

    merged: starting frame handed to merge_7
    k: number of grades handed to make_grade
    return value: the merged frame with a 'grade' column and a label-encoded
                  'CountryName' column
    """
    frame = merge_7(merged)
    # make_grade writes the 'grade' column onto `frame` itself; its return value
    # is not used here.
    make_grade(frame, k)
    encoded_names = labelEncode_arr(frame['CountryName'])
    frame['CountryName'] = encoded_names
    return frame

In [240]:
def predict_8(merged_8, k, random_state=None):
    """Cluster the target-indicator rows with KMeans and score clusters against grades.

    The rows of the module-level ``data`` for ``target_id`` are preprocessed with
    ``preprocess_df_8``, every remaining non-numeric column is label-encoded, a
    KMeans model with ``k`` clusters is fitted on all columns except ``grade``, and
    the share of rows whose cluster id equals their grade is printed and returned.

    Parameters
    ----------
    merged_8 : pandas.DataFrame
        Ignored: the frame is always rebuilt from ``data``. Kept so existing calls
        stay valid.
    k : int
        Number of grades and number of clusters.
    random_state : int or None, default None
        Seed forwarded to KMeans; pass an integer for reproducible runs.

    Returns
    -------
    float
        Fraction of rows whose predicted cluster id equals their grade.

    NOTE(review): cluster ids are assigned arbitrarily by KMeans, so comparing them
    directly with grade numbers may understate the agreement — confirm the metric.
    NOTE(review): 'Value' stays in the feature matrix although 'grade' is derived
    from it — confirm this is intended.
    """
    # .copy() so the column assignments below never write into a slice of `data`.
    merged_8 = data[data['IndicatorName'] == target_id].copy()
    merged_8 = preprocess_df_8(merged_8, k)

    # Encode every text column, not only the two name columns: the code columns
    # would otherwise make the float conversion below fail.
    for column in merged_8.columns:
        if not pd.api.types.is_numeric_dtype(merged_8[column]):
            merged_8[column] = labelEncode_arr(merged_8[column])

    # Only arguments accepted by current scikit-learn releases are passed; the
    # removed n_jobs / precompute_distances / algorithm='auto' were defaults.
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=20, max_iter=800,
                    tol=0.0001, random_state=random_state)

    X = merged_8.drop(columns=['grade']).to_numpy(dtype=float)
    y = merged_8['grade'].to_numpy()

    kmeans.fit(X)
    # Scored with the model fitted here rather than a module-level `kmeans`.
    accuracy = float(np.mean(kmeans.predict(X) == y))
    print(accuracy)
    return accuracy

In [241]:
def predict_7(merged_7, k, random_state=None):
    """Cluster the target-indicator rows with KMeans and score clusters against grades.

    The rows of the module-level ``data`` for ``target_id`` are preprocessed with
    ``preprocess_df_7``, every remaining non-numeric column is label-encoded, a
    KMeans model with ``k`` clusters is fitted on all columns except ``grade``, and
    the share of rows whose cluster id equals their grade is printed and returned.

    Parameters
    ----------
    merged_7 : pandas.DataFrame
        Ignored: the frame is always rebuilt from ``data``. Kept so existing calls
        stay valid.
    k : int
        Number of grades and number of clusters.
    random_state : int or None, default None
        Seed forwarded to KMeans; pass an integer for reproducible runs.

    Returns
    -------
    float
        Fraction of rows whose predicted cluster id equals their grade.

    NOTE(review): cluster ids are assigned arbitrarily by KMeans, so comparing them
    directly with grade numbers may understate the agreement — confirm the metric.
    NOTE(review): 'Value' stays in the feature matrix although 'grade' is derived
    from it — confirm this is intended.
    """
    # .copy() so the column assignments below never write into a slice of `data`.
    merged_7 = data[data['IndicatorName'] == target_id].copy()
    merged_7 = preprocess_df_7(merged_7, k)

    # Encode every text column, not only the two name columns: the code columns
    # would otherwise make the float conversion below fail.
    for column in merged_7.columns:
        if not pd.api.types.is_numeric_dtype(merged_7[column]):
            merged_7[column] = labelEncode_arr(merged_7[column])

    # Only arguments accepted by current scikit-learn releases are passed; the
    # removed n_jobs / precompute_distances / algorithm='auto' were defaults.
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=20, max_iter=800,
                    tol=0.0001, random_state=random_state)

    X = merged_7.drop(columns=['grade']).to_numpy(dtype=float)
    y = merged_7['grade'].to_numpy()

    kmeans.fit(X)
    # Scored with the model fitted here rather than a module-level `kmeans`.
    accuracy = float(np.mean(kmeans.predict(X) == y))
    print(accuracy)
    return accuracy

In [None]:
# Select the target indicator's rows by name. The previous filter compared the
# 'IndicatorCode' column with the `target` DataFrame instead of an indicator id.
merged = data[data['IndicatorName'] == target_id]
predict_8(merged, 5)

  result = method(y)


In [None]:
# Select the target indicator's rows by name. The previous filter compared the
# 'IndicatorCode' column with the `target` DataFrame instead of an indicator id.
merged = data[data['IndicatorName'] == target_id]
predict_7(merged, 5)

In [113]:
### rel_columns_7
# NOTE(review): X and y are not assigned at top level in any cell visible above, so
# this cell presumably relied on kernel state from an earlier run — confirm.
# Only arguments accepted by current scikit-learn releases are passed; the removed
# n_jobs / precompute_distances / algorithm='auto' were defaults.
kmeans = KMeans(n_clusters=2, init='k-means++', n_init=20, max_iter=800,
                tol=0.0001, random_state=None)
kmeans.fit(X)

calcAccuracy(X, y)

0.8460764587525151


In [None]:
# English gloss of the note below — open choices for this experiment:
#   1. which feature list to use: rel_columns_8 or rel_columns_7
#   2. the row count below which a merged indicator is excluded
#   3. the KMeans parameters
'''
선택지
1. rel_columns_8 or 7
2. 몇개 미만이면 제외할지
3. kmeans parameter
'''