In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.cluster import KMeans
import sklearn.preprocessing as preprocessing


In [2]:
data = pd.read_csv('Indicators.csv')
data.shape

(5656458, 6)

In [60]:
data.head()

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value,name
0,Arab World,ARB,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,2000,53.829472,Arab World_2000
1,Arab World,ARB,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,2000,28.282837,Arab World_2000
2,Arab World,ARB,"Cause of death, by communicable diseases and m...",SH.DTH.COMM.ZS,2000,34.960297,Arab World_2000
3,Arab World,ARB,"Cause of death, by non-communicable diseases (...",SH.DTH.NCOM.ZS,2000,56.536043,Arab World_2000
4,Arab World,ARB,"Mortality rate, under-5, female (per 1,000 liv...",SH.DYN.MORT.FE,2000,56.807549,Arab World_2000


In [94]:
# 여기서부터 관련도 높은 feature 뽑는 것
target_id = 'Adolescent fertility rate (births per 1,000 women ages 15-19)';

target = data[data['IndicatorName']==target_id]

In [6]:
rel_columns_8 = ['Adolescent fertility rate (births per 1,000 women ages 15-19)',
 'Birth rate, crude (per 1,000 people)',
 'Net enrolment rate, secondary, both sexes (%)',
 'Net enrolment rate, secondary, female (%)',
 'Net enrolment rate, secondary, male (%)',
 'Teenage mothers (% of women ages 15-19 who have had children or are currently pregnant)',
 'Mortality rate, under-5, female (per 1,000 live births)',
 'Mortality rate, under-5, male (per 1,000 live births)',
 'Prevalence of anemia among children (% of children under 5)',
 'Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)',
 'Cause of death, by non-communicable diseases (% of total)',
 'Survey mean consumption or income per capita, bottom 40% of population (2005 PPP $ per day)',
 'Survey mean consumption or income per capita, total population (2005 PPP $ per day)']

In [7]:
rel_columns_7 = ['Adolescent fertility rate (births per 1,000 women ages 15-19)',
 'Age dependency ratio (% of working-age population)',
 'Age dependency ratio, young (% of working-age population)',
 'Birth rate, crude (per 1,000 people)',
 'Fertility rate, total (births per woman)',
 'Life expectancy at birth, female (years)',
 'Life expectancy at birth, male (years)',
 'Life expectancy at birth, total (years)',
 'Mortality rate, adult, female (per 1,000 female adults)',
 'Mortality rate, infant (per 1,000 live births)',
 'Mortality rate, under-5 (per 1,000)',
 'Population, ages 0-14 (% of total)',
 'Population, ages 15-64 (% of total)',
 'Survival to age 65, female (% of cohort)',
 'Survival to age 65, male (% of cohort)',
 'Adjusted net enrolment rate, primary, both sexes (%)',
 'Adjusted net enrolment rate, primary, female (%)',
 'Gross enrolment ratio, secondary, both sexes (%)',
 'Gross enrolment ratio, secondary, female (%)',
 'Gross enrolment ratio, secondary, male (%)',
 'Lower secondary completion rate, both sexes (%)',
 'Net enrolment rate, secondary, both sexes (%)',
 'Net enrolment rate, secondary, female (%)',
 'Net enrolment rate, secondary, male (%)',
 'Primary completion rate, both sexes (%)',
 'Primary completion rate, female (%)',
 'Survival rate to the last grade of primary education, both sexes (%)',
 'Lower secondary completion rate, female (%)',
 'Lower secondary completion rate, male (%)',
 'Survival rate to the last grade of primary education, female (%)',
 'Survival rate to the last grade of primary education, male (%)',
 'Survival rate to Grade 5 of primary education, female (%)',
 'Youth literacy rate, population 15-24 years, both sexes (%)',
 'Youth literacy rate, population 15-24 years, male (%)',
 'Teenage mothers (% of women ages 15-19 who have had children or are currently pregnant)',
 'Wanted fertility rate (births per woman)',
 'Access to electricity (% of population)',
 'Access to electricity, rural (% of rural population)',
 'Improved sanitation facilities (% of population with access)',
 'Improved sanitation facilities, rural (% of rural population with access)',
 'Improved sanitation facilities, urban (% of urban population with access)',
 'Improved water source (% of population with access)',
 'Improved water source, rural (% of rural population with access)',
 'Lifetime risk of maternal death (%)',
 'Maternal mortality ratio (modeled estimate, per 100,000 live births)',
 'Mortality rate, infant, female (per 1,000 live births)',
 'Mortality rate, infant, male (per 1,000 live births)',
 'Mortality rate, neonatal (per 1,000 live births)',
 'Mortality rate, under-5, female (per 1,000 live births)',
 'Mortality rate, under-5, male (per 1,000 live births)',
 'Prevalence of anemia among children (% of children under 5)',
 'Prevalence of anemia among pregnant women (%)',
 'Renewable energy consumption (% of total final energy consumption)',
 'Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)',
 'Cause of death, by non-communicable diseases (% of total)',
 'Survey mean consumption or income per capita, bottom 40% of population (2005 PPP $ per day)',
 'Survey mean consumption or income per capita, total population (2005 PPP $ per day)']

In [84]:
# 해당하는 indicator해서 그 데이터 프레임 사용하기 편하게 바꿔주는거 
def transformDf(data, ind):
    data2000=data[(data['Year'] >= 2000) & (data['Year'] < 2015) ]
    col8 = data2000[(data2000['IndicatorName'].isin(ind))]
    col8=col8.reset_index(drop=True)
    
    name_list = list()

    for i in col8.index:
        name_list.append( col8.loc[i,'CountryName'] + "_" + str(col8.loc[i,'Year']))
    col8["name"] = name_list
    col8=col8.pivot_table(values="Value",index="name",columns="IndicatorName")
    return col8

    

In [101]:
df=transformDf(data,rel_columns_8)
df=df.dropna(subset=[target_id])

In [105]:
df.isna().sum()

IndicatorName
Adolescent fertility rate (births per 1,000 women ages 15-19)                                               0
Birth rate, crude (per 1,000 people)                                                                      233
Cause of death, by communicable diseases and maternal, prenatal and nutrition conditions (% of total)    2997
Cause of death, by non-communicable diseases (% of total)                                                2997
Mortality rate, under-5, female (per 1,000 live births)                                                  2971
Mortality rate, under-5, male (per 1,000 live births)                                                    2971
Net enrolment rate, secondary, both sexes (%)                                                            1691
Net enrolment rate, secondary, female (%)                                                                1715
Net enrolment rate, secondary, male (%)                                                                  1

In [103]:

def make_grade(merged, k):
    min_value = min(merged[target_id])
    max_value = max(merged[target_id])
    value=(max_value-min_value)/k
    for i in range(k):
        for j in merged.index:
            if merged.loc[j, target_id] > min_value + value*i and merged.loc[j, target_id] < min_value + value*(i+1):
                merged.loc[j, 'grade'] = i
    return merged


In [104]:
d=make_grade(df,2)
d.grade.unique()

array([ 1.,  0., nan])

In [70]:
"""
def merge_8(merged):
    for i in rel_columns_8:
        target = data[data['IndicatorName']==i]
        target.rename(columns={"Value": i}, inplace = True)
        target=target.drop(columns=['IndicatorCode', 'IndicatorName', 'CountryCode'])
        merged_bef = merged
        merged = pd.merge(merged, target)
        if len(merged) < 300:
            merged = merged_bef
    return merged
"""

'\ndef merge_8(merged):\n    for i in rel_columns_8:\n        target = data[data[\'IndicatorName\']==i]\n        target.rename(columns={"Value": i}, inplace = True)\n        target=target.drop(columns=[\'IndicatorCode\', \'IndicatorName\', \'CountryCode\'])\n        merged_bef = merged\n        merged = pd.merge(merged, target)\n        if len(merged) < 300:\n            merged = merged_bef\n    return merged\n'

In [52]:
"""
def merge_7(merged):
    for i in rel_columns_7:
        target = data[data['IndicatorName']==i]
        target.rename(columns={"Value": i}, inplace = True)
        target=target.drop(columns=['IndicatorCode', 'IndicatorName', 'CountryCode'])
        merged_bef = merged
        merged = pd.merge(merged, target)
        if len(merged) < 300:
            merged = merged_bef
    return merged
"""


'\ndef merge_7(merged):\n    for i in rel_columns_7:\n        target = data[data[\'IndicatorName\']==i]\n        target.rename(columns={"Value": i}, inplace = True)\n        target=target.drop(columns=[\'IndicatorCode\', \'IndicatorName\', \'CountryCode\'])\n        merged_bef = merged\n        merged = pd.merge(merged, target)\n        if len(merged) < 300:\n            merged = merged_bef\n    return merged\n'

In [71]:
"""
def make_grade(merged, k):
    min_value = min(merged['Value'])
    max_value = max(merged['Value'])
    value=(max_value-min_value)/k
    for i in range(k):
        for j in range(len(merged)):
            if merged.loc[j, 'Value'] > min_value + value*i and merged.loc[j, 'Value'] < min_value + value*(i+1):
                merged.loc[j, 'grade'] = i
    return merged
"""

"\ndef make_grade(merged, k):\n    min_value = min(merged['Value'])\n    max_value = max(merged['Value'])\n    value=(max_value-min_value)/k\n    for i in range(k):\n        for j in range(len(merged)):\n            if merged.loc[j, 'Value'] > min_value + value*i and merged.loc[j, 'Value'] < min_value + value*(i+1):\n                merged.loc[j, 'grade'] = i\n    return merged\n"

In [10]:
# label encode categorical data (array)
def labelEncode_arr(y):
    '''
    label encoding array
    return value: label encoded array
    '''
    le = preprocessing.LabelEncoder()
    le.fit(y)
    y= le.transform(y)
    return y

In [11]:
def calcAccuracy(kmeans, x,y):
    '''
    print accuracy of prediction
    '''
    correct=0
    for i in range(len(x)):
        predict_me= np.array(x[i].astype(float))
        predict_me = predict_me.reshape(-1, len(predict_me))
        prediction = kmeans.predict(predict_me)
        if prediction[0] == y[i]:
            correct += 1
    print(correct/len(x))

In [55]:
def preprocess_df_8(merged, k):
    merged=merge_8(merged)
    print(merged)
    make_grade(merged, k)
    
    merged['CountryName']= labelEncode_arr(merged['CountryName'])
    
    return merged

In [13]:
def preprocess_df_7(merged, k):
    merged=merge_7(merged)
    make_grade(merged, k)
    
    merged['CountryName']= labelEncode_arr(merged['CountryName'])

    return merged

In [137]:
def predict_8(merged_8, k):
    merged_8 = data[data['IndicatorName']==target_id]
    merged_8 =preprocess_df_8(merged_8, k)
    
    merged_8['CountryName']= labelEncode_arr(merged_8['CountryName'])
    merged_8['IndicatorName']= labelEncode_arr(merged_8['IndicatorName'])
    merged_8= merged_8.drop(columns=['CountryCode', 'IndicatorCode', 'name'])
    
    kmeans= KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=800, 
               n_clusters=k, n_init=20, n_jobs=1, precompute_distances='auto', 
               random_state=None, tol=0.0001, verbose=0)

    X= np.array(merged_8.drop(['grade'], 1).astype(float))
    y=merged_8['grade']

    print('bef', len(X[0]))
    kmeans.fit(X)
    return calcAccuracy(kmeans, X,y)

In [138]:
def predict_7(merged_7, k):
    merged_7 = data[data['IndicatorName']==target_id]
    merged_7 =preprocess_df_7(merged_7, k)
    
    merged_7['CountryName']= labelEncode_arr(merged_7['CountryName'])
    merged_7['IndicatorName']= labelEncode_arr(merged_7['IndicatorName'])
    merged_7= merged_7.drop(columns=['CountryCode', 'IndicatorCode', 'name'])

    kmeans= KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=800, 
               n_clusters=k, n_init=20, n_jobs=1, precompute_distances='auto', 
               random_state=None, tol=0.0001, verbose=0)

    X= np.array(merged_7.drop(['grade'], 1).astype(float))
    y=merged_7['grade']

    kmeans.fit(X)
    return calcAccuracy(kmeans, X,y)

In [139]:
predict_8(target,5)

bef 10
0.5003538570417552


In [None]:
predict_7(target,5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [69]:
merged=merge_8(data[data['IndicatorName']==target_id])
t=make_grade(merged, 2)


Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value,name,"Adolescent fertility rate (births per 1,000 women ages 15-19)","Birth rate, crude (per 1,000 people)","Net enrolment rate, secondary, both sexes (%)","Net enrolment rate, secondary, female (%)","Net enrolment rate, secondary, male (%)",Prevalence of anemia among children (% of children under 5),grade
10,Heavily indebted poor countries (HIPC),HPC,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,2000,131.718540,Heavily indebted poor countries (HIPC)_2000,131.718540,42.952911,17.257088,14.603803,19.892557,70.960171,1.0
15,Least developed countries: UN classification,LDC,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,2000,119.138024,Least developed countries: UN classification_2000,119.138024,38.604856,23.962164,21.846199,26.045223,68.058792,1.0
17,Low income,LIC,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,2000,125.876992,Low income_2000,125.876992,41.740544,17.811901,15.033191,20.574837,68.742138,1.0
28,Sub-Saharan Africa (all income levels),SSF,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,2000,128.676948,Sub-Saharan Africa (all income levels)_2000,128.676948,41.681171,20.800966,18.987701,22.596594,70.645349,1.0
29,Sub-Saharan Africa (developing only),SSA,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,2000,128.683435,Sub-Saharan Africa (developing only)_2000,128.683435,41.685870,20.792463,18.985275,22.582077,70.645572,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1372,Madagascar,MDG,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,2011,125.128400,Madagascar_2011,125.128400,35.173000,29.812330,29.798660,29.826040,50.000000,1.0
1373,Malawi,MWI,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,2011,143.457600,Malawi_2011,143.457600,40.531000,28.977949,28.542721,29.407009,65.600000,1.0
1375,Mali,MLI,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,2011,178.709400,Mali_2011,178.709400,45.338000,34.488689,28.422960,40.279839,80.100000,1.0
1379,Mozambique,MOZ,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,2011,157.713800,Mozambique_2011,157.713800,40.590000,17.253139,16.657150,17.851070,66.500000,1.0


In [92]:
t.grade.unique()

array([ 0.,  1., nan])

In [18]:
### rel_columns_7
kmeans= KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=800, 
               n_clusters=2, n_init=20, n_jobs=1, precompute_distances='auto', 
               random_state=None, tol=0.0001, verbose=0)
kmeans.fit(X)

calcAccuracy(X,y)

NameError: name 'X' is not defined

In [None]:
'''
선택지
1. rel_columns_8 or 7
2. 몇개 미만이면 제외할지
3. kmeans parameter
'''