In [1]:
import numpy as np
import pandas as pd

In [2]:
raw_data = {'name' : ['kevin', 'saily', 'hoyeon', 'lux'], \
            'height' : [178.2, 162.9, 160.6, 156.2], \
            'gender' : ['male', 'female', 'female', None]} # None 아무것도 아닌 것 = NULL

pd_data = pd.DataFrame(raw_data)
pd_data.head()

Unnamed: 0,name,height,gender
0,kevin,178.2,male
1,saily,162.9,female
2,hoyeon,160.6,female
3,lux,156.2,


In [4]:
# dropna()로 결측치(None) 제거하기
filtered_data = pd_data.dropna() # axis = 0는 행, axis = 1은 컬럼
filtered_data.head()

Unnamed: 0,name,height,gender
0,kevin,178.2,male
1,saily,162.9,female
2,hoyeon,160.6,female


In [5]:
# del을 통해 특정 컬럼 제거하기.
del(filtered_data['name']) # filtered_data.drop('name', axis = 1)
filtered_data.head()

Unnamed: 0,height,gender
0,178.2,male
1,162.9,female
2,160.6,female


In [8]:
# 주어진 3개의 데이터 instance를 기반으로 데이터를 무작위 생성
female_mean = np.average(filtered_data[filtered_data['gender'] =='female']['height'].values) 
male_mean = np.average(filtered_data[filtered_data['gender'] =='male']['height'].values)

# type 중요! 인풋 - 결과값 -> [column]은 Series, .value 하면 nd.array로 반환!! 
print(type(filtered_data[filtered_data['gender'] =='female']['height'].values))
                       

print(female_mean, male_mean) # np는 floot 형(return) 짐작 가능. 

# 우리는 식에 대한 이해와 더불어, 속 깊은 의미에 대해 이해를 해야함. ex) value형에 대한 반환값 list, floot 등.. 이해!

<class 'numpy.ndarray'>
161.75 178.2


In [9]:
# fake data 생성
np.random.seed(0)
variance = 3
female_heights = variance * np.random.randn(200) + female_mean # broad casting 적용
male_heights = variance * np.random.randn(200) + male_mean # np.array로 반환  

# 차이
# list[0,0] * 2 = [0,0,0,0] 일때는 리스트 일때임!
# array[1,1] * 2 = [2,2] 일때는 array일 때임!

print(female_heights[:10])

# randn(m,n) : 평균 0, 편차 1의 가우시안 표준정규분포 난수를 matrix(m,n)생성 array 배열 
# * 3을 한 것은 

[167.04215704 162.95047163 164.68621395 168.4726796  167.35267397
 158.81816636 164.60026525 161.29592838 161.44034344 162.98179551]


In [10]:
generated_data = {'gender' : ['female'] * 200 + ['male'] * 200, 'height' : list(female_heights) + list(male_heights)}

filtered_data = filtered_data.append(pd.DataFrame(generated_data), ignore_index = True)
filtered_data.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Unnamed: 0,gender,height
0,male,178.2
1,female,162.9
2,female,160.6
3,female,167.042157
4,female,162.950472


In [11]:
filtered_data.shape

(403, 2)

In [13]:
filtered_data['gender'].value_counts()

female    202
male      201
Name: gender, dtype: int64

In [15]:
# 레이블 인코더 fit_transform 
from sklearn import preprocessing

le_gender = preprocessing.LabelEncoder()
final_data = filtered_data.copy()
filtered_data['gender'] = le_gender.fit_transform(filtered_data['gender'])

filtered_data.tail()

Unnamed: 0,gender,height
398,1,175.915523
399,1,180.773772
400,1,181.623306
401,1,182.599736
402,1,180.757656


#### Linear regression model 적용

In [20]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

features = ['height']

kf = KFold(n_splits = 5, shuffle = True) # 5등분, 셔플링 O

accrs = [] # accuracy 값
fold_idx = 1
for train_idx, test_idx in kf.split(final_data) :
    print("Fold {}".format(fold_idx))
    train_d, test_d = final_data.iloc[train_idx], final_data.iloc[test_idx] # 
    # type 을 확인해가면서 확인하기 !
    train_y = train_d['gender']
    train_x = train_d[features]
    
    test_y = test_d['gender']
    test_x = test_d[features]
    
    model = LinearRegression()
    model.fit(train_x, train_y)
    
    mean_accr = model.score(test_x, test_y) # 스코어 비교
    print(mean_accr)
    accrs.append(mean_accr)
    
    fold_idx += 1
    
print(np.average(accrs))

Fold 1
0.8775455225508926
Fold 2
0.8753348694452203
Fold 3
0.8724916681752231
Fold 4
0.8573155889559668
Fold 5
0.889475493349478
0.8744326284953562


In [22]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

features = ['height']

kf = KFold(n_splits = 5, shuffle = True) # 5등분, 셔플링 O

accrs = [] # accuracy 값
fold_idx = 1
for train_idx, test_idx in kf.split(final_data) :
    print("Fold {}".format(fold_idx))
    train_d, test_d = final_data.iloc[train_idx], final_data.iloc[test_idx] # 
    # type 을 확인해가면서 확인하기 !
    train_y = train_d['gender']
    train_x = train_d[features]
    
    test_y = test_d['gender']
    test_x = test_d[features]
    
    model = LogisticRegression(solver = 'lbfgs') # 최적화 알고리즘
    model.fit(train_x, train_y)
    
    mean_accr = model.score(test_x, test_y) # 스코어 비교
    print(mean_accr)
    accrs.append(mean_accr)
    
    fold_idx += 1
    
print(np.average(accrs))

Fold 1
1.0
Fold 2
1.0
Fold 3
1.0
Fold 4
1.0
Fold 5
1.0
1.0
