In [49]:
import os
import pandas as pd
import numpy as np

In [None]:
base_src = "./drive/MyDrive/Python/machine_learning_data"

In [None]:
os.listdir(base_src)

['friend.csv', 'new_friend.csv', 'new_friend_index_true.csv', 'abalone.data']

In [None]:
abalone_src = base_src + '/abalone.data'

In [None]:
abalone_df = pd.read_csv(base_src+'/abalone.data',
             header=None,sep=',',
             names=['sex','length','diameter','height',
                  'whole_weight','shucked_weight','viscera_weight',
                  'shell_weight','rings'])
abalone_df

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [None]:
# 데이터 shape를 확인
# 딥러닝(CNN) => 이미지 (데이터수, 가로, 세로, 흑백(1), 컬러(3))
# 딥러닝(RNN) => 텍스트,시계열 (데이터수,시간,세로)
abalone_df.shape

(4177, 9)

In [None]:
# 데이터 결측값 확인
abalone_df.isnull().sum().sum()

0

In [None]:
abalone_df.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [None]:
# 기술통계 확인 => 연속형 변수만 확인 가능 => 즉, numerical 변수만 확인 가능하다.
abalone_df.describe()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [None]:
# 전복(abalone) 성별에 따라 groupby함수를 통해 집계를 하겠습니다.
# ex)       DataFrame[집계 변수].groupby(DataFrame[집계 대상])
grouped = abalone_df['whole_weight'].groupby(abalone_df['sex'])

In [None]:
grouped.sum()

sex
F    1367.8175
I     578.8885
M    1514.9500
Name: whole_weight, dtype: float64

In [None]:
grouped.mean() # => 이상치도 반응한다.

sex
F    1.046532
I    0.431363
M    0.991459
Name: whole_weight, dtype: float64

In [None]:
grouped.size() # 몇 개있는지 확인

sex
F    1307
I    1342
M    1528
Name: whole_weight, dtype: int64

In [None]:
# 그룹변수가 하나가 아닌, 전체 연속형 변수에 대한 집계
abalone_df.groupby(abalone_df['sex']).mean()

Unnamed: 0_level_0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,0.579093,0.454732,0.158011,1.046532,0.446188,0.230689,0.30201,11.129304
I,0.427746,0.326494,0.107996,0.431363,0.191035,0.09201,0.128182,7.890462
M,0.561391,0.439287,0.151381,0.991459,0.432946,0.215545,0.281969,10.705497


In [None]:
# 다음과 같이 간단하게 표현
abalone_df.groupby('sex').mean()

Unnamed: 0_level_0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,0.579093,0.454732,0.158011,1.046532,0.446188,0.230689,0.30201,11.129304
I,0.427746,0.326494,0.107996,0.431363,0.191035,0.09201,0.128182,7.890462
M,0.561391,0.439287,0.151381,0.991459,0.432946,0.215545,0.281969,10.705497


In [None]:
abalone_df

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [None]:
# 새로운 조건에 맞는 변수 추가
abalone_df['length_bool'] = np.where(abalone_df['length']>abalone_df['length'].median(),
                                     'length_long',     # True일 경우
                                     'length_short')    # False일 경우

In [None]:
# 그룹변수를 2개 이상 선택해서 총계처리한다면?
abalone_df.groupby(['sex','length_bool']).mean() # 이 정보 과정들이 EDA

Unnamed: 0_level_0,Unnamed: 1_level_0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
sex,length_bool,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
F,length_long,0.626895,0.49302,0.169944,1.26133,0.542957,0.276945,0.360013,11.415073
F,length_short,0.477428,0.373301,0.132632,0.589702,0.24038,0.132311,0.17865,10.521531
I,length_long,0.584495,0.452952,0.150957,0.923215,0.402524,0.196912,0.273247,10.585106
I,length_short,0.40221,0.305893,0.100997,0.351234,0.156581,0.07492,0.104549,7.451473
M,length_long,0.623359,0.489291,0.16867,1.255182,0.554312,0.272203,0.351683,11.299172
M,length_short,0.454875,0.353336,0.121664,0.538157,0.224335,0.118156,0.162141,9.685053


In [None]:
# 만약 전체 다 안보고 whole_weight만 보겠다면?
# 간결하게 표현할 수 있다.
abalone_df.groupby(['sex','length_bool'])['whole_weight'].mean()

sex  length_bool 
F    length_long     1.261330
     length_short    0.589702
I    length_long     0.923215
     length_short    0.351234
M    length_long     1.255182
     length_short    0.538157
Name: whole_weight, dtype: float64

In [None]:
# 중복 데이터 삭제 결측치를 확인했다면 => 중복된 데이터도 확인하기!
# 중복된 row를 확인하는 법
abalone_df.duplicated().sum() # 여기서 isnull 했을 때 전부 다 확인 못할 때 .sum() 하면된다.
# 그렇다면 중복된 것은 하나도 없다라고 나온다.

0

In [None]:
# 중복 예제 생성을 위해서 가상으로 중복데이터 생성
new_abalone = abalone_df.iloc[[0]]
# concat()은 concatenate의 약자, 단순히 데이터를 연결하는 것
new_abalone_df = pd.concat([abalone_df,new_abalone],axis=0)
new_abalone_df

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings,length_bool
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,length_short
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,length_short
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,length_short
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,length_short
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,length_short
...,...,...,...,...,...,...,...,...,...,...
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,length_long
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,length_long
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,length_long
4176,M,0.710,0.555,0.195,1.9485,0.9455,0.3765,0.4950,12,length_long


In [None]:
new_abalone_df.duplicated().sum()

1

In [None]:
# 뒤에것은 살려두고 첫번째를 없애고 싶다면?
new_abalone_df.duplicated(keep='last')

0        True
1       False
2       False
3       False
4       False
        ...  
4173    False
4174    False
4175    False
4176    False
0       False
Length: 4178, dtype: bool

In [None]:
# 중복 데이터(row 삭제)
new_abalone_df.drop_duplicates(keep='last')
# 여기서 중복데이터가 삭제되었네 하고 new_abalone_df를 찍어보면 삭제되어있지않다.
# 그 이유는 변수 할당을 안해줬기 때문이다!
# => new_abalone_df = new_abalone_df.drop_duplicates()

# 뒤에것이 아닌 앞에것을 없애고 싶다면 똑같이 keep='last'를 해준다.
# new_abalone_df.drop_duplicates(keep='last')

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings,length_bool
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,length_short
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,length_short
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,length_short
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,length_short
5,I,0.425,0.300,0.095,0.3515,0.1410,0.0775,0.1200,8,length_short
...,...,...,...,...,...,...,...,...,...,...
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,length_long
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,length_long
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,length_long
4176,M,0.710,0.555,0.195,1.9485,0.9455,0.3765,0.4950,12,length_long


In [None]:
# NaN(결측치)를 찾아서 다른 값으로 변경
# 기존 데이터에는 결측치가 존재하지 않았다.
abalone_df.isnull().sum().sum()

0

In [None]:
# 가상으로 결측치를 만들어본다.
# 원본 데이터는 남겨둬야 하기때문에 copy를 해야한다.
nan_abalone_df = abalone_df.copy()

In [None]:
nan_abalone_df.loc[2,'length'] = np.nan

In [None]:
# 즉, 전체 데이터에서 결측치가 하나 존재한다는 뜻이다.
nan_abalone_df.isnull().sum().sum()

1

In [None]:
# 결측치를 특정 값으로 채우기
# 결측치를 0으로 채우겠다는
zero_abalone_df = nan_abalone_df.fillna(0)
# 나는 0을 넣었지만, 데이터 값은 0.000이 나왔다. 그 이유는?
# 실수가 정수를 이기기 때문이다. 실수 > 정
zero_abalone_df

In [None]:
# 결측치를 결측치가 속한 컬럼의 평균값으로 대체하기
nan_abalone_df.mean()

In [None]:
nan_abalone_df.fillna(nan_abalone_df.mean())

In [None]:
# 꽃 ...!
# apply함수 !!!!! 진짜 강추 완전 중요!!!! 이해하기
# apply 함수 활용
# DataFrame타입의 객체에서 호출가능한 apply함수에 대해 살펴보자.
# 본인이 원하는 행과 열에 연산 혹은 function을 적용할 수 있다.
# 열 기준으로 집계하고 싶은 경우 axis=0
# 행 기준으로 집계하고 싶은 경우 axis=1

In [53]:
# apply함수를 적용하면 이런식으로도 할 수 있다는 것을 보여준다.
abalone_df[['diameter']].mean()

diameter    0.407881
dtype: float64

In [52]:
# 열 기준 집계
abalone_df[['diameter']].apply(np.average,axis=0)

diameter    0.407881
dtype: float64

In [58]:
# 행 기준 집계
abalone_df[['diameter','whole_weight']].apply(np.average,axis=1)

0       0.43950
1       0.24525
2       0.54850
3       0.44050
4       0.23000
         ...   
4172    0.66850
4173    0.70300
4174    0.82550
4175    0.78975
4176    1.25175
Length: 4177, dtype: float64

In [63]:
# 사용자 함수를 통한 집계 ★★★★★ 무조건 기억하기!!!
import math
def avg_ceil(x,y,z):
    return math.ceil((x+y+z)) # ceil은 올림이다. 평균을 올려주는 것 ex) 3.7=4, 3.1=4

abalone_df[['diameter','height','whole_weight']].apply(lambda x: avg_ceil(x[0],x[1],x[2]),axis=1)


0       1
1       1
2       2
3       2
4       1
       ..
4172    2
4173    2
4174    2
4175    2
4176    3
Length: 4177, dtype: int64

In [66]:
# 문제
'''
1. 사용자 정의 함수 사용
2. ['diameter','height','whole_weight'] 변수 사용
3. 세 변수의 합이 1이 넘으면 True, 아니면 False 출력 후 answer 변수에 저장
4. abalone_df에 answer 열을 추가하고 입력
'''
def f(x): # 사용자 정의 함수!! 정말 중요하다.
    sum_ = x[0]+x[1]+x[2]
    if sum_>1:
        return True
    else:
        return False

answer = abalone_df[['diameter','height','whole_weight']].apply(lambda x: f(x),axis=1)
abalone_df['answer'] = answer

In [67]:
abalone_df

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings,length_bool,answer
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,length_short,False
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,length_short,False
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,length_short,True
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,length_short,True
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,length_short,False
...,...,...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,length_long,True
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,length_long,True
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,length_long,True
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,length_long,True


In [71]:
# 컬럼내 유니크한 값 뽑아서 갯수 확인하기 (카테고리 변수에 사용)
abalone_df['sex'].value_counts()
# 여기서 value_counts를 구하는 동시에 정렬까지 하고싶다면?
abalone_df['sex'].value_counts(ascending=True)
# 결측치를 제외한 나머지를 출력하고싶다면?
abalone_df['sex'].value_counts(dropna=True)

sex
M    1528
I    1342
F    1307
Name: count, dtype: int64

In [73]:
# 두 개의 DataFrame 합치기
# 가상 abalone 1개 row데이터 생성 및 결합
one_abalone_df = abalone_df.iloc[[0]]
pd.concat([abalone_df,one_abalone_df],axis=0)

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings,length_bool,answer
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,length_short,False
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,length_short,False
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,length_short,True
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,length_short,True
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,length_short,False
...,...,...,...,...,...,...,...,...,...,...,...
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,length_long,True
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,length_long,True
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,length_long,True
4176,M,0.710,0.555,0.195,1.9485,0.9455,0.3765,0.4950,12,length_long,True


In [76]:
# 컬럼으로 붙이기
one_abalone_df = abalone_df.iloc[:,[0]]
pd.concat([abalone_df,one_abalone_df],axis=1)

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings,length_bool,answer,sex.1
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,length_short,False,M
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,length_short,False,M
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,length_short,True,F
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,length_short,True,M
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,length_short,False,I
...,...,...,...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,length_long,True,F
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,length_long,True,M
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,length_long,True,M
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,length_long,True,F
