In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [65]:
import os

# Build the CSV path from the user's home directory.
# os.path.expanduser("~") still resolves when the HOME environment variable is
# unset (os.getenv("HOME") returns None there, and None + str raises TypeError).
csv_path = os.path.join(os.path.expanduser("~"), "workspace", "aiffel", "data", "Pokemon.csv")
original_data = pd.read_csv(csv_path)
# Work on a copy so the frame read from disk stays untouched.
pokemon = original_data.copy()

### IPython 매직 명령어(magic command)
* `%matplotlib inline` 그래프를 브라우저 내부(이 경우엔 주피터 노트북 내부!)에 바로 출력되게 함.
* `%config InlineBackend.figure_format = 'retina'`를 `%matplotlib inline` 뒤에 넣으면 그래프를 더 선명하게(고해상도로) 출력할 수 있습니다.
* `%config InlineBackend.figure_format = 'svg'`는 그래프를 SVG로 출력합니다.

### reset_index()
* default : `reset_index(drop=False)` — 기존 index를 `index`라는 새 컬럼으로 남겨 둡니다.
* `reset_index(drop=True)`: 기존 index를 컬럼으로 남기지 않고 버립니다(새 index는 0부터 다시 매겨집니다).

In [68]:
pokemon.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [69]:
pokemon.tail()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True
799,721,Volcanion,Fire,Water,600,80,110,120,130,90,70,6,True


In [6]:
# Keep only the rows flagged as legendary and renumber them from 0,
# discarding the row labels they carried in `pokemon`.
legendary = (
    pokemon
    .loc[pokemon["Legendary"] == True]
    .reset_index(drop=True)
)
print(legendary.shape)
legendary.head()

(65, 13)


Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,144,Articuno,Ice,Flying,580,90,85,100,95,125,85,1,True
1,145,Zapdos,Electric,Flying,580,90,90,85,125,90,100,1,True
2,146,Moltres,Fire,Flying,580,90,100,90,125,85,90,1,True
3,150,Mewtwo,Psychic,,680,106,110,90,154,90,130,1,True
4,150,MewtwoMega Mewtwo X,Psychic,Fighting,780,106,190,100,154,100,130,1,True


In [8]:
# reset_index() without drop=True keeps the previous row labels as a new
# "index" column, so this frame has one more column than `legendary`.
temp = pokemon[pokemon["Legendary"] == True].reset_index()
print(temp.shape)
temp.head()

(65, 14)


Unnamed: 0,index,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,156,144,Articuno,Ice,Flying,580,90,85,100,95,125,85,1,True
1,157,145,Zapdos,Electric,Flying,580,90,90,85,125,90,100,1,True
2,158,146,Moltres,Fire,Flying,580,90,100,90,125,85,90,1,True
3,162,150,Mewtwo,Psychic,,680,106,110,90,154,90,130,1,True
4,163,150,MewtwoMega Mewtwo X,Psychic,Fighting,780,106,190,100,154,100,130,1,True


In [70]:
# Same default reset_index() applied to the non-legendary rows: the old row
# labels are kept in an "index" column instead of being discarded.
temp_2 = pokemon[pokemon["Legendary"]==False].reset_index()
temp_2.head()

Unnamed: 0,index,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


위 데이터들을 다시 살펴봅시다. 우선 원본 데이터의 카피인 pokemon 데이터를 뽑아보았습니다. 컬럼으로 #, Name, Type1, Type2, Total, HP, Attack ... Legendary가 있습니다.

다음으로 할 것이 전설의 포켓몬인지 아닌지를 구분하여 각각 다른 변수에 담는 것입니다. pokemon의 `Legendary` 컬럼 값이 True인지 False인지에 따라 나눠서 담으면 될 것 같군요. Legendary 값이 참(True)인지 아닌지를 조건으로 각각 `legendary`와 `ordinary` 변수에 담아 봅시다. 노드에서 조건에 따라 변수를 나눌 때 `reset_index()`라는 메소드를 사용하네요. 이게 뭘까요?

* 원래 데이터인 pokemon의 0,1,2,3,4번째 인덱스는 전설의 포켓몬이 아닙니다. Legendary값이 False이니까요.
* 전설의 포켓몬은 전체 데이터 중 65개 뿐입니다.
* 그러면 pokemon 데이터에서 조건에 맞는 행만 처음부터 순차적으로 뽑혀서 legendary와 ordinary 변수에 담길 것이고, 따라서 legendary와 ordinary 변수에 담긴 데이터의 순서(위치)는 원래 데이터인 pokemon과는 약간 차이가 있겠지요.
* **`reset_index()`만 호출했을 때 생기는 index 컬럼의 값은 원래 데이터인 pokemon 변수의 인덱스 값입니다. 그러나 우리에게는 원래 데이터에서 몇 번째 위치였는지가 중요하지 않아요. 그래서 필요하지 않은 기존 인덱스를 버리기 위해 `reset_index(drop=True)`를 사용하는 것입니다!**

In [9]:
# Look up positional row 156 of `pokemon` — the value shown in the "index"
# column for the first row of the legendary subset.
print(pokemon.iloc[156])

#                  144
Name          Articuno
Type 1             Ice
Type 2          Flying
Total              580
HP                  90
Attack              85
Defense            100
Sp. Atk             95
Sp. Def            125
Speed               85
Generation           1
Legendary         True
Name: 156, dtype: object


In [10]:
# Keep only the rows that are not legendary and renumber them from 0,
# discarding the row labels they carried in `pokemon`.
ordinary = (
    pokemon
    .loc[pokemon["Legendary"] == False]
    .reset_index(drop=True)
)
print(ordinary.shape)
ordinary.head()

(735, 13)


Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


## Pandas Series 사용

### (1)

In [13]:
pokemon["Name"]

0                  Bulbasaur
1                    Ivysaur
2                   Venusaur
3      VenusaurMega Venusaur
4                 Charmander
               ...          
795                  Diancie
796      DiancieMega Diancie
797      HoopaHoopa Confined
798       HoopaHoopa Unbound
799                Volcanion
Name: Name, Length: 800, dtype: object

In [14]:
pokemon.Name

0                  Bulbasaur
1                    Ivysaur
2                   Venusaur
3      VenusaurMega Venusaur
4                 Charmander
               ...          
795                  Diancie
796      DiancieMega Diancie
797      HoopaHoopa Confined
798       HoopaHoopa Unbound
799                Volcanion
Name: Name, Length: 800, dtype: object

In [17]:
print(type(pokemon["Name"]))
print(type(pokemon.Name))
print(type(pokemon))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


pandas Series와 dataframe의 차이는 무엇일까요ㅎㅎ pokemon을 출력해서 어떻게 생겼는지 보고, pokemon.Name을 출력해서 어떻게 생겼는지 보면 금방 감이 올 것 같아요. 아래 예시를 볼까요?

In [73]:
# dataframe
print(type(pokemon))
pokemon.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [74]:
print(type(pokemon['#']))
pokemon['#'].head()

<class 'pandas.core.series.Series'>


0    1
1    2
2    3
3    3
4    4
Name: #, dtype: int64

In [75]:
print(type(pokemon['Name']))
pokemon['Name'].head()

<class 'pandas.core.series.Series'>


0                Bulbasaur
1                  Ivysaur
2                 Venusaur
3    VenusaurMega Venusaur
4               Charmander
Name: Name, dtype: object

In [76]:
print(type(pokemon['HP']))
pokemon['HP'].head()

<class 'pandas.core.series.Series'>


0    45
1    60
2    80
3    80
4    39
Name: HP, dtype: int64

### (2) 조건에 맞으면 True 반환, 조건이 맞는 값만 출력

In [15]:
pokemon.Total

0      318
1      405
2      525
3      625
4      309
      ... 
795    600
796    700
797    600
798    680
799    600
Name: Total, Length: 800, dtype: int64

In [18]:
pokemon.Total > 500

0      False
1      False
2       True
3       True
4      False
       ...  
795     True
796     True
797     True
798     True
799     True
Name: Total, Length: 800, dtype: bool

In [19]:
pokemon[pokemon.Total > 500]  # keep only rows whose Total is strictly greater than 500 (rows with exactly 500 are excluded)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
6,6,Charizard,Fire,Flying,534,78,84,78,109,85,100,1,False
7,6,CharizardMega Charizard X,Fire,Dragon,634,78,130,111,130,85,100,1,False
8,6,CharizardMega Charizard Y,Fire,Flying,634,78,104,78,159,115,100,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True


In [20]:
pokemon[pokemon.Total == 500]  # keep only rows whose Total is exactly 500 (where Total == 500 is True)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
60,55,Golduck,Water,,500,80,82,78,95,80,85,1,False
70,65,Alakazam,Psychic,,500,55,50,45,135,95,120,1,False
84,78,Rapidash,Fire,,500,65,100,70,80,80,105,1,False
96,89,Muk,Poison,,500,105,105,75,65,100,50,1,False
101,94,Gengar,Ghost,Poison,500,60,65,60,130,75,110,1,False
132,123,Scyther,Bug,Flying,500,70,110,80,55,80,105,1,False
136,127,Pinsir,Bug,,500,65,125,100,55,70,85,1,False
201,186,Politoed,Water,,500,90,75,75,90,100,70,2,False
228,212,Scizor,Bug,Steel,500,70,130,100,55,80,65,2,False
231,214,Heracross,Bug,Fighting,500,80,125,75,40,95,85,2,False


In [21]:
pokemon[pokemon.Legendary == True]  # keep only rows where the Legendary column is True

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
156,144,Articuno,Ice,Flying,580,90,85,100,95,125,85,1,True
157,145,Zapdos,Electric,Flying,580,90,90,85,125,90,100,1,True
158,146,Moltres,Fire,Flying,580,90,100,90,125,85,90,1,True
162,150,Mewtwo,Psychic,,680,106,110,90,154,90,130,1,True
163,150,MewtwoMega Mewtwo X,Psychic,Fighting,780,106,190,100,154,100,130,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True


### (3) 결측치 확인 : `isnull().sum()`

In [23]:
pokemon.isnull()  # element-wise missing-value mask: True where a cell is null, False otherwise

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,False,False,False,False,False,False,False,False,False,False,False,False,False
796,False,False,False,False,False,False,False,False,False,False,False,False,False
797,False,False,False,False,False,False,False,False,False,False,False,False,False
798,False,False,False,False,False,False,False,False,False,False,False,False,False


In [22]:
pokemon.isnull().sum()  # the full boolean mask is hard to scan, so count the nulls per column instead

#               0
Name            0
Type 1          0
Type 2        386
Total           0
HP              0
Attack          0
Defense         0
Sp. Atk         0
Sp. Def         0
Speed           0
Generation      0
Legendary       0
dtype: int64

pokemon에 담겨있는 데이터의 경우 Type 2 컬럼에만 null값(결측치, 데이터 손실)이 총 386개 있네요. Type 2에 대해서만 생각을 하면 되겠군요. 나머지 컬럼은 null값이 없습니다!
* 데이터 분석에서 null값은 적절하게 처리해줘야 합니다.

### (4) 고유한 값 찾기 : pandas `unique()`
`Series.unique()` 메소드는 고유한 class(label 혹은 정답)가 무엇이 있는지, 혹은 그 개수가 몇 개인지 찾을 때 유용하게 쓰입니다. (결과는 numpy 배열로 반환됩니다.)

In [24]:
len(set(pokemon["#"]))

721

In [26]:
pokemon["#"].unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 18

### (5) isna() vs isnull()

In [27]:
pokemon["Type 2"].isna()

0      False
1      False
2      False
3      False
4       True
       ...  
795    False
796    False
797    False
798    False
799    False
Name: Type 2, Length: 800, dtype: bool

In [30]:
pokemon["Type 2"].isnull()

0      False
1      False
2      False
3      False
4       True
       ...  
795    False
796    False
797    False
798    False
799    False
Name: Type 2, Length: 800, dtype: bool

In [28]:
pokemon["Type 2"].isna().sum()

386

In [29]:
pokemon["Type 2"].isnull().sum()

386

* 사실 `isna()`와 `isnull()`은 같은 메소드입니다ㅎㅎ
* [Stack overflow](https://stackoverflow.com/questions/52086574/pandas-isna-and-isnull-what-is-the-difference)에서는 isnull이 isna의 alias(다른 이름)라고 하네요. 아래처럼 `pd.isnull`을 출력해 보면 `isna` 함수가 나오는 것으로도 확인할 수 있습니다.

In [36]:
print(pd.isnull)
print(pd.isna)

<function isna at 0x7f50d2c3c320>
<function isna at 0x7f50d2c3c320>


## 정규표현식
일반적으로 우리가 필요한 데이터는 '숫자' 혹은 '명사'의 형태가 대부분이며, 특수문자는 필요 없는 경우가 많습니다. 많은 경우 데이터 정제를 위해 특수문자를 제거하는데요, 여기서 정규표현식이 아주 유용하게 쓰입니다.

In [37]:
import re

* sub: 문자열 치환
* re.sub(a, b, data) : data 내의 a를 b로 변환
    * 예: `re.sub('a', 'b', 'data')` `>>> 'dbtb'`
* findall : 패턴이 일치하는 모든 문자를 찾아서 리스트에 반환
* re.findall(pattern, string)
* compile : 찾고자 하는 문자열의 패턴을 정의
* 보통 정규표현식을 사용할 때는 `compile`을 통해 패턴을 정의한 후, 정의된 패턴과 매칭되는 경우를 찾아 다양한 처리(findall을 하든지, sub를 하든지...)를 합니다.

In [40]:
def cleanText(data):
    """Strip punctuation and special symbols from a string.

    Every character listed in the character class below is deleted; letters,
    digits, whitespace and any character not listed are kept as they are.

    Args:
        data: The text to clean.

    Returns:
        A copy of ``data`` with the listed special characters removed.
    """
    # Raw string: the regex escapes reach the regex engine untouched instead of
    # being treated as (invalid) Python string escapes, which emits a warning.
    # ``\\`` is a literal backslash in the class, so backslashes are removed
    # too; the previous non-raw literal collapsed it into an escape of the
    # following quote character and left backslashes in the text.
    special_chars = r'[-=+,#/?:^$.@*"※~&%ㆍ!』\\‘|()\[\]<>`\'…》]'
    return re.sub(special_chars, '', data)

In [42]:
ori_text = "!@#abcdefghi$$$$%%jklmn!@#$%^&*()"
print("original text: ", ori_text)
print("after: ", cleanText(ori_text))

original text:  !@#abcdefghi$$$$%%jklmn!@#$%^&*()
after:  abcdefghijklmn


In [60]:
def findText(data):
    """Placeholder that is not implemented yet.

    The argument is ignored and ``None`` is returned.
    TODO: define what text should be extracted and implement it.
    """
    return None

In [61]:
def findNum(data):
    """Find every dash-separated, three-part run of digits in a string.

    Matches substrings shaped like ``digits-digits-digits`` (for example
    ``010-1111-2222``).

    Args:
        data: The text to search.

    Returns:
        A list of the matching substrings in order of appearance; an empty
        list when nothing matches.
    """
    # Raw string, and no backslash before "-": outside a character class the
    # dash needs no escaping, and "\-" in a normal string literal is an
    # invalid escape sequence that triggers a warning.
    pattern = re.compile(r'[0-9]+-[0-9]+-[0-9]+')
    return pattern.findall(data)

In [62]:
num_text = "Dasol's phone number is 010-1111-2222 and Modulabs number is 010-2222-1234"
print("find phone number: ", findNum(num_text))

find phone number:  ['010-1111-2222', '010-2222-1234']


## Counter()

## One hot encoding