# 데이터전처리

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

In [4]:
df = sns.load_dataset('titanic')

## 결측치처리

In [5]:
# NaN값 조회
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [9]:
# null이 아닌 데이터갯수 합
df.notnull().sum()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

In [10]:
df.notnull().sum(axis = 1)

0      14
1      15
2      14
3      15
4      14
       ..
886    14
887    15
888    13
889    15
890    14
Length: 891, dtype: int64

In [12]:
# for문으로 각 열 NaN갯수 계산

missing_df = df.isnull()

for col in missing_df.columns:
    missing_count = missing_df[col].value_counts()
    try:
        print(col, ':', missing_count[True])
    except:
        print(col, ':', missing_count[0])

survived : 891
pclass : 891
sex : 891
age : 177
sibsp : 891
parch : 891
fare : 891
embarked : 2
class : 891
who : 891
adult_male : 891
deck : 688
embark_town : 2
alive : 891
alone : 891


In [7]:
df['deck'].unique()

[NaN, 'C', 'E', 'G', 'D', 'A', 'B', 'F']
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

In [8]:
# deck열의 NaN갯수 계산기
df['deck'].value_counts(dropna = False)

NaN    688
C       59
B       47
D       33
E       32
A       15
F       13
G        4
Name: deck, dtype: int64

### 결측값 버리기

In [13]:
# Nan값이 500개 이상인 열 모두 삭제
# dropna는 하나만 없어도 삭제함. thresh값을 주어야함

df_tresh = df.dropna(axis = 1, thresh = 500, inplace = True)  

In [14]:
# age 열, 데이터가 없는 모든 행을 삭제

df_age = df.dropna(subset = ['age'], how = 'any', axis = 0)

### 결측값 대체하기

In [15]:
# 평균으로 대체

mean_age = df['age'].mean(axis = 0)    # Nan값 제외하고 mean 계산
df['age'].fillna(mean_age, inplace = True)

In [16]:
# 최빈값으로 대체

df['embark_town'].value_counts()
most_freq_one = df['embark_town'].value_counts(dropna = True).idxmax()
df['embark_town'].fillna(most_freq_one, inplace = True)

In [17]:
# 바로 앞의 값으로 변경

df['embarked'].fillna(method = 'ffill', inplace = True)

## 중복값 처리

### 중복값 찾기

In [19]:
df = pd.DataFrame({'c1':['a','a','b','a','b'],
                   'c2':[1,1,1,2,2],
                   'c3':[1,1,2,2,2]})
df.duplicated()

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [20]:
# 특정 열에서 데이터 중복값 찾기
df['c2'].duplicated()

0    False
1     True
2     True
3    False
4     True
Name: c2, dtype: bool

### 중복값 삭제

In [21]:
df.drop_duplicates()

Unnamed: 0,c1,c2,c3
0,a,1,1
2,b,1,2
3,a,2,2
4,b,2,2


In [22]:
df.drop_duplicates(subset = ['c2','c3'])

Unnamed: 0,c1,c2,c3
0,a,1,1
2,b,1,2
3,a,2,2


## 값 변환하기 conversion

In [23]:
auto = pd.read_csv('auto-mpg.csv', header = None)
auto.columns = ['mpg','cylinders','displacement','horsepower','weight','acceleration','model year','origin','name']

### 파생변수

In [24]:
# 파생변수
auto['kpl'] = auto['mpg']*0.425

# 반올림
auto['kpl'] = auto['kpl'].round(2)

### 누락데이터(?) 삭제

In [37]:
auto['horsepower'].replace('?', np.nan, inplace = True)           # ?값을 nan으로 변환
auto.dropna(subset = ['horsepower'], axis = 0, inplace = True)    # horsepower기준으로 nan값 빼줌

### 데이터타입 변경

In [38]:
auto['horsepower'] = auto['horsepower'].astype('float')

In [39]:
# 문자열 자료형을 범주형 자료형으로 변환
auto['origin'] = auto['origin'].astype('category')
auto['origin'].dtypes

CategoricalDtype(categories=[1, 2, 3], ordered=False)

### 데이터를 categoy형으로 나누기

In [40]:
# horsepower를 3개 등급으로 나누기
# np.histogram 함수로 3개의 bin으로 나누는 경계값 리스트 구하기
count, bin_dividers = np.histogram(auto['horsepower'], bins= 3)
bin_dividers, count


(array([ 46.        , 107.33333333, 168.66666667, 230.        ]),
 array([257, 103,  32], dtype=int64))

In [41]:
np.histogram(auto['horsepower'], bins = 3)

(array([257, 103,  32], dtype=int64),
 array([ 46.        , 107.33333333, 168.66666667, 230.        ]))

In [42]:
bin_name = ['low power', 'mid power', 'high power']

# cut 함수로 각 데이터 쪼개기
auto['hp_bin'] = pd.cut(x = auto['horsepower'],
                        bins = bin_dividers,    # 경계선 값 리스트
                        labels = bin_name,      # 각 구간 이름
                        include_lowest = True   # 첫 경계선값 포함하는 것으로 함
                        )
auto[['horsepower','hp_bin']].head(10)

Unnamed: 0,horsepower,hp_bin
0,130.0,mid power
1,165.0,mid power
2,150.0,mid power
3,150.0,mid power
4,140.0,mid power
5,198.0,high power
6,220.0,high power
7,215.0,high power
8,225.0,high power
9,190.0,high power


## 원핫인코딩

In [33]:
from sklearn import preprocessing

# 전처리를 위한 encoder객체 만들기
label_encoder = preprocessing.LabelEncoder()
onehot_encoder = preprocessing.OneHotEncoder()

In [46]:
#label encoder로 문자열 범주를 숫자형 범주로 변환

onehot_labeled = label_encoder.fit_transform(auto['hp_bin'].head(15))
onehot_labeled

array([2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 2, 2, 0, 1])

In [48]:
# 2차원 행렬형태
onehot_reshaped = onehot_labeled.reshape(len(onehot_labeled), 1)
onehot_reshaped

array([[2],
       [2],
       [2],
       [2],
       [2],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [2],
       [2],
       [0],
       [1]])

In [50]:
# 희소행렬로 변환
onehot_fitted = onehot_encoder.fit_transform(onehot_reshaped)
print(onehot_fitted)

  (0, 2)	1.0
  (1, 2)	1.0
  (2, 2)	1.0
  (3, 2)	1.0
  (4, 2)	1.0
  (5, 0)	1.0
  (6, 0)	1.0
  (7, 0)	1.0
  (8, 0)	1.0
  (9, 0)	1.0
  (10, 0)	1.0
  (11, 2)	1.0
  (12, 2)	1.0
  (13, 0)	1.0
  (14, 1)	1.0


## TIME SERIES

In [51]:
df = pd.read_csv('stock-data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    20 non-null     object
 1   Close   20 non-null     int64 
 2   Start   20 non-null     int64 
 3   High    20 non-null     int64 
 4   Low     20 non-null     int64 
 5   Volume  20 non-null     int64 
dtypes: int64(5), object(1)
memory usage: 1.1+ KB


In [52]:
df['new_date'] = pd.to_datetime(df['Date'])
df.head(3)

Unnamed: 0,Date,Close,Start,High,Low,Volume,new_date
0,2018-07-02,10100,10850,10900,10000,137977,2018-07-02
1,2018-06-29,10700,10550,10900,9990,170253,2018-06-29
2,2018-06-28,10400,10900,10950,10150,155769,2018-06-28


In [53]:
df.set_index('new_date', inplace = True)

### datetime

In [54]:
dates = ['2019-01-01','2020-03-01','2021-06-01']
ts_date = pd.to_datetime(dates)
ts_date

DatetimeIndex(['2019-01-01', '2020-03-01', '2021-06-01'], dtype='datetime64[ns]', freq=None)

### timestamp

In [58]:
pr_day = ts_date.to_period(freq = 'D')      # day
pr_month = ts_date.to_period(freq = 'M')    # month
pr_year = ts_date.to_period(freq = 'A')     # annual

In [60]:
pr_day, pr_month, pr_year

(PeriodIndex(['2019-01-01', '2020-03-01', '2021-06-01'], dtype='period[D]'),
 PeriodIndex(['2019-01', '2020-03', '2021-06'], dtype='period[M]'),
 PeriodIndex(['2019', '2020', '2021'], dtype='period[A-DEC]'))

In [57]:
# 월 간격, 월 시작일 기준
ts_m = pd.date_range(start = '2019-01-01',   # 날짜범위 시작
                   end = None,
                   periods = 6,              # 생성할 timestamp의 갯수
                   freq = 'MS',              # 시간간격: 월의 시작일
                   tz = 'Asia/Seoul')        # 시간, 지역
ts_m

DatetimeIndex(['2019-01-01 00:00:00+09:00', '2019-02-01 00:00:00+09:00',
               '2019-03-01 00:00:00+09:00', '2019-04-01 00:00:00+09:00',
               '2019-05-01 00:00:00+09:00', '2019-06-01 00:00:00+09:00'],
              dtype='datetime64[ns, Asia/Seoul]', freq='MS')

In [62]:
# 월간격, 월 마지막날 기준
pd.date_range(start = '2019-01-01', periods = 6, freq = "M", tz = 'Asia/Seoul')

DatetimeIndex(['2019-01-31 00:00:00+09:00', '2019-02-28 00:00:00+09:00',
               '2019-03-31 00:00:00+09:00', '2019-04-30 00:00:00+09:00',
               '2019-05-31 00:00:00+09:00', '2019-06-30 00:00:00+09:00'],
              dtype='datetime64[ns, Asia/Seoul]', freq='M')

In [61]:
# 3개월 기준

pd.date_range(start = '2019-01-01', periods = 6, freq = "3M", tz = 'Asia/Seoul')

DatetimeIndex(['2019-01-31 00:00:00+09:00', '2019-04-30 00:00:00+09:00',
               '2019-07-31 00:00:00+09:00', '2019-10-31 00:00:00+09:00',
               '2020-01-31 00:00:00+09:00', '2020-04-30 00:00:00+09:00'],
              dtype='datetime64[ns, Asia/Seoul]', freq='3M')

## Apply / Applymap / Map

In [63]:
titanic = sns.load_dataset('titanic')
df = titanic.loc[:,['age','fare']]

In [66]:
df.head(3)

Unnamed: 0,age,fare
0,22.0,7.25
1,38.0,71.2833
2,26.0,7.925


In [64]:
def add_10(x):
    return x+10
df.applymap(add_10)

Unnamed: 0,age,fare
0,32.0,17.2500
1,48.0,81.2833
2,36.0,17.9250
3,45.0,63.1000
4,45.0,18.0500
...,...,...
886,37.0,23.0000
887,29.0,40.0000
888,,33.4500
889,36.0,40.0000


In [67]:
def missing_value(series):
    return series.isnull()    # boolean 시리즈로 변환
df.apply(missing_value, axis = 0)

Unnamed: 0,age,fare
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
886,False,False
887,False,False
888,True,False
889,False,False


In [68]:
def min_max(x):
    return x.max() - x.min()
df.apply(min_max)

age      79.5800
fare    512.3292
dtype: float64

In [71]:
def add_two_obj(a,b):
    return a+b
df.apply(lambda x: add_two_obj(x['age'], x['fare']), axis = 1)

0       29.2500
1      109.2833
2       33.9250
3       88.1000
4       43.0500
         ...   
886     40.0000
887     49.0000
888         NaN
889     56.0000
890     39.7500
Length: 891, dtype: float64

In [72]:
filename = '서울시CCTV설치운영현황(자치구)_년도별_211231기준.csv'
cctv = pd.read_csv(filename, skiprows = 1, encoding = 'EUC-KR')

### . 없애기

In [74]:
cctv['총계'] = cctv['총계'].apply(lambda x: int(x.replace(',','')))

In [75]:
cctv['2012년'].fillna('0', inplace = True)
cctv['2013년'].fillna('0', inplace = True)
for col in cctv.columns[2:]:
    cctv[col] = cctv[col].apply(lambda x:int(x.replace(',','')))
cctv.head()

Unnamed: 0,구분,총계,2012년 이전,2012년,2013년,2014년,2015년,2016년,2017년,2018년,2019년,2020년,2021년
0,계,83557,4812,1851,3434,4295,6840,8708,11572,10627,12267,11247,7904
1,종로구,1715,815,0,0,195,150,0,261,85,9,200,0
2,중 구,2447,16,114,87,77,236,240,372,386,155,361,403
3,용산구,2611,34,71,234,125,221,298,351,125,307,617,228
4,성동구,3829,163,144,208,107,325,255,967,415,490,472,283


## 데이터프레임 연결

### CONCAT

In [76]:
# pd.concat(데이터프레임리스트, axis = 축)
df1 = pd.DataFrame([['a',1], ['b',2]], columns = ['letter', 'number'])
df2 = pd.DataFrame([['c',3], ['d',4]], columns = ['letter', 'number'])
df3 = pd.DataFrame([['e',5,'!'], ['f',6,'@']], columns = ['letter', 'number','etc'])

# 컬럼기준으로 연결
pd.concat([df1,df2,df3])    # 디폴트: axis = 0 컬럼기준

Unnamed: 0,letter,number,etc
0,a,1,
1,b,2,
0,c,3,
1,d,4,
0,e,5,!
1,f,6,@


In [None]:
# row명을 기준으로 연결
pd.concat([df1,df2,df3], axis = 1)   # axis = 1이면 row기준

# 공통된 컬럼만 남기기
pd.concat([df1,df2,df3], join = 'inner')   

# 인덱스 재지정
pd.concat([df1, df2, df3], join = 'inner', ignore_index = True)

df.reindex

# 모든 컬럼 남기기
pd.concat([df4, df5, df6], axis = 1, join = 'outer')

### MERGE

merge는 디폴트가 겹치는 것만 출력(Nan값 없음), concat은 전부다 묶어서 출력(Nan값 있을 수 있음)

In [None]:
pd.merge(sc1, sc2, on = 'name', how = 'inner')    # 디폴트가 inner

pd.merge(sc1, sc2, on = 'name', how = 'outer')    # outer 모두 다 표시

pd.merge(sc1, sc2, on = 'name', how = 'left')

pd.merge(sc1, sc2, on = 'name', how = 'right')

## 데이터 재구조화

### MELT

index가 쭉 나오고, 데이터 전체가 value값으로 출력됨, column 먼저

In [77]:
score = pd.read_csv('scores.csv')
score.head(2)

Unnamed: 0,name,kor,eng,math
0,Aiden,100.0,90.0,95.0
1,Charles,90.0,80.0,75.0


In [78]:
score.melt()

Unnamed: 0,variable,value
0,name,Aiden
1,name,Charles
2,name,Danial
3,name,Evan
4,name,Henry
...,...,...
115,math,95.0
116,math,100.0
117,math,100.0
118,math,70.0


In [79]:
pd.melt(score)      # score.melt()와 같은 명령어

Unnamed: 0,variable,value
0,name,Aiden
1,name,Charles
2,name,Danial
3,name,Evan
4,name,Henry
...,...,...
115,math,95.0
116,math,100.0
117,math,100.0
118,math,70.0


In [80]:
score.melt(id_vars = ['name'])    # 고정 컬럼 지정, name고정하고 melting

Unnamed: 0,name,variable,value
0,Aiden,kor,100.0
1,Charles,kor,90.0
2,Danial,kor,95.0
3,Evan,kor,100.0
4,Henry,kor,
...,...,...,...
85,Vanessa,math,95.0
86,Viviana,math,100.0
87,Vikkie,math,100.0
88,Winnie,math,70.0


In [81]:
score.melt(id_vars = 'name', value_vars = 'kor')   # value 변수 선택도 가능

Unnamed: 0,name,variable,value
0,Aiden,kor,100.0
1,Charles,kor,90.0
2,Danial,kor,95.0
3,Evan,kor,100.0
4,Henry,kor,
5,Ian,kor,90.0
6,James,kor,70.0
7,Julian,kor,80.0
8,Justin,kor,50.0
9,Kevin,kor,100.0


In [82]:
score.melt(id_vars = 'name', value_vars = ['kor','math'], var_name = 'subject', value_name = 'score')   

Unnamed: 0,name,subject,score
0,Aiden,kor,100.0
1,Charles,kor,90.0
2,Danial,kor,95.0
3,Evan,kor,100.0
4,Henry,kor,
5,Ian,kor,90.0
6,James,kor,70.0
7,Julian,kor,80.0
8,Justin,kor,50.0
9,Kevin,kor,100.0


### sorting해서 정렬

In [83]:
df = score.melt(id_vars = 'name', var_name = 'subject', value_name = 'score')

In [85]:
def get_grade(x):
    if x >= 90:
        grade = 'A'
    elif x >= 80:
        grade = 'B'
    elif x >= 70:
        grade = 'C'
    elif x >= 60:
        grade = 'D'
    else:
        grade = 'F'
    return grade

df['grade'] = df['score'].apply(get_grade)

In [87]:
df.sort_values('name', ascending = False)

Unnamed: 0,name,subject,score,grade
89,Zuly,math,95.0,A
59,Zuly,eng,90.0,A
29,Zuly,kor,80.0,B
88,Winnie,math,70.0,C
58,Winnie,eng,100.0,A
...,...,...,...,...
43,Amy,eng,75.0,C
13,Amy,kor,90.0,A
60,Aiden,math,95.0,A
30,Aiden,eng,90.0,A


### PIVOT

In [88]:
# dataframe, pivot(index = 인덱스로 사용할 컬럼, columns = 컬럼으로 사용할 컬럼, values = 값으로 사용할 컬럼)
df.pivot(index = 'name', columns = 'subject',  values = 'score')

subject,eng,kor,math
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aiden,90.0,100.0,95.0
Amy,75.0,90.0,90.0
Charles,80.0,90.0,75.0
Chloe,100.0,95.0,95.0
Danial,100.0,95.0,100.0
Danna,100.0,100.0,100.0
Ellen,60.0,,
Emma,65.0,70.0,70.0
Evan,100.0,100.0,100.0
Henry,35.0,,60.0


In [89]:
df.pivot(index = 'name', columns = 'subject',  values = ['score','grade']).head(10)

Unnamed: 0_level_0,score,score,score,grade,grade,grade
subject,eng,kor,math,eng,kor,math
name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Aiden,90.0,100.0,95.0,A,A,A
Amy,75.0,90.0,90.0,C,A,A
Charles,80.0,90.0,75.0,B,A,C
Chloe,100.0,95.0,95.0,A,A,A
Danial,100.0,95.0,100.0,A,A,A
Danna,100.0,100.0,100.0,A,A,A
Ellen,60.0,,,D,F,F
Emma,65.0,70.0,70.0,D,C,C
Evan,100.0,100.0,100.0,A,A,A
Henry,35.0,,60.0,F,F,D


### PIVOT_TABLE

In [92]:
df = pd.DataFrame({"item": ["shirts", "shirts", "shirts", "shirts", "shirts",
                          "pants", "pants", "pants", "pants"],
                    "color": ["white", "white", "white", "black", "black",
                          "white", "white", "black", "black"],
                   "size": ["small", "large", "large", "small",
                          "small", "large", "small", "small",
                         "large"],
                   "sale": [1, 2, 2, 3, 3, 4, 5, 6, 7],
                   "inventory": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
df.head(3)

Unnamed: 0,item,color,size,sale,inventory
0,shirts,white,small,1,2
1,shirts,white,large,2,4
2,shirts,white,large,2,5


In [91]:
df.pivot_table(index= 'item', columns = 'size', values = 'inventory', aggfunc = 'sum')

size,large,small
item,Unnamed: 1_level_1,Unnamed: 2_level_1
pants,15,17
shirts,9,13


In [93]:
df.pivot_table(index= ['item','color'], columns = 'size', values = 'inventory', aggfunc = 'sum')

Unnamed: 0_level_0,size,large,small
item,color,Unnamed: 2_level_1,Unnamed: 3_level_1
pants,black,9.0,9.0
pants,white,6.0,8.0
shirts,black,,11.0
shirts,white,9.0,2.0


### GROUPBY

In [99]:
titanic.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [94]:
titanic.groupby('pclass').survived.count()

pclass
1    216
2    184
3    491
Name: survived, dtype: int64

In [95]:
# 데이터프레임으로 변경
titanic.groupby('pclass').survived.count().to_frame()

Unnamed: 0_level_0,survived
pclass,Unnamed: 1_level_1
1,216
2,184
3,491


In [101]:
(titanic.groupby(['sex','pclass']).survived.mean().to_frame()).round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,survived
sex,pclass,Unnamed: 2_level_1
female,1,0.968
female,2,0.921
female,3,0.5
male,1,0.369
male,2,0.157
male,3,0.135


In [102]:
def my_mean(values):
    return sum(values) / len(values)

titanic.groupby(['sex','pclass']).survived.agg(my_mean).to_frame

<bound method Series.to_frame of sex     pclass
female  1         0.968085
        2         0.921053
        3         0.500000
male    1         0.368852
        2         0.157407
        3         0.135447
Name: survived, dtype: float64>

In [103]:
titanic.groupby(['sex','pclass']).survived.agg(my_mean).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,survived
sex,pclass,Unnamed: 2_level_1
female,1,0.968085
female,2,0.921053
female,3,0.5
male,1,0.368852
male,2,0.157407
male,3,0.135447
