# Series/DataFrame 만들기

<hr>

In [1]:
import pandas as pd
import numpy as np

## Series 
<pre>
        data=list 또는 array,
        index=None,
        dtype: Dtype | None = None,
        name=None,
</pre>

### list

In [2]:
# Build a Series from a list that mixes ints, a string and NaN.
mylist = [1, 2, 3, 'A', np.nan]
s = pd.Series(
    data=mylist,
    index=range(5),
    name="seq",
)
s

0      1
1      2
2      3
3      A
4    NaN
Name: seq, dtype: object

### array

In [3]:
# An ndarray is created by converting an existing list.
mylist = [1, 2, 3, 'A', np.nan]
myarr = np.array(mylist)

# Expected repr: array(['1', '2', '3', 'A', 'nan'], dtype='<U32')
myarr

# Every element has been coerced to a string: ['1' '2' '3' 'A' 'nan']
print(myarr)

# .shape reports the dimensions; (5,) is a one-dimensional array holding
# 5 items (not a 1-row-by-5-column matrix).
print(myarr.shape)

['1' '2' '3' 'A' 'nan']
(5,)


In [4]:
# Build the Series from an ndarray rather than directly from the list.
mylist = [1, 2, 3, 'A', np.nan]
myarr = np.array(mylist)
s = pd.Series(
    data=myarr,
    index=range(5),
    name="seq",
)

# (5,): the Series has as many entries as the array it was built from.
print(s.shape)
# 5: number of rows.
print(len(s))

(5,)
5


<span style="color: red"><b>1. shape는 함수가 아니라 속성이므로 shape()처럼 호출하지 않는다</b></span><br>
<span style="color: red"><b>2. 리스트에는 shape 속성이 없다 (mylist.shape는 오류)</b></span>

### list + list

In [5]:
# A nested list: 2 inner lists with 3 items each.
mylist = [
    [1, 2, 3],
    [10, 20, 30],
]
print(mylist)  # [[1, 2, 3], [10, 20, 30]]

# A plain list has no .shape attribute, so measure each level with len().
print(len(mylist), len(mylist[0]))

# Converting to an ndarray gives a 2-D array that does expose .shape.
myarr = np.array(mylist)
print(myarr)
# [[ 1  2  3]
#  [10 20 30]]
print(myarr.shape)

[[1, 2, 3], [10, 20, 30]]
2 3
[[ 1  2  3]
 [10 20 30]]
(2, 3)


## DataFrame
<pre>
- head(), tail(), shape, describe()

-       data=None,
        index: Axes | None = None,
        columns: Axes | None = None,
        dtype: Dtype | None = None,
</pre>

### list or array

In [6]:
# Two rows of three values each.  The ndarray form could be passed as
# `data` as well, e.g. pd.DataFrame(data=myarr).
mylist = [[1, 2, 3], [10, 20, 30]]
myarr = np.array(mylist)

df = pd.DataFrame(
    data=mylist,
    columns=['a', 'b', 'c'],
    index=[10, 20],
)
print(df.shape)  # (2, 3): 2 rows, 3 columns
print(len(df))   # number of rows
df

(2, 3)
2


Unnamed: 0,a,b,c
10,1,2,3
20,10,20,30


### Series

In [7]:
# Start from a single-column frame, then add columns by assignment.
# Assigning to a column name that does not exist adds the column;
# assigning to an existing name replaces its values.
df = pd.DataFrame(['a', 'b', 'c', 'd'], columns=['code'])

s = pd.Series([1, 2, 3, 4])
df['seq'] = s                     # new column from a Series
df['sal'] = [100, 200, 300, 400]  # new column from a list

# Two columns at once from a 2-D array.  np.array() coerces the mixed row
# ['kim', 11] to strings, so 'pw' holds '11' rather than the int 11.
df[['id', 'pw']] = np.array([['kim', 11]] * 4)

# A scalar is written into every row.
df['tel'] = '000'
df

Unnamed: 0,code,seq,sal,id,pw,tel
0,a,1,100,kim,11,0
1,b,2,200,kim,11,0
2,c,3,300,kim,11,0
3,d,4,400,kim,11,0


### list + dic [{}]

- dict에 사용 된 키 값이 dataframe의 컬럼명이 된다

In [8]:
# Each dict is one row; the dict keys become the column names.
listdic = [
    {"uid": "kim", "upw": 111},
    {"uid": "kim", "upw": 111},
]
df = pd.DataFrame(listdic)
df

Unnamed: 0,uid,upw
0,kim,111
1,kim,111


### list + list

In [9]:
# Each inner list is one row; column names are supplied explicitly.
listlist = [
    ["kim", 111],
    ["kim", 111],
]
df = pd.DataFrame(listlist, columns=['a', 'b'])
df

Unnamed: 0,a,b
0,kim,111
1,kim,111


## DataFrame 구조 보기


### index

In [10]:
df = pd.DataFrame(
    ['a', 'b', 'c', 'd'],
    columns=['code'],
    index=[10, 20, 30, 40],
)

# A DataFrame always carries an index.
df.index

# The Index object itself, e.g. Int64Index([10, 20, 30, 40], dtype='int64')
print(df.index)
# Only the index values, as an ndarray: [10 20 30 40]
print(df.index.values)
# Converted to a plain list: [10, 20, 30, 40]
print(df.index.values.tolist())

Int64Index([10, 20, 30, 40], dtype='int64')
[10 20 30 40]
[10, 20, 30, 40]


In [11]:
df = pd.DataFrame(['a', 'b', 'c', 'd'], columns=['code'])

# Assign the index labels separately, after the frame has been created.
df.index = [10, 20, 30, 40]
df

Unnamed: 0,code
10,a
20,b
30,c
40,d


### columns

In [12]:
# Build a three-column frame to inspect its column labels.
df = pd.DataFrame(['a', 'b', 'c', 'd'], columns=['code'])
s = pd.Series([1, 2, 3, 4])
df['seq'] = s
df['sal'] = [100, 200, 300, 400]

print(df.columns)                  # the Index object holding the labels
print(df.columns.values)           # the labels as an ndarray
print(df.columns.values.tolist())  # the labels as a plain list

Index(['code', 'seq', 'sal'], dtype='object')
['code' 'seq' 'sal']
['code', 'seq', 'sal']


### values

In [13]:
# Build a three-column frame to inspect its underlying values.
df = pd.DataFrame(['a', 'b', 'c', 'd'], columns=['code'])
s = pd.Series([1, 2, 3, 4])
df['seq'] = s
df['sal'] = [100, 200, 300, 400]

# Only the data of the frame, as a 2-D ndarray.
print(df.values)
# ndarray --> nested list
print(df.values.tolist())

[['a' 1 100]
 ['b' 2 200]
 ['c' 3 300]
 ['d' 4 400]]
[['a', 1, 100], ['b', 2, 200], ['c', 3, 300], ['d', 4, 400]]


### info()

In [14]:
df.info()  # prints the frame's size, index, columns, dtypes and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   code    4 non-null      object
 1   seq     4 non-null      int64 
 2   sal     4 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 224.0+ bytes


### head()

In [15]:
# df.head(5) would show the first five rows instead.
df.head(1)  # show the first row only

Unnamed: 0,code,seq,sal
0,a,1,100


### tail()

In [16]:
df.tail(1)  # show the last row only

Unnamed: 0,code,seq,sal
3,d,4,400


### describe

In [17]:
 df.describe() #통계치 요약 정보

Unnamed: 0,seq,sal
count,4.0,4.0
mean,2.5,250.0
std,1.290994,129.099445
min,1.0,100.0
25%,1.75,175.0
50%,2.5,250.0
75%,3.25,325.0
max,4.0,400.0


# SELECT in DataFrame
- loc[줄,칸] : 라벨(인덱스 이름/컬럼 이름)로 꺼내기
- iloc[줄,칸] : 정수 위치(0부터 시작하는 순번)로 꺼내기
- [줄,칸]
     - 단일값
     - 리스트: [단일값, 단일값, 단일값]
     - 슬라이싱[ : ]

## 파일꺼내오기

In [18]:
# Load the employee CSV; parse_dates converts HIREDATE to a date type.
df = pd.read_csv(
    "./lec08_emp.csv",
    sep=",",
    parse_dates=['HIREDATE'],
)
df.head(14)

Unnamed: 0,EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO
0,7369,SMITH,CLERK,7902.0,1980-12-17,800,,20
1,7499,ALLEN,SALESMAN,7698.0,1981-02-20,1600,300.0,30
2,7521,WARD,SALESMAN,7698.0,1981-02-22,1250,500.0,30
3,7566,JONES,MANAGER,7839.0,1981-04-02,2975,,20
4,7654,MARTIN,SALESMAN,7698.0,1981-09-28,1250,1400.0,30
5,7698,BLAKE,MANAGER,7839.0,1981-05-01,2850,,30
6,7782,CCC,MANAGER,7839.0,1981-06-09,2450,,10
7,7788,SCOTT,ANALYST,7566.0,1987-07-13,3000,,20
8,7839,KING,PRESIDENT,,1981-11-17,5000,,10
9,7844,TURNER,SALESMAN,7698.0,1981-09-08,1500,0.0,30


<font color='red'><b>
    * 숫자 컬럼에 NaN이 있으면 dtype은 float64가 된다 (예: MGR, COMM)
</b></font>

In [19]:
df.info()
# "RangeIndex: 14 entries" is the total number of records.
# "4 non-null" means only 4 values are present; the rest are missing.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   EMPNO     14 non-null     int64         
 1   ENAME     14 non-null     object        
 2   JOB       14 non-null     object        
 3   MGR       13 non-null     float64       
 4   HIREDATE  14 non-null     datetime64[ns]
 5   SAL       14 non-null     int64         
 6   COMM      4 non-null      float64       
 7   DEPTNO    14 non-null     int64         
dtypes: datetime64[ns](1), float64(2), int64(3), object(2)
memory usage: 1.0+ KB


## loc

## iloc

# 결측(missing value)처리
* np.nan
* df.dropna(axis=1 또는 axis=0)
* df.fillna(값)
* pd.isna(df)
* 데이터의 이상치와 결측을 잘 처리해야 학습이 잘 된다

## df.dropna
- axis=0 :행(레코드)
- axis=1 :열(컬럼)

In [20]:
dfcp=df.copy()
# Alternative: dfcp = dfcp.dropna(axis=0), i.e. assign the result back.
# Calling dfcp.dropna(axis=0) without assigning it returns a new frame and
# leaves dfcp itself unchanged.
dfcp.dropna(axis=0,inplace=True)  # inplace=True: modify dfcp itself

In [21]:
dfcp  # display the frame after rows containing missing values were dropped

Unnamed: 0,EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO
1,7499,ALLEN,SALESMAN,7698.0,1981-02-20,1600,300.0,30
2,7521,WARD,SALESMAN,7698.0,1981-02-22,1250,500.0,30
4,7654,MARTIN,SALESMAN,7698.0,1981-09-28,1250,1400.0,30
9,7844,TURNER,SALESMAN,7698.0,1981-09-08,1500,0.0,30


In [22]:
dfcp=df.copy()
# axis=1: drop every column that contains a missing value.
dfcp.dropna(axis=1,inplace=True)
dfcp

Unnamed: 0,EMPNO,ENAME,JOB,HIREDATE,SAL,DEPTNO
0,7369,SMITH,CLERK,1980-12-17,800,20
1,7499,ALLEN,SALESMAN,1981-02-20,1600,30
2,7521,WARD,SALESMAN,1981-02-22,1250,30
3,7566,JONES,MANAGER,1981-04-02,2975,20
4,7654,MARTIN,SALESMAN,1981-09-28,1250,30
5,7698,BLAKE,MANAGER,1981-05-01,2850,30
6,7782,CCC,MANAGER,1981-06-09,2450,10
7,7788,SCOTT,ANALYST,1987-07-13,3000,20
8,7839,KING,PRESIDENT,1981-11-17,5000,10
9,7844,TURNER,SALESMAN,1981-09-08,1500,30


## df.fillna(inplace=True)
- 결측 채우기 후 일반적으로 형변환을 한다.<br>
<font color='red'><b>
    - df['컬럼명']=df['컬럼명'].astype('타입')
</b></font>

In [23]:
dfcp=df.copy()
# Replace every missing value in the frame with one placeholder value.
dfcp=dfcp.fillna(99999999)
# In-place alternative: dfcp.fillna(99999999, inplace=True)
dfcp

Unnamed: 0,EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO
0,7369,SMITH,CLERK,7902.0,1980-12-17,800,99999999.0,20
1,7499,ALLEN,SALESMAN,7698.0,1981-02-20,1600,300.0,30
2,7521,WARD,SALESMAN,7698.0,1981-02-22,1250,500.0,30
3,7566,JONES,MANAGER,7839.0,1981-04-02,2975,99999999.0,20
4,7654,MARTIN,SALESMAN,7698.0,1981-09-28,1250,1400.0,30
5,7698,BLAKE,MANAGER,7839.0,1981-05-01,2850,99999999.0,30
6,7782,CCC,MANAGER,7839.0,1981-06-09,2450,99999999.0,10
7,7788,SCOTT,ANALYST,7566.0,1987-07-13,3000,99999999.0,20
8,7839,KING,PRESIDENT,99999999.0,1981-11-17,5000,99999999.0,10
9,7844,TURNER,SALESMAN,7698.0,1981-09-08,1500,0.0,30


In [24]:
dfcp = df.copy()
# Other options: dfcp.fillna(99999999) fills every column with one value;
# dfcp['COMM'].fillna(8888) / dfcp['MGR'].fillna(77) fill column by column.

# Fill only the two columns that contain NaN and, in the same step, cast
# them to int with astype('int') now that no missing values remain.
dfcp[['COMM', 'MGR']] = dfcp[['COMM', 'MGR']].fillna(99999).astype('int')
dfcp.head(3)

Unnamed: 0,EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO
0,7369,SMITH,CLERK,7902,1980-12-17,800,99999,20
1,7499,ALLEN,SALESMAN,7698,1981-02-20,1600,300,30
2,7521,WARD,SALESMAN,7698,1981-02-22,1250,500,30


##  df.isna / pd.isna(df1)
- 원소별로 결측이면 True, 아니면 False인 같은 크기의 DataFrame을 반환한다

In [25]:
dfcp=df.copy()
# pandas missing-value checks:
#   pd.isna(dfcp)
#   dfcp.isna()
#   dfcp.notna()   -> True where the value is NOT NaN

# isnull()/notnull() are also pandas methods (aliases of isna()/notna()),
# not NumPy functions:
#   dfcp.isnull()
#   dfcp.notnull()

In [26]:
dfcp.isna().sum()  # number of missing values in each column

EMPNO        0
ENAME        0
JOB          0
MGR          1
HIREDATE     0
SAL          0
COMM        10
DEPTNO       0
dtype: int64