In [1]:
import numpy as np
import pandas as pd

# 📚 과제 1: 데이터프레임 Basics 

In [2]:
# Load the per-store daily transaction counts and display the frame.
transactions_path = './data/retail/transactions.csv'
transactions = pd.read_csv(transactions_path)
transactions

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
...,...,...,...
83483,2017-08-15,50,2804
83484,2017-08-15,51,1573
83485,2017-08-15,52,2255
83486,2017-08-15,53,932


In [3]:
# (number of rows, number of columns) of the loaded frame.
transactions.shape

(83488, 3)

In [4]:
# Largest index label; with the default 0-based RangeIndex this is row count - 1.
transactions.index.max()

83487

In [5]:
# Data type of each column (note that `date` is read in as plain object/str).
transactions.dtypes

date            object
store_nbr        int64
transactions     int64
dtype: object

# 📚 과제 2: 데이터프레임 탐색

In [6]:
# Preview the first 10 rows.
transactions.head(10)

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
5,2013-01-02,5,1903
6,2013-01-02,6,2143
7,2013-01-02,7,1874
8,2013-01-02,8,3250
9,2013-01-02,9,2940


In [7]:
# Boolean frame of the same shape: True wherever a value is missing.
transactions.isna()

Unnamed: 0,date,store_nbr,transactions
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
83483,False,False,False
83484,False,False,False
83485,False,False,False
83486,False,False,False


In [8]:
# Missing-value count per column (column-wise sum is the default axis).
transactions.isna().sum()

date            0
store_nbr       0
transactions    0
dtype: int64

In [9]:
# Summary of index range, per-column non-null counts, dtypes and memory usage.
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83488 entries, 0 to 83487
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          83488 non-null  object
 1   store_nbr     83488 non-null  int64 
 2   transactions  83488 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.9+ MB


In [10]:
# Descriptive statistics for the numeric columns.
transactions.describe()

Unnamed: 0,store_nbr,transactions
count,83488.0,83488.0
mean,26.939237,1694.602158
std,15.608204,963.286644
min,1.0,5.0
25%,13.0,1046.0
50%,27.0,1393.0
75%,40.0,2079.0
max,54.0,8359.0


# 📚 과제 3: 데이터프레임 조회

In [11]:
# Every row from index label 1 onward, restricted to the two numeric columns
# (the same selection as the label slice 'store_nbr':'transactions').
numeric_columns = ['store_nbr', 'transactions']
transactions.loc[1:, numeric_columns]

Unnamed: 0,store_nbr,transactions
1,1,2111
2,2,2358
3,3,3487
4,4,1922
5,5,1903
...,...,...
83483,50,2804
83484,51,1573
83485,52,2255
83486,53,932


In [12]:
# Distinct store numbers, in order of first appearance.
transactions['store_nbr'].unique()

array([25,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 23, 24, 26, 27, 28, 30, 31, 32, 33, 34, 35, 37, 38, 39,
       40, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 54, 36, 53, 20, 29, 21,
       42, 22, 52])

In [13]:
# Number of distinct stores.
transactions['store_nbr'].nunique()

54

In [14]:
# Total number of transactions, expressed in millions.
total_transactions = transactions['transactions'].sum()
total_transactions / 1_000_000

141.478945

# 📚 과제 4: 데이터프레임 중복

In [15]:
# Preview the first five rows before dropping anything.
transactions.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [16]:
# Remove the first record (index label 0).
# Rebinding the name instead of using inplace=True, combined with
# errors='ignore', keeps this cell idempotent: re-running it no longer raises
# KeyError once label 0 has already been removed.
transactions = transactions.drop(index=0, errors='ignore')
transactions.head()

Unnamed: 0,date,store_nbr,transactions
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
5,2013-01-02,5,1903


In [17]:
# Preview the frame without the `date` column; drop() returns a new frame,
# so `transactions` itself is left unchanged.
transactions.drop(columns='date').head()

Unnamed: 0,store_nbr,transactions
1,1,2111
2,2,2358
3,3,3487
4,4,1922
5,5,1903


In [18]:
# Keep only the last row seen for each store and preview the result.
last_row_per_store = transactions.drop_duplicates(subset=['store_nbr'], keep='last')
last_row_per_store.head()

Unnamed: 0,date,store_nbr,transactions
83434,2017-08-15,1,1693
83435,2017-08-15,2,1737
83436,2017-08-15,3,2956
83437,2017-08-15,4,1283
83438,2017-08-15,5,1310


# 📚 과제 5: 데이터프레임 결측값

In [19]:
# Load the daily oil price series and preview the first rows.
oil_path = './data/retail/oil.csv'
oil = pd.read_csv(oil_path)
oil.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [20]:
# Missing-value count, approach 1: compare each column's Non-Null Count
# against the total number of entries reported by info().
oil.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        1218 non-null   object 
 1   dcoilwtico  1175 non-null   float64
dtypes: float64(1), object(1)
memory usage: 19.2+ KB


In [21]:
# Missing-value count, approach 2: total NaN entries per column.
oil.isna().sum()

date           0
dcoilwtico    43
dtype: int64

In [22]:
# Mean oil price when missing values are replaced by 0 (this pulls the mean down).
oil['dcoilwtico'].fillna(value=0).mean()

65.32379310344828

In [23]:
# Mean oil price when missing values are replaced by the mean of the
# observed (non-missing) prices.
oil_price = oil['dcoilwtico']
oil_price.fillna(oil_price.mean()).mean()

67.71436595744682

# 📚 과제 6: 데이터프레임 필터링

In [24]:
# Re-display the frame; row 0 was dropped earlier, so the index now starts at 1.
transactions

Unnamed: 0,date,store_nbr,transactions
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
5,2013-01-02,5,1903
...,...,...,...
83483,2017-08-15,50,2804
83484,2017-08-15,51,1573
83485,2017-08-15,52,2255
83486,2017-08-15,53,932


In [25]:
# Share of rows with more than 2000 transactions (mean of a boolean Series).
transactions['transactions'].gt(2000).mean()

0.266808006036868

In [26]:
# Rows where store 25 recorded more than 2000 transactions.
over_2000 = transactions['transactions'].gt(2000)
at_store_25 = transactions['store_nbr'].eq(25)
mask = over_2000 & at_store_25

In [27]:
# Fraction of store 25's rows that exceed 2000 transactions.
store_25_rows = transactions['store_nbr'] == 25
n_over_2000 = transactions.loc[mask, 'transactions'].count()
n_store_25 = transactions.loc[store_25_rows, 'transactions'].count()
n_over_2000 / n_store_25

0.03469640644361834

In [28]:
# Total transactions across the rows selected by `mask`.
transactions['transactions'].loc[mask].sum()

144903

In [29]:
# Total transactions for stores 25 and 31 in May or June on days with fewer
# than 2000 transactions.
# Each condition is a named boolean mask rather than one long query string.
in_target_stores = transactions['store_nbr'].isin([25, 31])
# `date` holds 'YYYY-MM-DD' strings, so characters 5-6 are the two-digit month;
# comparing both digits is more explicit than testing only the ones digit.
in_may_or_june = transactions['date'].str[5:7].isin(['05', '06'])
under_2000 = transactions['transactions'] < 2000

# Select the column before summing so that only `transactions` is aggregated
# (a frame-wide .sum() would also sum store_nbr and concatenate every date string).
transactions.loc[in_target_stores & in_may_or_june & under_2000, 'transactions'].sum()

644910