# Data Importing and Cleaning

___

### Import Libraries

In [18]:
import pandas as pd

### Import Data Files

In [19]:
# Load the daily Bitcoin price history (4/29/2013 - 4/1/21) from CSV.
daily = pd.read_csv(filepath_or_buffer='./data/coin_Bitcoin.csv')

# Preview the first five rows to verify the load.
daily.head(n=5)

Unnamed: 0,Date,High,Low,Open,Close,Volume
0,4/29/13 23:59,147.488007,134.0,134.444,144.539993,0.0
1,4/30/13 23:59,146.929993,134.050003,144.0,139.0,0.0
2,5/1/13 23:59,139.889999,107.720001,139.0,116.989998,0.0
3,5/2/13 23:59,125.599998,92.281898,116.379997,105.209999,0.0
4,5/3/13 23:59,108.127998,79.099998,106.25,97.75,0.0


### DateTime

___
Change the `Date` column of the `daily` DataFrame to `datetime`, rename it to lowercase `date`, and set it as the index.

In [20]:
# rename date col to lowercase first: rename() skips labels that are not
# present, so re-running this cell once 'Date' is already 'date' does nothing
daily = daily.rename(columns={'Date': 'date'})

# parse the timestamp strings (e.g. '4/29/13 23:59') with an explicit
# month/day/two-digit-year format instead of relying on format inference;
# values that are already datetime64 are returned unchanged, so the cell
# can safely be executed more than once
daily['date'] = pd.to_datetime(daily['date'], format='%m/%d/%y %H:%M')

In [21]:
# confirm the changes: the date column should now be named `date` and have
# dtype datetime64[ns]
# NOTE(review): the recorded output shows 2891 non-null `Volume` values out of
# 2895 rows, i.e. 4 missing volumes; no cell visible in this notebook fills or
# drops them before the export - confirm this is intended
daily.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2895 entries, 0 to 2894
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    2895 non-null   datetime64[ns]
 1   High    2895 non-null   float64       
 2   Low     2895 non-null   float64       
 3   Open    2895 non-null   float64       
 4   Close   2895 non-null   float64       
 5   Volume  2891 non-null   float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 135.8 KB


In [22]:
# make `date` the index of daily and order the rows chronologically; the
# chained calls return a new frame that is bound back to `daily`, which avoids
# the two separate inplace=True mutations
daily = daily.set_index("date").sort_index()

In [23]:
# preview the first rows to confirm that `date` is now the index
daily.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-04-29 23:59:00,147.488007,134.0,134.444,144.539993,0.0
2013-04-30 23:59:00,146.929993,134.050003,144.0,139.0,0.0
2013-05-01 23:59:00,139.889999,107.720001,139.0,116.989998,0.0
2013-05-02 23:59:00,125.599998,92.281898,116.379997,105.209999,0.0
2013-05-03 23:59:00,108.127998,79.099998,106.25,97.75,0.0


### Cleaning

___
In this section, the `daily` DataFrame was inspected for features that could be dropped and for duplicate or missing observations, and its columns were renamed to snake case.

In [24]:
# list every row of daily whose column values repeat an earlier row
# (duplicated() compares the column values only; the index is not part of
# the comparison)
daily.loc[daily.duplicated(keep='first')]

Unnamed: 0_level_0,High,Low,Open,Close,Volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [25]:
# lowercase the price and volume column labels of daily (snake case);
# the old -> new mapping is derived from the original labels
daily.rename(
    columns={
        label: label.lower()
        for label in ('High', 'Low', 'Open', 'Close', 'Volume')
    },
    inplace=True,
)

In [26]:
# preview the first rows to confirm the lowercase column labels
daily.head()

Unnamed: 0_level_0,high,low,open,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-04-29 23:59:00,147.488007,134.0,134.444,144.539993,0.0
2013-04-30 23:59:00,146.929993,134.050003,144.0,139.0,0.0
2013-05-01 23:59:00,139.889999,107.720001,139.0,116.989998,0.0
2013-05-02 23:59:00,125.599998,92.281898,116.379997,105.209999,0.0
2013-05-03 23:59:00,108.127998,79.099998,106.25,97.75,0.0


### Export DataFrame

In [27]:
# write the cleaned daily frame to CSV; the index is written as the first column
daily.to_csv(path_or_buf='./data/daily_clean.csv')