In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
!pip install mplfinance
import mplfinance as mpf

In [None]:
# Apply the whitegrid style and the wide default figure size in a single call.
# sns.set() resets the style to its default ("darkgrid") unless `style` is
# passed, so calling it after sns.set_style("whitegrid") discarded that style.
sns.set(style="whitegrid", rc={'figure.figsize': (35, 10)})

# Getting first insights on data

## Loading in dataset

In [None]:
eth = pd.read_csv('../input/ethereum-data/ETH-USD.csv')
eth.head()

## Basic information on dataset

In [None]:
eth.shape

In [None]:
eth.info()

In [None]:
eth.describe()

## Converting Date column to Datetime and setting it as index

In [None]:
# Parse the Date column into datetime values (it becomes the index in a later cell).
eth['Date'] = pd.to_datetime(eth['Date'])


In [None]:
eth.info()

In [None]:
eth.set_index('Date', inplace=True)

In [None]:
eth.head()

## Finding missing values

In [None]:
# Show every row in which at least one column is missing.
rows_with_gaps = eth.isnull().any(axis='columns')
eth.loc[rows_with_gaps]

There are 4 rows in which every single value is missing, which leaves a few possibilities. First, we could drop those 4 rows and move on. Second, we could look up the missing values and fill them in. I will choose the second option and fill in the missing values from an online source.

In [None]:
eth.loc['2020-10-07' :'2020-10-15', :]

Data taken from: https://www.investing.com/crypto/ethereum/historical-data

In [None]:
# Manually fill the four rows that had no data.
# Each list holds six values for the column slice 'Open':'Volume'; the fifth
# slot is left as NaN (presumably the Adj Close column — verify column order).
# NOTE(review): for 2020-04-17 the fourth value (180.63) is larger than the
# second (174.52); if these are Close and High, the row is inconsistent —
# verify against the source.
# NOTE(review): confirm the volume figures use the same unit as the rest of
# the dataset's Volume column.
manual_rows = {
    '2020-10-09': [350.71, 368.18, 347.82, 365.28, np.nan, 7850000],
    '2020-10-12': [374.39, 394.58, 366.05, 386.68, np.nan, 10390000],
    '2020-10-13': [386.64, 387.06, 374.57, 381.32, np.nan, 7920000],
    '2020-04-17': [172.32, 174.52, 168.55, 180.63, np.nan, 20050000],
}
for day, values in manual_rows.items():
    eth.loc[day, 'Open':'Volume'] = values

In [None]:
eth.loc['2020-10-07' :'2020-10-15', :]

# Price Changes

In [None]:
mpf.plot(eth, type='line', style='yahoo', figratio=(35,10))

## Price Changes in 2017

In [None]:
mpf.plot(eth.loc['2017-05-12':'2017-08-12', :], type='candle', style='yahoo', figratio=(35,10))

## Price Changes in 2021

In [None]:
mpf.plot(eth.loc['2020-12-26':'2021-05-26', :], type='candle', style='yahoo', figratio=(35,10))

# Volume Traded

## Average Volume Traded Each Month

In [None]:
# Mean of every column per calendar month, keyed by (year, month) of the index.
month_keys = [eth.index.year, eth.index.month]
avg_vol = eth.groupby(month_keys).mean()

In [None]:
avg_vol.sort_values('Volume').tail()

In [None]:
# Name the two group levels and turn them into regular columns for plotting.
avg_vol = avg_vol.rename_axis(['Year', 'Month']).reset_index()

In [None]:
# Mean traded volume per month, one bar per year within each month.
year_palette = ['#086623', '#3F704D', '#00A86B', '#29AB87', '#01796F', '#4CBB17', '#043927']
ax = sns.barplot(x='Month', y='Volume', hue='Year', data=avg_vol, palette=year_palette)

# Force an opaque white plot background.
ax.patch.set_facecolor('white')
ax.patch.set_alpha(1.0)

# Set the title through the Axes so `ax` keeps referring to the Axes
# (plt.title returns a Text object, which previously overwrote `ax`).
ax.set_title('Average Volume Traded Each Month', fontsize=24, loc='center', fontweight='heavy', pad=32);

## Ethereum Volumes Traded

In [None]:
eth.head()

In [None]:
# Total traded volume per calendar month, labelled 'YYYY-MM'.
# Grouping directly on the formatted month label replaces the previous
# Year/Month -> string -> datetime -> string round trip, and the built-in
# groupby sum avoids passing np.sum as an aggfunc (recent pandas versions
# emit a FutureWarning for that).
month_label = eth.index.strftime('%Y-%m')
df = (
    eth.groupby(month_label)['Volume']
    .sum()
    .rename_axis('Date')
    .reset_index()[['Volume', 'Date']]  # keep the original column order
)


In [None]:
# One bar per month of summed volume.
ax = sns.barplot(x='Date', y='Volume', data=df, color='#043927')

# Force an opaque white plot background.
ax.patch.set_facecolor('white')
ax.patch.set_alpha(1.0)

# Hide the x tick labels (one per month is too dense to read).
ax.set_xticks([])

# `df` holds one summed value per month, so the title says "Monthly"
# (it previously read "Daily"). Using ax.set_title also keeps `ax` bound to
# the Axes instead of the Text returned by plt.title.
ax.set_title('ETH Volumes Traded Monthly', fontsize=24, loc='center', fontweight='heavy', pad=32);

# Which day/month/year was most/least profitable to invest?

## Daily ROI

In [None]:
# Percentage change from each row's Open to its Close.
eth['Daily_ROI'] = eth['Close'].div(eth['Open']).mul(100).sub(100)

In [None]:
eth.head()

In [None]:
# Daily ROI over the full history.
ax = sns.lineplot(x=eth.index, y=eth['Daily_ROI'], data=eth, color='#043927')

# Force an opaque white plot background.
ax.patch.set_facecolor('white')
ax.patch.set_alpha(1.0)

# Hide x ticks and set the title through the Axes, so `ax` is no longer
# overwritten by the return values of plt.xticks / plt.title.
ax.set_xticks([])
ax.set_title('ETH Daily Return of Investment', fontsize=24, loc='center', fontweight='heavy', pad=32);

### 5 Most Profitable Days to invest

In [None]:
eth.Daily_ROI.sort_values(ascending=False).head()

### 5 Least Profitable Days to invest

In [None]:
eth.Daily_ROI.sort_values().head()

## Monthly ROI

In [None]:
eth.reset_index(inplace=True)
eth.head()

In [None]:
# Group rows by (year, month) of the Date column.
grouped = eth.groupby([eth['Date'].dt.year, eth['Date'].dt.month])

# Monthly ROI in percent: last Close of the month relative to its first Open.
month_close = grouped['Close'].last()
month_open = grouped['Open'].first()
monthly_roi = month_close / month_open * 100 - 100

In [None]:
monthly_roi.index.names = ['Year', 'Month']

In [None]:
# Convert the ROI Series into a DataFrame with the index levels as columns.
monthly_roi = monthly_roi.to_frame().reset_index()
monthly_roi.head()

In [None]:
monthly_roi.columns = ['Year', 'Month', 'Monthly_ROI']
monthly_roi.head()

## Plotting monthly ROI

In [None]:
# Monthly ROI, one bar per year within each month.
year_palette = ['#086623', '#3F704D', '#00A86B', '#29AB87', '#01796F', '#4CBB17', '#043927']
ax = sns.barplot(x='Month', y='Monthly_ROI', hue='Year', data=monthly_roi, palette=year_palette)

# Force an opaque white plot background.
ax.patch.set_facecolor('white')
ax.patch.set_alpha(1.0)

# Title via the Axes so `ax` keeps referring to the Axes (plt.title returns a Text).
ax.set_title('ETH Monthly Return of Investment', fontsize=24, loc='center', fontweight='heavy', pad=32);

### 5 Most Profitable Months to Invest

In [None]:
monthly_roi.sort_values('Monthly_ROI', ascending=False).head(5)

### 5 Least Profitable Months to Invest

In [None]:
monthly_roi.sort_values('Monthly_ROI').head(5)

## Yearly ROI

In [None]:
# Yearly ROI in percent: last Close of each year relative to its first Open.
by_year = eth.groupby(eth['Date'].dt.year)
yearly_roi = by_year['Close'].last() / by_year['Open'].first() * 100 - 100

### 3 Most Profitable Years to Invest

In [None]:
yearly_roi.sort_values(ascending=False).head(3)

### 3 Least Profitable Years to Invest

In [None]:
yearly_roi.sort_values().head(3)

## Plotting yearly ROI

- Keep in mind that this assumes you bought on the first day of the year and sold on the last day of the year.

In [None]:
# One bar per year of yearly ROI.
year_palette = ['#086623', '#3F704D', '#00A86B', '#29AB87', '#01796F', '#4CBB17', '#043927']
ax = sns.barplot(x=yearly_roi.index, y=yearly_roi.values, palette=year_palette)

# Force an opaque white plot background.
ax.patch.set_facecolor('white')
ax.patch.set_alpha(1.0)

# Title via the Axes so `ax` keeps referring to the Axes (plt.title returns a Text).
ax.set_title('ETH Yearly Return of Investment', fontsize=24, loc='center', fontweight='heavy', pad=32);

The year 2017 saw a huge bubble in the cryptocurrency space, as we can see from the graph above. Let's exclude it from the graph to get a clearer picture of the other years.

In [None]:
# Same yearly ROI chart with the 2017 entry removed.
excluded_2017 = yearly_roi[yearly_roi.index != 2017]
year_palette = ['#086623', '#3F704D', '#29AB87', '#01796F', '#4CBB17', '#043927']
ax = sns.barplot(x=excluded_2017.index, y=excluded_2017.values, palette=year_palette)

# Force an opaque white plot background.
ax.patch.set_facecolor('white')
ax.patch.set_alpha(1.0)

# Title via the Axes so `ax` keeps referring to the Axes (plt.title returns a Text).
ax.set_title('ETH Yearly Return of Investment [2017 Excluded]', fontsize=24, loc='center', fontweight='heavy', pad=32);

## Keep in Mind

ROI calculations were based on the assumption that you bought when the market opened and sold right as the market was closing. For monthly and yearly ROI, the assumption is that ETH was bought at the start of the month/year and sold at the end.

# Correlation

In [None]:
# Pairwise correlation between the price columns and Volume.
price_volume_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
eth_corr = eth[price_volume_cols].corr()
eth_corr

In [None]:
sns.reset_orig()

In [None]:
ax = sns.heatmap(eth_corr, annot=True, cmap='Greens', fmt=".2f")

# Model Training

In [None]:
eth.drop(columns=['Adj Close', 'Daily_ROI'], inplace=True)
eth.columns

In [None]:
eth_sklearn = eth.loc[:,'Open':]
eth_sklearn

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score

## LinearRegression (Without any optimizations)

In [None]:
# Target is the High column; features are all remaining columns of eth_sklearn.
# NOTE(review): the features come from the same row as the High being
# predicted (including that row's Low and Close) — confirm this is intended.
y_1 = eth_sklearn['High']
X_1 = eth_sklearn.drop(columns='High')

# Hold out 33% of the rows for evaluation, with a fixed seed for repeatability.
xtrain, xtest, ytrain, ytest = train_test_split(X_1, y_1, test_size=0.33, random_state=42)

tuple(part.shape for part in (xtrain, xtest, ytrain, ytest))

In [None]:
# Fit a plain linear regression on the training split.
lr = LinearRegression().fit(xtrain, ytrain)

# Predictions for both splits (reused later for the real-vs-predicted plots).
predicted_train = lr.predict(xtrain)
predicted_test = lr.predict(xtest)

# Report RMSE and R^2 for each split.
for label, actual, predicted in (
    ("train", ytrain, predicted_train),
    ("test", ytest, predicted_test),
):
    rmse = mean_squared_error(actual, predicted) ** 0.5
    print(f"RMSE on {label} data: {rmse}, R^2: {r2_score(actual, predicted)}")

In [None]:
# Wrap the sample in a DataFrame carrying the training column names: this makes
# the expected feature order explicit and avoids scikit-learn's warning about
# predicting on unnamed features after fitting on a DataFrame.
# NOTE(review): confirm the volume figure uses the same unit as the training data's Volume column.
sample = pd.DataFrame([[2530.94, 2515.26, 2594.81, 853190]], columns=xtrain.columns)
lr.predict(sample)  # Current high price was 2636.21

In [None]:
# Wrap the sample in a DataFrame carrying the training column names: this makes
# the expected feature order explicit and avoids scikit-learn's warning about
# predicting on unnamed features after fitting on a DataFrame.
sample = pd.DataFrame([[2461.08, 2421.70, 2532.19, 507080]], columns=xtrain.columns)
lr.predict(sample)  # Jul 31 high price was 2551.68

In [None]:
# Wrap the sample in a DataFrame carrying the training column names: this makes
# the expected feature order explicit and avoids scikit-learn's warning about
# predicting on unnamed features after fitting on a DataFrame.
sample = pd.DataFrame([[2380.59, 2320.80, 2460.95, 1003000]], columns=xtrain.columns)
lr.predict(sample)  # Jul 30 high price was 2467.40

# Real values vs Predicted values

In [None]:
combined = pd.concat([xtrain,xtest], axis=0)

In [None]:
combined_high = np.concatenate((predicted_train, predicted_test), axis=0)

In [None]:
# Attach the predicted High values and restore the original row order.
combined = combined.assign(High=combined_high).sort_index()
combined.head()

In [None]:
combined['Date'] = eth.Date
combined.head()

In [None]:
eth.head()

In [None]:
# Re-apply the seaborn theme after sns.reset_orig(). `style` is passed
# explicitly because sns.set() otherwise falls back to "darkgrid" rather than
# the whitegrid style selected at the top of the notebook.
sns.set(style="whitegrid", rc={'figure.figsize': (35, 10)})

## Plotting

In [None]:
# Real High (green) and predicted High (red) on the same axes.
ax = sns.lineplot(x='Date', y='High', data=eth, color='#043927')
sns.lineplot(x='Date', y='High', data=combined, color='red', ax=ax)

# Force an opaque white plot background.
ax.patch.set_facecolor('white')
ax.patch.set_alpha(1.0)

# Hide x ticks and set the title through the Axes, so `ax` is no longer
# overwritten by the return values of plt.xticks / plt.title.
ax.set_xticks([])
ax.set_title('ETH Highest Price', fontsize=24, loc='center', fontweight='heavy', pad=32);

# Plotting zoomed in on two spikes

In [None]:
eth.set_index('Date', inplace=True)
combined.set_index('Date', inplace=True)

In [None]:
# Zoom on the 2017 spike. The title is built from the same bounds used for
# slicing; it previously read 2017-05-15 while the data started at 2017-05-12.
start, end = '2017-05-12', '2017-08-12'

# Real High (green) and predicted High (red) on the same axes.
ax = sns.lineplot(x='Date', y='High', data=eth.loc[start:end], color='#043927')
sns.lineplot(x='Date', y='High', data=combined.loc[start:end], color='red', ax=ax)

# Force an opaque white plot background.
ax.patch.set_facecolor('white')
ax.patch.set_alpha(1.0)

# Hide x ticks and set the title through the Axes, so `ax` is no longer
# overwritten by the return values of plt.xticks / plt.title.
ax.set_xticks([])
ax.set_title(f'ETH Highest Price [{start} -> {end}]', fontsize=24, loc='center', fontweight='heavy', pad=32);

In [None]:
# Zoom on the 2020-2021 run-up. The title is built from the same bounds used
# for slicing so the two cannot drift apart.
start, end = '2020-12-26', '2021-05-26'

# Real High (green) and predicted High (red) on the same axes.
ax = sns.lineplot(x='Date', y='High', data=eth.loc[start:end], color='#043927')
sns.lineplot(x='Date', y='High', data=combined.loc[start:end], color='red', ax=ax)

# Force an opaque white plot background.
ax.patch.set_facecolor('white')
ax.patch.set_alpha(1.0)

# Hide x ticks and set the title through the Axes, so `ax` is no longer
# overwritten by the return values of plt.xticks / plt.title.
ax.set_xticks([])
ax.set_title(f'ETH Highest Price [{start} -> {end}]', fontsize=24, loc='center', fontweight='heavy', pad=32);

# Final thoughts

Thanks to [Arpit Verma](https://www.kaggle.com/varpit94) for this dataset, which gave me something to practise on.

The things I could improve:
- Set clear goals before starting to work with the data
    - Questions
    - Assumptions
- Work more on data processing
- Hyperparameters tuning
- Write more about my process and the results that I got.

If anyone has any comments or suggestions, please share them — thank you. I will read every single one.