## Starter Notebook
EDA Notebook on Cryptcurrency Data
There are 26000+ csv files in the current version of the dataset:

In [510]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np
import os # for accessing directory structure
import pandas as pd
import glob
import zipfile

In [511]:
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#         print(os.path.join(dirname, filename))


# Above code will print it all like below, this was just for the initial checking
# I am commenting out as this folder has 26000+ file names to pring
# /kaggle/input/crypto-data/cr_20170822-152505.csv
# /kaggle/input/crypto-data/cr_20170812-020505.csv
# /kaggle/input/crypto-data/cr_20170813-065506.csv
# /kaggle/input/crypto-data/cr_20171005-012506.csv

# Defining this input variable as I will be using this in few places
file_dir = './kaggle/input/crypto-data/'

In [512]:
# Distribution graphs (histogram/bar graph) of column data
def plot_per_column_distribution(df, nGraphShown, nGraphPerRow):
    nunique = df.nunique()
    df = df[[col for col in df if nunique[col] > 1 and nunique[col] < 50]] # For displaying purposes, pick columns that have between 1 and 50 unique values
    nRow, nCol = df.shape
    columnNames = list(df)
    nGraphRow = (nCol + nGraphPerRow - 1) / nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(min(nCol, nGraphShown)):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
            valueCounts = columnDf.value_counts()
            valueCounts.plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()


In [513]:
# Correlation matrix
def plot_correlation_matrix(df, graphWidth):
    filename = df.dataframeName
    df = df.dropna('columns') # drop columns with NaN
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum = 1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()


In [514]:
# Scatter and density plots
def plot_scatter_matrix(df, plotSize, textSize):
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna('columns')
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    columnNames = list(df)
    if len(columnNames) > 10: # reduce the number of columns for matrix inversion of kernel density plots
        columnNames = columnNames[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    for i, j in zip(*plt.np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()

In [515]:
# Define varable for the first 3 files

nRowsRead = 1000 # specify 'None' if want to read whole file
# These .csv may have more rows in reality, but we are only loading/previewing the first 1000 rows
df1 = pd.read_csv(file_dir+'cr_20170804-034052.csv', delimiter=',', nrows = nRowsRead)
df2 = pd.read_csv(file_dir+'cr_20170804-035004.csv', delimiter=',', nrows = nRowsRead)
df3 = pd.read_csv(file_dir+'cr_20170804-040006.csv', delimiter=',', nrows = nRowsRead)

In [516]:
# Let's check 1st file: /kaggle/input/crypto-data/cr_20170804-034052.csv
nRowsRead = 1000 # specify 'None' if want to read whole file
# cr_20170804-034052.csv may have more rows in reality, but we are only loading/previewing the first 1000 rows
# df1.dataframeName = 'cr_20170804-034052.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 1000 rows and 10 columns


In [517]:
# Let's take a quick look at what the data looks like:
df1.head(5)

Unnamed: 0,symbol,ranking by market cap,name,market cap,price,circulating supply,volume,% 1h,% 24h,% 1wk
0,GMB,1011,Gambleo,?,$0.000895,?,*,Low Vol,?,?
1,FTC,164,Feathercoin,"$9,481,236",$0.055794,169932600,"$1,267,140",-0.05%,10.49%,4.99%
2,HCC,953,Happy Creator...,?,$0.000108,?,*,Low Vol,-0.02%,8.39%
3,BRX,288,Breakout Stake,"$1,793,819",$0.286183,6268082,*,"$4,778",-4.53%,3.35%
4,UNRC,872,UniversalRoya...,?,$0.005857,?,*,$664,3.98%,-7.83%


Distribution graphs (histogram/bar graph) of sampled columns:

In [518]:
plot_per_column_distribution(df1, 10, 5)

<Figure size 2400x512 with 0 Axes>

In [519]:
# Let's check 2nd file: /kaggle/input/crypto-data/cr_20170804-035004.csv
df2.dataframeName = 'cr_20170804-035004.csv'
nRow, nCol = df2.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 1000 rows and 10 columns


In [520]:
# Let's take a quick look at what the data looks like:
df2.head(5)

Unnamed: 0,symbol,ranking by market cap,name,market cap,price,circulating supply,volume,% 1h,% 24h,% 1wk
0,XBY,190,XtraBYtes,"$7,422,935",$0.011420,650000000,*,"$16,273",-1.36%,-0.03%
1,CPN,637,CompuCoin,"$57,962",$0.002955,19615019,Low Vol,0.34%,-4.69%,-31.49%
2,CHEAP,976,Cheapcoin,?,$0.000112,?,*,Low Vol,0.02%,105.40%
3,BLAS,700,BlakeStar,"$27,044",$0.000112,242418240,Low Vol,0.02%,-5.80%,?
4,CPC,341,Capricoin,"$848,890",$0.426568,1990045,*,"$18,457",-0.49%,3.66%


Distribution graphs (histogram/bar graph) of sampled columns:

In [521]:
plot_per_column_distribution(df2, 10, 5)

<Figure size 2400x512 with 0 Axes>

In [522]:
# Let's check 3rd file: /kaggle/input/crypto-data/cr_20170804-040006.csv
df3.dataframeName = 'cr_20170804-040006.csv'
nRow, nCol = df3.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 1000 rows and 10 columns


In [523]:
# Let's take a quick look at what the data looks like:
df3.head(5)

Unnamed: 0,symbol,ranking by market cap,name,market cap,price,circulating supply,volume,% 1h,% 24h,% 1wk
0,XBY,192,XtraBYtes,"$7,322,055",$0.011265,650000000,*,"$16,282",-1.93%,-1.53%
1,CPN,638,CompuCoin,"$57,975",$0.002956,19615019,Low Vol,0.36%,-4.67%,-31.38%
2,CHEAP,978,Cheapcoin,?,$0.000112,?,*,Low Vol,0.17%,105.40%
3,BLAS,701,BlakeStar,"$27,038",$0.000112,242418240,Low Vol,0.05%,-7.01%,?
4,CPC,341,Capricoin,"$849,276",$0.426762,1990045,*,"$18,466",-0.51%,3.91%


In [524]:
# Distribution graphs (histogram/bar graph) of sampled columns:
plot_per_column_distribution(df3, 10, 5)

<Figure size 2400x512 with 0 Axes>

In [525]:
print(df1.shape)
print(df1.dtypes)

(1000, 10)
symbol                   object
ranking by market cap     int64
name                     object
market cap               object
price                    object
circulating supply       object
volume                   object
% 1h                     object
% 24h                    object
% 1wk                    object
dtype: object


In [526]:
# let's print all the files in the directory.
!ls $file_dir | wc -l

1416


In [527]:
# Define a variable for to hold a Python-list of .csv files in that directory
# files_list = glob.glob(os.path.join(file_dir, "*.csv"))
all_files = glob.glob(os.path.join(file_dir, "*.csv"))

# lets take the first 1400 .csv file (from which I shall create a combined-dataframe)
# Note in the original .zipped folder (uploaded to Kaggle) there are 26,000+ files.
# But for the sake of running this data in local file-system.
files_list = all_files[:1400]

# lets create dataframes and print them to see if it working

df1 = pd.read_csv(files_list[0])
df2 = pd.read_csv(files_list[1])
df3 = pd.read_csv(files_list[2])

print(df1.head(), "\n")
print(df2.head(), "\n")
print(df3.head(), "\n")

  symbol  ranking by market cap       name   market cap      price  \
0    XBY                    191  XtraBYtes  $8,144,825   $0.012530   
1    CPN                    670  CompuCoin     $54,084   $0.002757   
2  CHEAP                    933  Cheapcoin           ?   $0.000065   
3   BLAS                    464  BlakeStar     $47,259   $0.000195   
4    CPC                    353  Capricoin    $968,853   $0.486850   

  circulating supply   volume      % 1h   % 24h    % 1wk  
0        650,000,000        *   $17,479   6.94%   -3.80%  
1         19,615,019  Low Vol    11.35%   7.75%  -21.16%  
2                  ?        *   Low Vol   0.98%        ?  
3        242,813,280   $2,157     0.98%  35.40%        ?  
4          1,990,045        *  $125,312   0.98%    8.68%   

  symbol  ranking by market cap       name   market cap      price  \
0    XBY                    193  XtraBYtes  $8,181,420   $0.012587   
1    CPN                    677  CompuCoin     $56,184   $0.002864   
2  CHEAP     

All these files have the same columns so it seems reasonable to concatenate everything into one dataframe. However, I want to keep track of the file names because that's the only reference to the date of the records.

- First, creating a list of dataframes with the filenames in a "file_name" column
- Then concatenate them all into one big dataframe

In [528]:
dataframes = [pd.read_csv(file).assign(file_name=os.path.basename(file).strip(".csv")) for file in files_list]
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.head()

Unnamed: 0,symbol,ranking by market cap,name,market cap,price,circulating supply,volume,% 1h,% 24h,% 1wk,file_name
0,XBY,191,XtraBYtes,"$8,144,825",$0.012530,650000000,*,"$17,479",6.94%,-3.80%,r_20170806-153005
1,CPN,670,CompuCoin,"$54,084",$0.002757,19615019,Low Vol,11.35%,7.75%,-21.16%,r_20170806-153005
2,CHEAP,933,Cheapcoin,?,$0.000065,?,*,Low Vol,0.98%,?,r_20170806-153005
3,BLAS,464,BlakeStar,"$47,259",$0.000195,242813280,"$2,157",0.98%,35.40%,?,r_20170806-153005
4,CPC,353,Capricoin,"$968,853",$0.486850,1990045,*,"$125,312",0.98%,8.68%,r_20170806-153005


In [529]:
combined_df.shape

(1431260, 11)

In [530]:
# Creating a dataframe, by filtering only the data where the row is BTC
btc_df = combined_df[combined_df['symbol'] == 'BTC']
btc_df.shape

(1400, 11)

In [531]:
# btc_df.to_csv("combined-btc.csv")