In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns # visualization tool

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, entries in os.walk('/kaggle/input'):
    for entry in entries:
        print(os.path.join(folder, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Bitstamp BTC/USD 1-minute CSV into a DataFrame

csv_path = '/kaggle/input/bitcoin-historical-data/bitstampUSD_1-min_data_2012-01-01_to_2020-09-14.csv'
data = pd.read_csv(csv_path)

In [None]:
# looking for data columns to have an idea about data type and data content

data.info()

In [None]:
# Remove the parentheses from the two volume column names
column_renames = {
    "Volume_(BTC)": "Volume_BTC",
    "Volume_(Currency)": "Volume_Currency",
}
data = data.rename(columns=column_renames)
data.info()

In [None]:
# Convert every column name to lower case

data.columns = data.columns.map(str.lower)
data.columns


In [None]:
# to see first five data in dataframe

data.head() # if you want to see more or less than five data, you need to use number in brackets..like->data.head(10) 

# data.tail() # to see last five data

In [None]:
# The frame contains null (NaN) values: drop every row that has at least one of them

data = data.dropna(axis='index', how='any')
data

In [None]:
data.info()

In [None]:
# Turn the unix timestamp (seconds) into a monthly Period column, then drop the raw timestamp.
# Building the result with assign()/drop() returns a fresh frame instead of writing a
# new column into the frame produced by dropna(), which avoids chained-assignment warnings.

data = (
    data
    .assign(period=pd.to_datetime(data['timestamp'], unit='s').dt.to_period('M'))  # monthly period
    .drop(columns=['timestamp'])
)
data

In [None]:
# Collapse the rows into one row per monthly period:
#   weighted_price_mean -> mean of weighted_price
#   volume_btc_sum      -> sum of volume_btc
#   volume_currency_sum -> sum of volume_currency
# NOTE: this cell rebinds `data`; running it a second time fails because the
# 'period' column has become the index.
data = data.groupby('period').agg(
    weighted_price_mean=('weighted_price', 'mean'),
    volume_btc_sum=('volume_btc', 'sum'),
    volume_currency_sum=('volume_currency', 'sum'),
)

# Alternative kept for reference only (not executed): group by two keys and
# compute several statistics of one column.
# grouped_multiple = data.groupby(['period', 'weighted_price']).agg({'volume_btc': ['mean', 'min', 'max']})
# grouped_multiple.columns = ['volume_btc_mean', 'volume_btc_min', 'volume_btc_max']
# grouped_multiple = grouped_multiple.reset_index()
# grouped_multiple

# Must be the last expression of the cell so the notebook displays it.
data.head(10)

In [None]:
# Line Plot
# color = color, label = label, linewidth = width of line, alpha = opacity, grid = grid, linestyle = style of line
#data.volume_btc_sum.plot(color = 'g',label = 'volume_btc_sum',linewidth=1, alpha = 0.5,grid = True,linestyle = ':')
#data.volume_currency_sum.plot(color = 'r',label = 'volume_currency_sum',linewidth=1, alpha = 0.5,grid = True,linestyle = '-.')
price_ax = data['weighted_price_mean'].plot(
    kind='line',
    color='g',
    label='weighted_price_mean',
    linewidth=1,
    alpha=0.9,
    grid=True,
    linestyle=':',
)
price_ax.legend(loc='upper right')   # legend = puts label into plot
price_ax.set_xlabel('x axis')        # label = name of label
price_ax.set_ylabel('y axis')
price_ax.set_title('Line Plot')      # title = title of plot
plt.show()

In [None]:
# Scatter Plot
# x = volume_currency_sum, y = weighted_price_mean
# The author's reading of the plot: weighted price tends to increase together with volume currency.
scatter_ax = data.plot.scatter(
    x='volume_currency_sum',
    y='weighted_price_mean',
    alpha=0.5,
    color='red',
)
scatter_ax.set_xlabel('volume_currency_sum')   # label = name of label
scatter_ax.set_ylabel('weighted_price_mean')
scatter_ax.set_title('volume_currency_sum & weighted_price_mean Scatter Plot')   # title = title of plot
plt.show()



In [None]:
# Histogram
# bins = number of weighted_price_mean in figure 
data.weighted_price_mean.plot(kind = 'hist',bins = 50,figsize = (10,5))
plt.show()

In [None]:
# Keep only the rows whose weighted_price_mean is above 11K

x = data['weighted_price_mean'].gt(11000)   # boolean mask, one entry per row
data.loc[x]

In [None]:
# Filtering with more than one condition: 5 < weighted_price_mean < 10

monthly_price = data['weighted_price_mean']
data[(monthly_price > 5) & (monthly_price < 10)]

In [None]:
# Correlation map
# Values close to 1 mean positive correlation, values close to -1 mean negative correlation.
# The author's reading: volume_currency_sum and weighted_price_mean are positively
# correlated and no pair is negatively correlated.

correlations = data.corr()
f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(correlations, annot=True, linewidths=.5, fmt='.1f', ax=ax)
plt.show()

In [None]:
# AND SOME EXAMPLES FOR DICTIONARY, PANDAS series and dataframe, COMPARISON, WHILE AND FOR LOOPS

In [None]:
# Create a dictionary, then show its keys and its values
dictionary = dict(spain='madrid', usa='vegas')
for view in (dictionary.keys(), dictionary.values()):
    print(view)

In [None]:
# Keys have to be immutable objects like string, boolean, float, integer or tuples
# A list is mutable, so it cannot be used as a key
# Keys are unique

dictionary.update(spain="barcelona")   # update existing entry
print(dictionary)


dictionary.update(france="paris")      # add new entry
print(dictionary)

dictionary.pop('spain')                # remove entry with key 'spain'
print(dictionary)

has_france = 'france' in dictionary    # membership test, returns boolean
print(has_france)

dictionary.clear()                     # remove all entries in dict
print(dictionary)


In [None]:
#del dictionary         # uncomment to delete the name `dictionary` itself

print(dictionary)       # works while the del above stays commented out; after a real del this line raises NameError

In [None]:
# PANDAS Series and DataFrame

# Selecting with a single label gives a Series, selecting with a list of labels gives a DataFrame
series = data.loc[:, 'weighted_price_mean']
data_frame = data.loc[:, ['weighted_price_mean']]

# First the two types, then the two objects themselves
for selection in (series, data_frame):
    print(type(selection))

for selection in (series, data_frame):
    print(selection)

In [None]:
# Comparison operator
print(5 > 2)
print(1!=2)


# Boolean operators
print(True and False)
print(True or False)

In [None]:
# WHILE and FOR LOOPS

# while: stay in the loop as long as the condition (i is not equal to 5) is true
i = 0
while i != 5:
    print('i is: ', i)
    i += 1
print(i, ' is equal to 5')


# for over a range: the loop variable is rebound on every iteration, so incrementing
# it inside the body would not influence the iteration. After the loop it holds the
# last value produced by range(5), which is 4.
for j in range(5):
    print('j is:', j)
print(j, ' is the last value of range(5)')


# for over a list: after the loop the variable keeps the last element (4, not 5)
list1 = [0, 1, 2, 3, 4]
for i in list1:
    print('i is: ', i)
print(i, ' is the last element of list1')


In [None]:
# Index and value of a list
# index : value = 0:1, 1:2, 2:3, 3:4, 4:5
list2 = [1, 2, 3, 4, 5]
for index in range(len(list2)):
    print(index, " : ", list2[index])
print('')

# For dictionaries
# A for loop gives access to every key and its value
dictionary = {'spain': 'madrid', 'france': 'paris'}
for key in dictionary:
    print(key, " : ", dictionary[key])
print('')

# For pandas, iterrows() yields (index, row) pairs; only the first two rows are shown
first_two_rows = data[['weighted_price_mean']].head(2)
for index, value in first_two_rows.iterrows():
    print(index, " : ", value)

In [None]:
# USER DEFINED FUNCTION
#
# tuple: sequence of immutable python objects
#   - its values cannot be modified
#   - a tuple uses parentheses, e.g. tuple_ = (1, 2, 3)
#   - it can be unpacked into several variables, e.g. a, b, c = tuple_

def tuple_ex():
    """Return a 2-tuple: the 'min' and the 'max' aggregation of weighted_price_mean in the global `data`."""
    lowest = data.agg({'weighted_price_mean': ['min']})
    highest = data.agg({'weighted_price_mean': ['max']})
    return lowest, highest

# Unpack the returned tuple into two names
min_mean, max_mean = tuple_ex()

for aggregate in (min_mean, max_mean):
    print(aggregate)



In [None]:
# NESTED function
def min_max_average():
    """Print the min and max of weighted_price_mean (global `data`) and return their midpoint."""

    def tuple_ex():
        """Return (min, max) of weighted_price_mean after printing the tuple."""
        prices = data.weighted_price_mean
        t = (prices.min(), prices.max())
        print("t is a tuple and values are : ", t)
        return t

    bounds = tuple_ex()
    min_mean = bounds[0]
    max_mean = bounds[1]
    print("minimum mean is : ", min_mean)
    print("maximum mean is : ", max_mean)

    midpoint = (min_mean + max_mean) / 2
    return midpoint

print("Average price is : ", min_max_average())
  

In [None]:
# flexible arguments *args --> any number of positional parameters can be passed
def f(*args):
    """Print every positional argument on its own line."""
    for position in range(len(args)):
        print(args[position])
        
f(data.weighted_price_mean.sum()) #we can send one or more(below) parameters
print(" ")
f(data.weighted_price_mean.min(),data.weighted_price_mean.max(),data.weighted_price_mean.mean())
print("")


# flexible arguments **kwargs (a dictionary) --> again any number of parameters can be passed
def g(**kwargs):
    """Print a running counter followed by each key/value pair, stopping after the third pair."""
    for count, (key, value) in enumerate(kwargs.items(), start=1):
        print(count)
        print(key, " ", value)
        # A single keyword argument is one pair even if its value holds many rows,
        # so the limit of 3 only matters when at least three keywords are passed.
        if count == 3:
            break

g(montly_weighted_price_mean = data.weighted_price_mean.head(10))

In [None]:
# lambda function: a function written as a single expression
square = lambda x: pow(x, 2)          # x is the name of the argument
print(square(5))
# NOTE(review): despite its name, tot computes x - y + z, not a plain total -- confirm intent
tot = lambda x, y, z: (x - y) + z     # x, y, z are the names of the arguments
print(tot(3, 4, 5))

In [None]:
# ANONYMOUS FUNCTION
# A lambda is an anonymous function; here it is passed straight to map() without being given a name.

# map(func, seq): applies func to every item of seq and returns an iterator

number_list = [1,2,3]
y = map(lambda x:x**2,number_list)
print(list(y))   # list() consumes the iterator, so y is exhausted afterwards

In [None]:
# ITERATORS
# iteration example

name = "ronaldo"
it = iter(name)

# Pull the first three characters one at a time
for step in range(3):
    print(next(it))

# Unpack whatever is left in the iterator into a single print call
print(*it)

In [None]:
# zip example: pair up the elements of two lists
list1 = [1, 2, 3, 4]
list2 = [5, 6, 7, 8]
z = zip(list1, list2)
print(z)  # a zip object is lazy, so only its repr (with an address) is shown

# Materialise the pairs; this exhausts z
z_list = [pair for pair in z]
print(z_list)

In [None]:
# zip(*pairs) reverses the zip: it regroups the first and the second items of every pair
un_zip = zip(*z_list)
un_list1, un_list2 = tuple(un_zip)  # each unzipped group is a tuple
for unzipped in (un_list1, un_list2):
    print(unzipped)
print(type(un_list2))

In [None]:
# Example of list comprehension: build a new list by adding 1 to every element
num1 = [1, 2, 3]
num2 = [value + 1 for value in num1]
print(num2)

In [None]:
# Conditionals on iterable:
#   n == 10 -> n squared
#   n < 7   -> n - 5
#   else    -> n + 5
num1 = [5, 10, 15, 20]
num2 = [
    n ** 2 if n == 10 else (n - 5 if n < 7 else n + 5)
    for n in num1
]
print(num2)

In [None]:
# Back to the btc data for one more classification example:
# label every monthly mean as "higher" or "lower" than the all-time mean.
# The threshold is computed once and reused for the column and for the comparison.
threshold = data.weighted_price_mean.mean()
data['threshold'] = threshold   # broadcast the scalar to every row

# Vectorised comparison; values equal to the threshold are labelled "lower"
data["weighted_price_level"] = (data.weighted_price_mean > threshold).map({True: "higher", False: "lower"})
data.loc[:, ["weighted_price_mean", "threshold", "weighted_price_level"]]

In [None]:
# CLEANING DATA

#DIAGNOSE DATA for CLEANING

In [None]:
data.head() #first five data

In [None]:
data.tail() #last five data

In [None]:
# columns gives column names of features
data.columns

In [None]:
# shape gives number of rows and columns in a tuble
data.shape

In [None]:
# info gives data type like dataframe, number of sample or row, number of feature or column, feature types and memory usage
data.info()

In [None]:
# EXPLORATORY DATA ANALYSIS
# value_counts(): frequency counts
# outliers: values that are considerably higher or lower than the rest of the data

# Frequency of each weighted_price_level label.
# dropna=False makes NaN entries show up as their own row if there are any.
print(data['weighted_price_level'].value_counts(dropna=False))

# weighted_price_level was generated as either "higher" or "lower" for every row,
# so no NaN row is expected in the output above.

In [None]:
# You can see sum basic information about data with describe() method 

data.describe() #ignore null entries

In [None]:
#VISUAL EXPLORATORY DATA ANALYSIS

# Box plots: visualize basic statistics like outliers, min/max or quantiles

#What is quantile?
#1,4,5,6,8,9,11,12,13,14,15,16,17
#The median is the number that is in middle of the sequence. In this case it would be 11.
#The lower quartile is the median in between the smallest number and the median i.e. in between 1 and 11, which is 6.
#The upper quartile, you find the median between the median and the largest number i.e. between 11 and 17, which will be 14 according to the question above.

# For example: compare weighted_price_mean of BTC that are weighted_price_level is Higher or Lower
# Black line at top is max
# Blue line at top is 75%
# Green line is median (50%)
# Blue line at bottom is 25%
# Black line at bottom is min
# There are no outliers

data.boxplot(column='weighted_price_mean',by = 'weighted_price_level')


In [None]:
# TIDY DATA
# We tidy data with melt(). Describing melt is confusing. Therefore lets make example to understand it.

data_new = data.head()    # I only take 5 rows into new data
data_new

In [None]:
# Melt the 5-row sample
# id_vars    = column(s) kept as they are
# value_vars = column(s) that get unpivoted into (variable, value) rows
melted = data_new.melt(
    id_vars='weighted_price_mean',
    value_vars=['volume_btc_sum', 'volume_currency_sum'],
)
melted

In [None]:
#PIVOTING DATA
#Reverse of melting.

# index   -> 'weighted_price_mean' (the column that was kept as id_vars when melting)
# columns -> the distinct entries of 'variable' become the column names
# values  -> the entries of 'value' fill those columns

melted.pivot(index = 'weighted_price_mean', columns = 'variable',values='value')

In [None]:
# CONCATENATING DATA
# We can concatenate two dataframe

# Firstly lets create 2 data frame
data1 = data.head()
data2= data.tail()
conc_data_row = pd.concat([data1,data2],axis =0,ignore_index =True) #axis=0 : adds dataframes in row 
conc_data_row

In [None]:
data1 = data['weighted_price_mean'].head()
data2= data['volume_btc_sum'].head()
conc_data_col = pd.concat([data1,data2],axis =1) # axis = 1 : adds dataframes in column
conc_data_col

In [None]:

# DATA TYPES
# There are 5 basic data types: object(string),boolean, integer, float and categorical.
# We can make conversion data types like from str to categorical or from int to float
# Why is category important:
# make dataframe smaller in memory
# can be utilized for anlaysis especially for sklearn(we will learn later)

In [None]:
# type(data) this is about all data like DataFrame

data.dtypes # this is about columns


In [None]:
# Convert object (str) to categorical and float to int in a single astype call
target_dtypes = {'weighted_price_level': 'category', 'threshold': 'int'}
data = data.astype(target_dtypes)

In [None]:
# As you can see weighted_price_level is converted from object to category
# And threshold is converted from float to int
data.dtypes

In [None]:
# MISSING DATA and TESTING WITH ASSERT

# If we encounter with missing data, what we can do:

# leave as is
# drop them with dropna()
# fill missing value with fillna()
# fill missing values with test statistics like mean
# Assert statement: check that you can turn on or turn off when you are done with your testing of the program

In [None]:
# Lets look at does btc data have nan value
# As you can see there are 106 entries. However there is no null object because of i clean them before
data.info()

In [None]:
# Lets check the threshold column
data["threshold"].value_counts(dropna =False)
# threshold was filled with one constant for every row and then cast to int, so there is no NaN entry

In [None]:
data["test"] = [None if i<10 else 1 for i in data.weighted_price_mean] #added sum NaN values for test column

In [None]:
data["test"].value_counts(dropna = False)  #lets see count of values with NaN values

In [None]:
#  Lets check with assert statement
# Assert statement:
assert 1==1 # return nothing because it is true

In [None]:
# In order to run all code, we need to make this line comment
# assert 1==2 # return error because it is false you can check and see error

In [None]:
# In order to run all code, we need to make this line comment
#assert data['test'].notnull().all() # returns error  because it is false


In [None]:
assert data['weighted_price_mean'].notnull().all() # returns nothing because we drop nan values

In [None]:
# Option 1 (not used): drop the rows whose "test" value is NaN
# data = data.dropna(subset=["test"])

# Option 2 (used here): replace NaN with the placeholder string 'empty'.
# The result is assigned back to the column: calling fillna(inplace=True) on
# data["test"] is chained assignment and is not guaranteed to modify `data`.
data["test"] = data["test"].fillna('empty')
data["test"].value_counts(dropna=False)  # count of every value, 'empty' included

In [None]:
assert data['test'].notnull().all() # returns nothing  because it is true

In [None]:
# Data frames can be built from csv files, as done earlier,
# and they can also be built from dictionaries.

# data frame from dictionary: column label -> list of column values
country = ["Spain", "France", "Germany", "Turkey"]
team = ["Barcelona", "PSG", "Bayern", "Fenerbahce"]
list_label = ["country", "team"]
list_col = [country, team]
zipped = [(label, column) for label, column in zip(list_label, list_col)]
data_dict = {label: column for label, column in zipped}
df = pd.DataFrame(data=data_dict)
df


In [None]:
# Add new columns
df["fan"] = ["11","12","13","35"]
df

In [None]:
# Broadcasting
df["income"] = 0 #Broadcasting entire column
df

VISUAL EXPLORATORY DATA ANALYSIS

* Plot
* Subplot
* Histogram:
    * bins: number of bins
    * range (tuple): min and max values of bins
    * density (boolean; named `normed` in old Matplotlib releases): normalize or not
    * cumulative(boolean): compute cumulative distribution

In [None]:
# Plotting all data 
data1 = data.loc[:,["weighted_price_mean","volume_btc_sum","volume_currency_sum"]]
data1.plot()
plt.show()

In [None]:
# subplots
data1.plot(subplots = True)
plt.show()

In [None]:
# scatter plot  
data1.plot(kind = "scatter",x="volume_currency_sum",y = "weighted_price_mean")
plt.show()

In [None]:
# histogram plot  
data1.plot(kind = "hist",y = "weighted_price_mean",bins = 50,range= (0,20000))
plt.show()

In [None]:
# Histogram subplots: non cumulative on top, cumulative below
fig, axes = plt.subplots(nrows=2, ncols=1)
for panel, is_cumulative in zip(axes, (False, True)):
    data1.plot(
        kind="hist",
        y="weighted_price_mean",
        bins=50,
        range=(0, 20000),
        ax=panel,
        cumulative=is_cumulative,
    )
#plt.savefig('graph.png')
plt.show()


# INDEXING PANDAS TIME SERIES
* datetime = object
* parse_dates(boolean): Transform date to ISO 8601 (yyyy-mm-dd hh:mm:ss ) format

In [None]:
time_list = ["1992-03-08", "1992-04-12"]
print(type(time_list[1]))  # the dates are plain strings at this point

# Convert the strings so that we get datetime objects instead
datetime_object = pd.to_datetime(time_list)

# Type first, then an empty line, then the converted values
print(type(datetime_object), "", datetime_object, sep="\n")

In [None]:
# as you can see we have PeriodIndex
type(data.index)

# you can see here PeriodIndex again at head of output
# data.info() 

In [None]:
data.loc["2018-09"]

In [None]:
# close warning
# NOTE(review): this silences every warning for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

# For practice, take the first five rows of the btc data and attach a list of dates.
# An explicit copy is taken so the new column is written to an independent frame
# and not to a slice of `data`.
data2 = data.head().copy()
date_list = ["2012-01-30", "2012-01-31", "2013-01-31", "2013-02-28", "2013-04-30"]
datetime_object = pd.to_datetime(date_list)
data2["date"] = datetime_object

# Make the date column the index
data2 = data2.set_index("date")
data2

In [None]:
# Now we can select according to our DateIndex

print(data2.loc["2012-01-30"])
print(data2.loc["2012-01-31":"2013-02-28"])

# RESAMPLING PANDAS TIME SERIES
* Resampling: statistical method over different time intervals
    *   Needs string to specify frequency like "M" = month or "A" = year
* Downsampling: reduce date time rows to slower frequency like from daily to weekly
* Upsampling: increase date time rows to faster frequency like from daily to hourly
* Interpolate: Interpolate values according to different methods like ‘linear’, ‘time’ or ‘index’
    *   https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.interpolate.html

In [None]:
# We will use the data2 frame created in the previous part.
# Only numeric columns can be averaged: data2 also carries the label columns
# weighted_price_level and test, and recent pandas versions raise on them
# instead of silently dropping them, so select the numeric columns explicitly.
# NOTE(review): newer pandas releases prefer the alias "YE" over "A".
data2.select_dtypes(include="number").resample("A").mean()

In [None]:
# Resample by month. Only the numeric columns are averaged, because mean() is
# not defined for the label columns (weighted_price_level, test).
# NOTE(review): newer pandas releases prefer the alias "ME" over "M".
data2.select_dtypes(include="number").resample("M").mean()
# There are lots of NaN rows because data2 does not include all months

In [None]:
# With real data (not hand-made like data2) the gaps left by resampling can be
# filled by interpolation. Here the monthly means are interpolated linearly.
# Only the numeric columns are used: mean() is not defined for the label columns.
data2.select_dtypes(include="number").resample("M").mean().interpolate("linear")

In [None]:
a = [0, 1, 2, 3, 4]

# Each pass copies the current element into slot 0 and prints it, so the values
# 0, 1, 2, 3, 4 are printed and the list ends up as [4, 1, 2, 3, 4].
for position in range(len(a)):
    a[0] = a[position]

    print(a[0])

# MANIPULATING DATA FRAMES WITH PANDAS
  INDEXING DATA FRAMES
    * Indexing using square brackets
    * Using column attribute and row label
    * Using loc accessor
    * Selecting only some columns

In [None]:
data.info() # we have PeriodIndex

In [None]:
# Add a helper column numbering the rows 0..len(data)-1; it becomes the index later.
# The whole column is assigned in one vectorised step: writing element by element
# through data["new_index"][i] is chained assignment, and a hard-coded row count
# breaks as soon as the number of months changes.
data["new_index"] = np.arange(len(data))

data.head() # the new_index column has been added


In [None]:
data= data.set_index("new_index") #we are changing the index to new_index column
data.head()

In [None]:
# indexing using square brackets
data["weighted_price_mean"][1]   #second value will turn

In [None]:
# indexing using column attribute and row label
data.weighted_price_mean[1]

In [None]:
# using loc accessor
data.loc[1,["weighted_price_mean"]]

In [None]:
# Selecting only some columns
data[["weighted_price_mean","threshold","weighted_price_level"]]

# SLICING DATA FRAME
*     Difference between selecting columns
    *     Series and data frames
*     Slicing and indexing series
*     Reverse slicing
*     From something to end

In [None]:
data.head()

In [None]:
# Difference between selecting columns: series and dataframes
print(type(data["weighted_price_mean"]))     # series
print(type(data[["weighted_price_mean"]]))   # data frames

In [None]:
# Slicing and indexing series
data.loc[1:10,"weighted_price_mean":"volume_currency_sum"]   # 10 and "volume_currency_sum" are inclusive

In [None]:
# Reverse slicing 
data.loc[10:1:-1,"weighted_price_mean":"volume_currency_sum"] 

In [None]:
# From something to end
data.loc[1:10,"threshold":] 


# FILTERING DATA FRAMES
* Creating boolean series
* Combining filters
* Filtering a column based on others

In [None]:
# Creating boolean series
boolean = data.weighted_price_mean > 10000  

data[boolean]  #returns true values

In [None]:
# Combining filters
first_filter = data.weighted_price_mean > 10000
second_filter = data.volume_btc_sum > 400000

data[first_filter & second_filter]  # apply 2 filter with and condition

In [None]:
# Filtering column based others
data.weighted_price_mean[data.volume_btc_sum>800000]

# TRANSFORMING DATA
* Plain python functions
* Lambda function: to apply arbitrary python function to every element
* Defining column using other columns

In [None]:
# Plain python functions
def div(n):
    """Return ``n`` divided by 2."""
    half = n / 2
    return half

data.weighted_price_mean.apply(div)

In [None]:
# Or we can use lambda function
data.weighted_price_mean.apply(lambda n : n/2)

In [None]:
# Defining column using other columns
data["total_volume"] = data.volume_btc_sum + data.volume_currency_sum
data.head()

# INDEX OBJECTS AND LABELED DATA
* index: sequence of label

In [None]:
# our index name is this:
print(data.index.name)

# lets change it
data.index.name = "index_name"
data.head()

In [None]:
# Overwrite index

# To modify an index, all of its labels have to be replaced at once.
# Work on a copy so that `data` keeps its own index.
data1 = data.copy()

# Let the index start from 100. It is not a remarkable change, just an example.
# The end of the range is derived from the frame length instead of being hard-coded.
data1.index = range(100, 100 + len(data1))
data1.head()

In [None]:
# We can make one of the column as index. We actually did it at the beginning of manipulating data frames with pandas section
# It was like this
# data= data.set_index("new_index")
# also you can use 
# data.index = data["new_index"]

# HIERARCHICAL INDEXING
* Setting indexing

In [None]:
# lets read data frame one more time to start from beginning
data.head()
# As you can see there is index. However we want to set one or more column to be index

In [None]:
# Setting index : the test column is the outer level, weighted_price_level is the inner level
data1 = data.set_index(["test","weighted_price_level"]) 
data1.head(100)
# data1.loc["empty","lower"] # how to select rows by (outer, inner) index labels


# PIVOTING DATA FRAMES
* pivoting: reshape tool

In [None]:
# Small hand-made frame used by the pivot / stack / melt / groupby examples
dic = dict(
    treatment=["A", "A", "B", "B"],
    gender=["F", "M", "F", "M"],
    response=[10, 45, 5, 9],
    age=[15, 4, 72, 65],
)
df = pd.DataFrame(data=dic)
df

In [None]:
# pivoting
df.pivot(index="treatment",columns = "gender",values="response")

# STACKING and UNSTACKING DATAFRAME
* deal with multi label indexes
* level: position of unstacked index
* swaplevel: change inner and outer level index position

In [None]:
df1 = df.set_index(["treatment","gender"])
df1
# lets unstack it

In [None]:
# level determines indexes
df1.unstack(level=0)

In [None]:
df1.unstack(level=1)

In [None]:
# change inner and outer level index position
df2 = df1.swaplevel(0,1)
df2

# MELTING DATA FRAMES
* Reverse of pivoting

In [None]:
df

In [None]:
# df.pivot(index="treatment",columns = "gender",values="response")
pd.melt(df,id_vars="treatment",value_vars=["age","response"])

# CATEGORICALS AND GROUPBY

In [None]:
df

In [None]:
# Per treatment, take the mean of the numeric features (mean is an aggregation / reduction method).
# numeric_only=True leaves out the text column gender, for which a mean is not defined;
# recent pandas versions raise on it instead of silently dropping it.
df.groupby("treatment").mean(numeric_only=True)
# there are other methods like sum, std, max or min

In [None]:
# we can only choose one of the feature
df.groupby("treatment").age.max() 

In [None]:
# Or we can choose multiple features
df.groupby("treatment")[["age","response"]].min() 

In [None]:
df.info()
# As you can see, gender has dtype object.
# Columns used for grouping can be converted to the categorical dtype,
# because categorical data uses less memory and speeds up operations like groupby:
#df["gender"] = df["gender"].astype("category")
#df["treatment"] = df["treatment"].astype("category")
#df.info()