Outlier Study
========
Here we first read the CSV file containing the top 37 coin pairs ranked by hourly trades, and from it we obtain the list of those pairs.

In [1]:
import pandas as pd
import json
import datetime
import numpy as np

In [2]:
# Load the raw trade data. The context manager guarantees that the file
# handle is closed once parsing finishes (the handle was previously left open).
with open("input_csv/ed_trade_data.json") as fp:
    jsondata = json.load(fp)
# The top-level keys of the JSON document are used as the pair names.
pairlist = jsondata.keys()

In [None]:
# Display the keys of jsondata collected in pairlist.
pairlist

In [None]:
def DataManipulation(df_in):
    ''' Build a numeric, time-indexed frame from a dataframe of raw trades.

    The 'amount', 'amountBase' and 'price' columns are cast to float and the
    'date' column is parsed with pd.to_datetime to form the index, which is
    named 'datetime'. Every other column of df_in is dropped.

    All conversions are done on new objects, so df_in itself is left
    unchanged (no 'datetime' column is added to it and its columns keep
    their original dtypes).

    Arguments:
        df_in -- dataframe holding 'date', 'price', 'amount' and
                 'amountBase' columns.

    Returns:
        A new dataframe with the columns ['amount', 'amountBase', 'price'],
        indexed by 'datetime'.
    '''

    numeric_columns = df_in[['amount', 'amountBase', 'price']].astype(float)
    timestamps = pd.to_datetime(df_in['date'])
    # assign() returns a new frame, so the caller's dataframe is never touched.
    required_dataframe = numeric_columns.assign(datetime=timestamps).set_index('datetime')
    return required_dataframe

In [None]:
def DetectOutlier(input_df, col_name):
    ''' Return the rows of input_df whose col_name value is an outlier.

    The 25% and 75% quantiles of the column are computed and the fences are
    placed 1.5 times the interquartile range below the first quartile and
    above the third quartile. Rows strictly below the lower fence or strictly
    above the upper fence are returned; all other rows are left out. '''

    values = input_df[col_name]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = (third_quartile - first_quartile)  # interquartile range
    too_low = values < first_quartile - (1.5 * spread)
    too_high = values > third_quartile + (1.5 * spread)
    return input_df[too_low | too_high]

In [None]:
def ProcessPairlist(data, pair_name):
    ''' Run the outlier study for a single pair and return its summary row.

    data is turned into a dataframe, passed through DataManipulation, and the
    result is scanned by DetectOutlier on the 'price' column. The outlier rows
    are written to a per-pair CSV file under output_files/outlier_study/.
    The returned list is the pair name followed by the values produced by
    PrepareSummary for the raw dataframe and the outlier dataframe. '''

    raw_trades = pd.DataFrame(data)
    price_outliers = DetectOutlier(DataManipulation(raw_trades), 'price')
    report_path = 'output_files/outlier_study/OutlierReport_' + pair_name + '.csv'
    price_outliers.to_csv(report_path)
    return [pair_name] + PrepareSummary(raw_trades, price_outliers)

In [None]:
def PrepareSummary(df1, df2):
    ''' Summarise how large df2 is relative to df1, by row count.

    Arguments:
        df1 -- the full dataframe (its row count is the denominator).
        df2 -- the dataframe of outliers (its row count is the numerator).

    Returns:
        A list [rows in df1, rows in df2, percentage], where percentage is
        rows in df2 divided by rows in df1, times 100. When df1 has no rows
        the percentage is reported as 0.0 instead of raising
        ZeroDivisionError. '''

    total_rows = df1.shape[0]
    outlier_rows = df2.shape[0]
    if total_rows:
        relative_percentage = (float(outlier_rows) / float(total_rows)) * 100
    else:
        # Nothing to compare against: an empty input has no outliers.
        relative_percentage = 0.0
    return [total_rows, outlier_rows, relative_percentage]

In [None]:
def ReportMaking(summary_list, old_report):
    ''' Combine the per-pair outlier summaries with an existing report.

    Arguments:
        summary_list -- list of rows of the form
                        [pair, total_data, outliers_in_data, %_of_outliers].
        old_report   -- dataframe holding at least the columns selected
                        below ('pair', 'min_date', ..., 'avg_price_of_unit_volume').

    Returns:
        A dataframe indexed by 'pair' that holds the selected old_report
        columns followed by the outlier summary columns, sorted by
        '%_of_outliers'. Pairs of old_report without a summary row are kept
        (left merge) with missing values in the summary columns.

    The column names are given when the summary frame is built, so an empty
    summary_list yields an empty, correctly labelled frame instead of a
    KeyError. The frame is sorted once, after the merge. '''

    report = pd.DataFrame(
        summary_list,
        columns=['pair', 'total_data', 'outliers_in_data', '%_of_outliers'],
    )
    initial_report = old_report[['pair', 'min_date', 'max_date', 'total_time', 'no_of_trades', 'avg_hourly_trades', 'volume_traded', 'total_turnover', 'avg_price_of_unit_volume']]
    summary_report = pd.merge(initial_report, report, how='left', on='pair')
    final_report = summary_report.set_index('pair').sort_values(by='%_of_outliers')
    return final_report

In [None]:
# Read the previously generated report of top hourly traded pairs; its 'pair'
# column decides which pairs are processed.
top_hourly_traded_pairs = pd.read_csv('output_files/initial_study/top_hourly_traded_pairs.csv')
PairList = list(top_hourly_traded_pairs['pair'])
# One summary row per pair, as returned by ProcessPairlist.
info_outliers = []
for pair in PairList:
    
    # Each pair name is looked up in jsondata to get that pair's trade records.
    TradeData = jsondata[pair]
    SummaryOutliers = ProcessPairlist(TradeData, pair)
    info_outliers.append(SummaryOutliers)

# Merge the outlier summaries into the report that was read above.
SummaryReport = ReportMaking(info_outliers, top_hourly_traded_pairs)


In [None]:
def GetStudyData(TradeData):
    ''' Clamp trade prices to the interquartile band and recompute amountBase.

    The 25% and 75% quantiles of the 'price' column are computed. Prices below
    the first quartile are raised to it, prices above the third quartile are
    lowered to it, and prices in between are kept. 'amountBase' is then
    recomputed for every row as amount * (clamped) price.

    The comparison is done element-wise with clip(); a plain `if` on the
    price column raises ValueError because the truth value of a Series is
    ambiguous.

    Arguments:
        TradeData -- dataframe with numeric 'price' and 'amount' columns.

    Returns:
        A new dataframe with the adjusted 'price' and 'amountBase' columns;
        TradeData itself is not modified. '''

    q1 = TradeData['price'].quantile(0.25)
    q3 = TradeData['price'].quantile(0.75)
    study_data = TradeData.copy()
    study_data['price'] = study_data['price'].clip(lower=q1, upper=q3)
    study_data['amountBase'] = study_data['amount'] * study_data['price']
    return study_data

In [32]:
# Take the second record stored under the 'ETH_NEWB' key as a sample.
x = jsondata['ETH_NEWB'][1]

In [3]:
class Pair(object):
    ''' Remembers the keys of a mapping passed to get_list. '''

    def __init__(self):
        # Nothing is stored until get_list() is called.
        self.pairs = None

    def get_list(self, data):
        ''' Store the result of data.keys() on the instance and return it. '''
        pair_names = data.keys()
        self.pairs = pair_names
        return pair_names

class Trade(object):
    ''' A single trade record whose fields are exposed as attributes. '''

    def __init__(self, _dict):
        ''' Copy every key/value pair of _dict onto the instance as an attribute. '''
        vars(self).update(_dict)

    def __repr__(self):
        ''' Return "Trade(...)" listing the nine trade fields in a fixed order. '''
        field_order = ('amount', 'amountBase', 'buyer', 'date', 'price',
                       'seller', 'side', 'tokenAddr', 'txHash')
        field_values = tuple(getattr(self, field) for field in field_order)
        return "Trade(%s, %s, %s, %s, %s, %s, %s, %s, %s)" % field_values
    


In [33]:
# Display the sample record held in x.
x

{u'amount': u'14',
 u'amountBase': u'0.0005',
 u'buyer': u'0x3075f7f189912f4f425dd7021b9c396434197477',
 u'date': u'2017-11-02T12:58:14.000Z',
 u'price': u'0.00003571428571428571',
 u'seller': u'0xb1dc1e7675ded0f5185f2e005287ca8d6cb03906',
 u'side': u'buy',
 u'tokenAddr': u'0x814964b1bceaf24e26296d031eadf134a2ca4105',
 u'txHash': u'0x02c355864b3ab698edbe91c91214558307c6206ab8ab8f7a8736fd1043e2c36e'}

In [34]:
# Wrap the sample record so that its keys become attributes.
trade = Trade(x)

In [35]:
# Display the 'amount' field of the wrapped record via attribute access.
trade.amount

u'14'

In [38]:
# Collect one (pair name, Trade) tuple for every record of every pair.
a = []
for i in pairlist:
    # Records stored in jsondata under this pair name.
    pairdata = jsondata[i]
    for j in pairdata:
        b = (i, Trade(j))
        a.append(b)
# Display the tuples as a two-column dataframe (column 0: pair, column 1: Trade).
pd.DataFrame(a)


Unnamed: 0,0,1
0,ETH_NEWB,"Trade(14, 0.0005, 0x4fda9e6bcd83af6c5cc771b5d2..."
1,ETH_NEWB,"Trade(14, 0.0005, 0x3075f7f189912f4f425dd7021b..."
2,ETH_NEWB,"Trade(11637, 0.4247505, 0x13e2c5f84bce6ff7b906..."
3,ETH_NEWB,"Trade(697, 0.018819, 0x13e2c5f84bce6ff7b906831..."
4,ETH_NEWB,"Trade(2499, 0.067473, 0x13e2c5f84bce6ff7b90683..."
5,ETH_NEWB,"Trade(700, 0.0189021, 0xafb7144110020e048d3272..."
6,ETH_NEWB,"Trade(30, 0.00108, 0x0b773d7819259d78757b26dc4..."
7,ETH_NEWB,"Trade(0, 0, 0xf36990a2cb4d609e0a7d4f3bf884c034..."
8,ETH_NEWB,"Trade(947, 0.025569, 0x13e2c5f84bce6ff7b906831..."
9,ETH_NEWB,"Trade(0, 0, 0xf36990a2cb4d609e0a7d4f3bf884c034..."


In [None]:
class Pair(object):
    ''' Remembers the keys of a mapping passed to get_list. '''

    def __init__(self):
        # Nothing is stored until get_list() is called.
        self.pairs = None

    def get_list(self, data):
        ''' Store the result of data.keys() on the instance and return it. '''
        pair_names = data.keys()
        self.pairs = pair_names
        return pair_names

In [None]:
# Display the keys of jsondata as returned by a fresh Pair instance.
Pair().get_list(jsondata)