In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the raw records from data_class.csv (path is relative to the working directory).
original_dataset = pd.read_csv('data_class.csv')

In [3]:
# Numeric columns to analyse: the two sales figures, followed by every
# (measure, tax head) combination, grouped by measure.
attrs = ['total_sales', 'exempt_sales'] + [
    tax_head + '_' + measure
    for measure in ('liability', 'cashsetoff', 'itc_claimed')
    for tax_head in ('sgst', 'cgst', 'igst', 'total')
]

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the selected columns.
original_dataset.loc[:, attrs].describe()

Unnamed: 0,total_sales,exempt_sales,sgst_liability,cgst_liability,igst_liability,total_liability,sgst_cashsetoff,cgst_cashsetoff,igst_cashsetoff,total_cashsetoff,sgst_itc_claimed,cgst_itc_claimed,igst_itc_claimed,total_itc_claimed
count,98310.0,98310.0,98310.0,98310.0,98310.0,98310.0,98310.0,98310.0,98310.0,98310.0,98310.0,98310.0,98310.0,98310.0
mean,69927080.0,1059383.0,348867.8,348836.0,244164.9,941868.7,50277.18,26986.32,33906.14,111169.6,270625.9,270986.9,333193.5,874806.4
std,19457310000.0,53972680.0,2232970.0,2233034.0,3421634.0,6331791.0,800538.8,342779.9,696670.4,1338797.0,1959146.0,1962724.0,3458046.0,6309488.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,151601.2,0.0,8613.875,8621.25,0.0,26542.8,0.0,0.0,0.0,0.0,3171.68,3172.37,0.0,19478.5
50%,570186.3,0.0,39949.0,39950.5,0.0,101365.6,0.0,0.0,0.0,50.0,27730.83,27709.5,0.0,92081.07
75%,1772633.0,0.0,127727.5,127727.5,12778.92,314649.8,2900.0,1800.0,0.0,12225.0,101804.8,101763.0,36929.75,289380.0
max,6100188000000.0,5898273000.0,172781600.0,172781600.0,306488300.0,415461800.0,72820420.0,32107380.0,63598600.0,90677310.0,215146000.0,215146000.0,207946700.0,637952500.0


In [5]:
# (rows, columns) of the table as loaded, before any filtering.
original_dataset.shape

(98310, 16)

In [6]:
# Discard every row whose total_liability is exactly zero, then report the new size.
is_zero_liability = original_dataset['total_liability'] == 0
zero_liability_labels = original_dataset.loc[is_zero_liability].index
dataset = original_dataset.drop(index=zero_liability_labels)
dataset.shape

(92025, 16)

In [7]:
# Discard every remaining row whose total_itc_claimed is exactly zero, then report the new size.
is_zero_itc = dataset['total_itc_claimed'] == 0
zero_itc_labels = dataset.loc[is_zero_itc].index
dataset = dataset.drop(index=zero_itc_labels)
dataset.shape

(85187, 16)

In [15]:
# Show the rows that report exactly zero total_sales.
dataset.loc[dataset['total_sales'].eq(0)]

Unnamed: 0,id,mnth,total_sales,exempt_sales,sgst_liability,cgst_liability,igst_liability,total_liability,sgst_cashsetoff,cgst_cashsetoff,igst_cashsetoff,total_cashsetoff,sgst_itc_claimed,cgst_itc_claimed,igst_itc_claimed,total_itc_claimed
16,4,32018,0.0,0.0,1485.0,1485.0,0.0,2970.0,1485.0,1485.0,0.0,2970.0,1485.00,1485.00,91746.00,94716.00
17,4,122017,0.0,0.0,1485.0,1485.0,0.0,2970.0,1485.0,1485.0,0.0,2970.0,1485.00,1485.00,0.00,2970.00
91,18,12018,0.0,0.0,1108.0,1108.0,0.0,2216.0,1108.0,1108.0,0.0,2216.0,536694.00,536694.00,307932.00,1381320.00
94,18,72017,0.0,0.0,79887.0,79887.0,0.0,159774.0,79887.0,79887.0,0.0,159774.0,28118.00,28118.00,74816.00,131052.00
100,18,102017,0.0,0.0,9022.0,9022.0,0.0,18044.0,9022.0,9022.0,0.0,18044.0,261395.00,261395.00,570835.00,1093625.00
158,25,92017,0.0,0.0,9000.0,9000.0,0.0,18000.0,9000.0,9000.0,0.0,18000.0,307091.00,307091.00,1535481.00,2149663.00
284,38,82017,0.0,0.0,600.0,600.0,0.0,1200.0,600.0,600.0,0.0,1200.0,265134.00,265134.00,7114.00,537382.00
285,38,52018,0.0,0.0,200.0,200.0,0.0,400.0,200.0,200.0,0.0,400.0,18123.00,18123.00,8262.00,44508.00
288,38,122017,0.0,0.0,315.0,315.0,0.0,630.0,315.0,315.0,0.0,630.0,23297.00,23297.00,35182.00,81776.00
289,38,22018,0.0,0.0,200.0,200.0,0.0,400.0,200.0,200.0,0.0,400.0,310639.00,310639.00,693.00,621971.00


### Performing Benford Analysis on individual attributes

In [8]:
from math import isfinite, log10


def _leading_digit(x):
    """Return the first significant decimal digit (1-9) of a positive finite number."""
    # repr() of a float is the shortest decimal string that round-trips, so the
    # value is never rounded up into the next decade. The previous '%e'
    # formatting kept only 6 decimals and turned 9.9999996 into '1.000000e+01'.
    # Stripping leading '0' and '.' handles values below 1 such as '0.0012'.
    return int(repr(float(x)).lstrip('0.')[0])


def benford_analysis(vals, plot=True):
    """Compare the leading-digit distribution of ``vals`` with Benford's law.

    Prints the expected and observed counts for leading digits 1-9 and the
    mean absolute deviation (MAD) between the two, and optionally plots both
    curves with matplotlib.

    Zeros, NaNs and infinities have no leading digit, so they are ignored and
    the expected counts are based on the number of usable values only.
    Negative values are analysed by their magnitude.

    Parameters
    ----------
    vals : iterable of numbers
        For example a pandas Series or a plain list.
    plot : bool, default True
        When True, draw the expected and observed counts as line plots.

    Returns
    -------
    float or None
        The MAD of the digit proportions, or None when ``vals`` contains no
        usable (finite, non-zero) value.
    """
    # Keep only values that actually have a leading digit; the sign is irrelevant.
    magnitudes = [abs(v) for v in vals if isfinite(v) and v != 0]
    n_valid = len(magnitudes)
    if n_valid == 0:
        print('No finite non-zero values to analyse')
        return None

    # Expected Benford frequencies, scaled to the number of usable values
    expected_freq = [round(n_valid * log10(1 + 1/d)) for d in range(1, 10)]

    # Get observed first digit frequencies (index 0 holds digit 1)
    observed_freq = [0] * 9
    for magnitude in magnitudes:
        observed_freq[_leading_digit(magnitude) - 1] += 1

    print('Expected frequencies: ', expected_freq)
    print('Observed frequencies: ', observed_freq)

    # Dividing the summed count differences by n_valid * 9 gives the mean
    # absolute deviation of the nine digit proportions.
    mean_abs_dev = sum(abs(obv - exp)
                       for obv, exp in zip(observed_freq, expected_freq)) / (n_valid * 9)
    print('Mean absolute deviation: %.6lf' % mean_abs_dev)

    if plot:
        plt.plot(range(1, 10), expected_freq, label='Expected')
        plt.plot(range(1, 10), observed_freq, label='Observed')
        plt.legend(loc='upper right')
        plt.show()

    return mean_abs_dev

In [None]:
# Run the Benford comparison once per selected column, under an upper-case heading.
for column_name in attrs:
    heading = column_name.upper()
    print('\t\t\t\t\t', heading)
    column_values = dataset[column_name]
    benford_analysis(column_values)
    print('\n')

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

In [None]:
# Regression inputs taken from the first 1000 rows:
# X is total_liability as a single-column 2-D array, Y is sgst_liability as a 1-D array.
X = np.array(dataset['total_liability'].iloc[:1000]).reshape(-1, 1)
Y = np.array(dataset['sgst_liability'].iloc[:1000])

In [None]:
# Ordinary least-squares fit of Y on X.
# `normalize=True` was dropped: the argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, where passing it raises TypeError. For plain (unregularised)
# least squares the fitted predictions do not depend on that scaling.
model = LinearRegression().fit(X, Y)

In [None]:
# Training points as blue dots with the model's predictions drawn over them in red.
fig, ax = plt.subplots()
ax.plot(X, Y, 'b.')
ax.plot(X, model.predict(X), 'r')
plt.show()

In [None]:
def pred(x):
    """Evaluate the fitted line by hand: model.coef_ * x + model.intercept_."""
    return model.coef_ * x + model.intercept_

In [None]:
# Apply the hand-written line to the first four total_sales values and subtract those same values.
# NOTE(review): X and Y in this notebook are built from total_liability and sgst_liability,
# yet total_sales is used here both as the input and as the reference — confirm this is intended.
pred(np.array(dataset['total_sales'])[:4]) - dataset['total_sales'][:4]

In [None]:
# Predict from the first two total_sales values; reshape(-1, 1) turns them into a single-column 2-D array.
# NOTE(review): total_sales is not the column used to build X — confirm this input is intended.
model.predict(np.array(dataset['total_sales'][:2]).reshape(-1, 1))

In [None]:
# First two total_liability values of the filtered table.
dataset['total_liability'][:2]