<a href="https://colab.research.google.com/github/ryanhao1115/ML-for-Fraud-Detection/blob/main/1_data_collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the data files used below are read from
# (and the final CSV is written to) a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Data collection and cleaning
1. Explore the datasets and do a first-round filter of the data fields.
2. Rename the columns.
3. Clean the data.
4. Combine the datasets into a single table.

In [2]:
import os

import pandas as pd

In [3]:
def import_data(filename, path='/content/drive/MyDrive/Colab Notebooks/finalproject/'):
  '''
  Load one monthly sales export into a pandas DataFrame.

  The sales data were exported from SAP into Excel files, one per month.

  Parameters
  ----------
  filename : str
      Name of the Excel file inside `path`, e.g. 'may.XLSX'.
  path : str, optional
      Directory that holds the exports. Defaults to the project folder on
      the mounted Google Drive, so existing one-argument calls are unchanged.

  Returns
  -------
  pandas.DataFrame
      The file's contents as read by `pd.read_excel` with default options.
  '''
  # os.path.join copes with `path` given with or without a trailing slash.
  file_path = os.path.join(path, filename)
  return pd.read_excel(file_path)

In [4]:
# Load the first export ('may.XLSX') as the raw starting frame.
df = import_data('may.XLSX')

In [5]:
def select_fields(df):
  '''
  Keep only the columns used in this project and give them short names.

  Columns are picked by their position in the raw export, so the input must
  have the same column layout as the files this was written for (at least
  87 columns); a narrower frame makes `.iloc` raise IndexError.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw export as returned by `import_data`. It is not modified.

  Returns
  -------
  pandas.DataFrame
      An independent copy holding the 20 selected columns, renamed.
  '''
  # (position in the raw export, new column name) -- kept as pairs so a
  # position and its name cannot drift out of sync.
  fields = [
      (0, 'distributor'),
      (3, 'sales'),
      (6, 'branch'),
      (16, 'inv_type'),
      (24, 'invoice_no'),
      (25, 'line_item'),
      (27, 'product_no'),
      (31, 'prod_cla'),
      (38, 'qty'),
      (40, 'total_amt'),
      (42, 'sale_price'),
      (43, 'status'),
      (50, 'ship_qty'),
      (53, 'cust_type'),
      (65, 'return'),
      (76, 'discount_app'),
      (77, 'list_price'),
      (80, 'invoice_date'),
      (83, 'ship_date'),
      (86, 'request_ship'),
  ]
  positions = [position for position, _ in fields]
  names = [name for _, name in fields]
  # Explicit copy: later column assignments on the result then never refer
  # back to the raw frame (avoids SettingWithCopyWarning downstream).
  selected = df.iloc[:, positions].copy()
  selected.columns = names
  return selected


In [6]:
# Reduce the raw frame to the selected, renamed columns.
df1 = select_fields(df)

In [7]:
def fillna_filter(df):
  '''
  Keep completed invoices only and fill the remaining missing values.

  The input frame is left untouched; all work happens on a copy, so the
  function can be re-run on the same input and gives the same result.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with the column names produced by `select_fields`; it must
      contain 'status', 'cust_type', 'return', 'discount_app', 'prod_cla'
      and 'ship_date'.

  Returns
  -------
  pandas.DataFrame
      New frame with only the rows whose status is '完成请求', with NaN
      in the columns listed above replaced by their defaults. The original
      row index labels are kept.
  '''
  fill_values = {
      'cust_type': 'No',          # NaN is treated as its own customer type
      'return': 'No',             # NaN means no return
      'discount_app': 'No',       # NaN means no discount
      'prod_cla': '9999',         # NaN is treated as its own product class
      'ship_date': '2019-12-31',  # not shipped by the end of the year
  }
  # Filter first (the status column is not one of the filled columns, so the
  # order does not change the result), then copy so the caller's frame is
  # never written to.
  completed = df[df['status'] == '完成请求'].copy()
  for column, value in fill_values.items():
    completed[column] = completed[column].fillna(value=value)
  return completed



In [None]:
# Clean the first file; the result seeds df_total, which later cells extend.
df_total = fillna_filter(df1)

In [9]:
# Inspect row count, dtypes and non-null counts after cleaning.
df_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86056 entries, 0 to 86070
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   distributor   86056 non-null  object        
 1   sales         86056 non-null  object        
 2   branch        86056 non-null  int64         
 3   inv_type      86056 non-null  object        
 4   invoice_no    86056 non-null  int64         
 5   line_item     86056 non-null  int64         
 6   product_no    86056 non-null  object        
 7   prod_cla      86056 non-null  object        
 8   qty           86056 non-null  int64         
 9   total_amt     86056 non-null  float64       
 10  sale_price    86056 non-null  float64       
 11  status        86056 non-null  object        
 12  ship_qty      86056 non-null  int64         
 13  cust_type     86056 non-null  object        
 14  return        86056 non-null  object        
 15  discount_app  86056 non-null  object

In [10]:
def main(filename, df_total):
  '''
  Run the full import -> select -> clean pipeline on one file and append
  the cleaned rows to the running table.

  Parameters
  ----------
  filename : str
      Name of the Excel export to load (passed to `import_data`).
  df_total : pandas.DataFrame
      Table accumulated so far; it is not modified.

  Returns
  -------
  pandas.DataFrame
      `df_total` followed by the cleaned rows of `filename`. Row index
      labels of both parts are kept as they are.
  '''
  cleaned = fillna_filter(select_fields(import_data(filename)))
  return pd.concat([df_total, cleaned], axis=0)

In [18]:
# Append the cleaned 'july2.XLSX' data to the running table.
# Caution: this cell feeds df_total back into itself, so running it twice
# appends the same file twice.
# NOTE(review): the saved row counts and execution numbers suggest this cell was
# also run with other filenames that are no longer shown -- list every file
# explicitly so Restart & Run All rebuilds the same table.
df_total = main('july2.XLSX',df_total)

In [19]:
# Latest invoice date currently in the combined table (sanity check on what has been loaded).
df_total['invoice_date'].max()

Timestamp('2019-07-31 00:00:00')

In [20]:
# Re-inspect row count, dtypes and non-null counts of the combined table.
df_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 248825 entries, 0 to 45763
Data columns (total 20 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   distributor   248825 non-null  object        
 1   sales         248825 non-null  object        
 2   branch        248825 non-null  int64         
 3   inv_type      248825 non-null  object        
 4   invoice_no    248825 non-null  int64         
 5   line_item     248825 non-null  int64         
 6   product_no    248825 non-null  object        
 7   prod_cla      248825 non-null  object        
 8   qty           248825 non-null  int64         
 9   total_amt     248825 non-null  float64       
 10  sale_price    248825 non-null  float64       
 11  status        248825 non-null  object        
 12  ship_qty      248825 non-null  int64         
 13  cust_type     248825 non-null  object        
 14  return        248825 non-null  object        
 15  discount_app  2488

In [None]:
# Peek at the last three rows of the combined table.
df_total.tail(3)

In [22]:
# Summary statistics of the combined table.
df_total.describe()

Unnamed: 0,branch,invoice_no,line_item,qty,total_amt,sale_price,ship_qty,list_price
count,248825.0,248825.0,248825.0,248825.0,248825.0,248825.0,248825.0,248825.0
mean,715065.076918,2266587000.0,122.754144,1.815529,4610.675,3198.007,1.852169,3619.291
std,56423.284561,534810700.0,204.562274,11.153265,28015.79,14382.95,10.292674,16345.24
min,72010.0,1103721000.0,10.0,-542.0,-4004956.0,-30695.69,0.0,-35607.0
25%,720008.0,2111335000.0,40.0,1.0,398.23,403.19,1.0,456.43
50%,720013.0,2111365000.0,80.0,1.0,2094.83,2123.89,1.0,2400.0
75%,720021.0,2111394000.0,130.0,1.0,4017.97,3628.32,1.0,4100.0
max,720037.0,4200000000.0,3960.0,1189.0,2051452.0,4004956.0,1189.0,4525600.0


In [23]:
## Export the combined table to 'sales.csv' in the project folder for the next steps.
## to_csv is called with its defaults, so the row index is written as the first column.
path = '/content/drive/MyDrive/Colab Notebooks/finalproject/'
file_csv = path + 'sales.csv'
df_total.to_csv(file_csv)