In [1]:
import pandas as pd
import dask.dataframe as dd
import modin.pandas as mpd
import time
import ray
import numpy as np

# Import datasets using pandas, dask and modin

In [9]:
# Load the training data eagerly with pandas and time the read.
# time.perf_counter() is the interval timer meant for measuring elapsed time;
# time.time() is a wall clock that can jump if the system clock is adjusted.
train_start = time.perf_counter()
# read_csv already returns a DataFrame, so no pd.DataFrame(...) wrapper is needed.
# NOTE(review): the name `df_trian` looks like a typo for `df_train`; it is kept
# unchanged here in case other cells refer to it.
df_trian = pd.read_csv('train_new.csv')
train_end = time.perf_counter()
print('The time for importing trainning data is:',train_end - train_start)

The time for importing trainning data is: 0.6596784591674805


In [10]:
# Open the same CSV with dask and time the call.
# NOTE: per the dask documentation, dd.read_csv is lazy - it only samples the
# file to infer dtypes and builds a task graph; the rows are not loaded until a
# computation is triggered. The figure printed below is therefore not directly
# comparable with the eager pandas read.
# time.perf_counter() is used because it is the clock intended for intervals.
train_start = time.perf_counter()
df_train = dd.read_csv('train_new.csv')
train_end = time.perf_counter()
print('The time for importing trainning data is:',train_end - train_start)

The time for importing trainning data is: 0.0704045295715332


In [11]:
# Remove the 'Unnamed: 0' column (presumably an index that was written into the
# CSV - confirm against the file). errors='ignore' keeps the cell re-runnable:
# without it, executing the cell a second time fails because the column has
# already been dropped from df_train.
df_train = df_train.drop('Unnamed: 0', axis=1, errors='ignore')

In [12]:
def eda(df):
    """Print a quick exploratory summary of ``df``.

    Printed in order: the first two rows, the ``info()`` report, the
    ``describe()`` statistics, the per-column count of missing values and
    the number of rows.

    Parameters
    ----------
    df : DataFrame-like
        Any object providing ``head``, ``info``, ``describe``, ``isna`` and
        ``len()``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print(df.head(2),'\n')
    # info() writes its report itself and returns None, so it must not be
    # wrapped in print() - that would emit a stray "None" line.
    df.info()
    print(df.describe(),'\n')
    print(df.isna().sum())
    print('Number of data point is: ', len(df))



### Train utility file

In [13]:
%%writefile trainutility.py
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime 
import gc
import re

#path = 'C:\Users\shuny\OneDrive\Desktop\Data Glacier\Week 6\train_file.yamlm'
def read_config_file(filepath):
    '''
    Load a YAML configuration file.

    Parameters
    ----------
    filepath : str or path-like
        Location of the YAML file to read.

    Returns
    -------
    The document as parsed by ``yaml.safe_load``, or ``None`` when the
    content is not valid YAML (the parser error is logged in that case).

    Raises
    ------
    OSError
        If the file cannot be opened.
    '''
    # Decode as UTF-8 explicitly instead of relying on the platform default
    # encoding, which is not UTF-8 on every system (e.g. Windows).
    with open(filepath, 'r', encoding='utf-8') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            logging.error(exc)
            # Explicit so callers can see that a parse failure yields None.
            return None


def replacer(string, char):
    '''
    Collapse every run of two or more consecutive ``char`` into a single one.

    Parameters
    ----------
    string : str
        Text to clean up.
    char : str
        The character whose repeats are collapsed. It is matched literally,
        so regex metacharacters such as ``.``, ``|`` or ``+`` are safe.
        Intended for a single character.

    Returns
    -------
    str
        ``string`` with each run of repeated ``char`` reduced to one.
    '''
    # Escape the character so it is never interpreted as regex syntax.
    pattern = re.escape(char) + '{2,}'
    # A callable replacement inserts `char` verbatim; a plain string would be
    # parsed as a replacement template and break on a backslash.
    return re.sub(pattern, lambda _match: char, string)


def col_header_val(df,table_config):
    '''
    Standardise the column names of ``df`` and validate them against the
    expected columns listed in the configuration.

    Each column name is lower-cased, every non-word character is replaced
    by ``_``, leading/trailing underscores are stripped and runs of
    underscores are collapsed to one. The standardised names are written
    back to ``df.columns``, i.e. the caller's frame is renamed in place.

    Parameters
    ----------
    df : DataFrame
        Frame whose headers are standardised and checked.
    table_config : mapping
        Must contain a ``'columns'`` list of expected column names
        (strings); they are compared case-insensitively.

    Returns
    -------
    int
        1 when the standardised headers match the expected ones regardless
        of order, 0 otherwise (the differences are printed).
    '''
    # Raw-string patterns avoid the invalid '\w' escape of a plain string.
    # str() lets non-string labels be standardised too.
    df.columns = [
        re.sub(r'_{2,}', '_', re.sub(r'\W', '_', str(name).lower()).strip('_'))
        for name in df.columns
    ]

    # Only the names are needed for the comparison, so sort the labels
    # instead of reindexing (and copying) the whole frame.
    actual_col = sorted(df.columns)
    expected_col = sorted(name.lower() for name in table_config['columns'])

    # Equal sorted lists imply equal length, so no separate length check.
    if actual_col == expected_col:
        print("column name and column length validation passed")
        return 1

    print("column name and column length validation failed")
    # Sorted so the report is deterministic from run to run.
    mismatched_columns_file = sorted(set(actual_col).difference(expected_col))
    print("Following File columns are not in the YAML file",mismatched_columns_file)
    missing_YAML_file = sorted(set(expected_col).difference(actual_col))
    print("Following YAML columns are not in the file uploaded",missing_YAML_file)
    logging.info(f'df columns: {actual_col}')
    logging.info(f'expected columns: {expected_col}')
    return 0



Overwriting trainutility.py


### Train YAML file

In [44]:
%%writefile train_file.yaml
# Settings used by the notebook to locate, read and validate the training CSV.
# file_name and file_type are combined into the path "./<file_name>.<file_type>".
file_type: csv
# NOTE(review): dataset_name, outbound_delimiter and skip_leading_rows are not
# read by any code visible in this notebook - confirm whether they are still needed.
dataset_name: train_df_new
file_name: train_new
# Field separator passed to pandas.read_csv when the file is loaded.
inbound_delimiter: ","
outbound_delimiter: "|"
skip_leading_rows: 1
# Expected column headers. col_header_val lower-cases these before comparing
# them with the standardised headers of the file, so case does not matter.
columns: 
    - customer_ID
    - S_2
    - P_2
    - D_39
    - B_1
    - B_2

Overwriting train_file.yaml


In [45]:
# trainutility.py is written by the %%writefile cell earlier in the notebook,
# so it can only be imported after that cell has run.
import importlib

import trainutility as train_util

# A plain `import` returns the cached module when the kernel has already
# imported it, so edits written by %%writefile would be ignored; reload() makes
# sure the current version of the file is the one in use.
train_util = importlib.reload(train_util)
config_data = train_util.read_config_file("train_file.yaml")

In [46]:
# Spot-check one key of the parsed configuration.
config_data['inbound_delimiter']

','

In [47]:
# Display the whole parsed configuration.
config_data

{'file_type': 'csv',
 'dataset_name': 'train_df_new',
 'file_name': 'train_new',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'columns': ['customer_ID', 'S_2', 'P_2', 'D_39', 'B_1', 'B_2']}

In [48]:
# read the file using config file
file_type = config_data['file_type']
source_file = "./" + config_data['file_name'] + f'.{file_type}'
# The delimiter has to be passed as `sep=`: pandas 2.x accepts only the path
# positionally in read_csv, and older versions emit a FutureWarning.
df = pd.read_csv(source_file, sep=config_data['inbound_delimiter'])
df = df.drop(['Unnamed: 0.1','Unnamed: 0'],axis = 1)
# Keep as many leading columns as the config declares (six with the current
# YAML) instead of hard-coding the number here.
df = df.iloc[:, :len(config_data['columns'])]
df.head()



Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0,0.008724,1.006838
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0,0.004923,1.000653
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,3,0.021655,1.009672
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0,0.013683,1.0027
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0,0.015193,1.000727


In [49]:
# Validate the file headers against the YAML config (returns 1 on success,
# 0 on failure). Note: col_header_val also assigns to df.columns, so after this
# call the columns of `df` carry the lower-cased, standardised names.
train_util.col_header_val(df,config_data)

column name and column length validation passed


1

In [50]:
# Show both sides of the comparison: the frame's current column names and the
# names listed under 'columns' in the YAML config.
print("columns of files are:" ,df.columns)
print("columns of YAML are:" ,config_data['columns'])

columns of files are: Index(['customer_id', 's_2', 'p_2', 'd_39', 'b_1', 'b_2'], dtype='object')
columns of YAML are: ['customer_ID', 'S_2', 'P_2', 'D_39', 'B_1', 'B_2']


In [51]:
# Run the header validation and report the outcome: col_header_val returns 0
# on a mismatch, any other value counts as a pass.
if train_util.col_header_val(df, config_data) != 0:
    print("col validation passed")
else:
    print("validation failed")

column name and column length validation passed
col validation passed
