This notebook deals with creating sample data from train data
1. Read Pickel file
2. Create sample data frame, write to sample_Data directory
3. Start with Features on That Implementation

In [1]:
import pandas as pd
import datetime
import os
import gc

In [2]:
#FILES
TRAIN_FILE_RAW = 'input/train_raw.csv'
TEST_FILE_RAW = 'input/test_raw.csv'

#EDA CONFIG
EDA_CONFIG=True
EDA_NROWS = 1000000
comment_for_file_name="" #'_'+'comment'

In [3]:
def is_train_file(file):
    return 'train' in file
def get_dtypes(file):
    dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'click_id'      : 'uint32'}
    if is_train_file(file):
        dtypes['is_attributed']='uint8'
    return dtypes
def parsed_file_name(raw_file):
    return raw_file.replace('raw','parsed'+comment_for_file_name)
def parse_raw_file(raw_file):
    parsed_file=None
    if 'raw' in raw_file:
        parsed_file=raw_file.replace('raw','parsed')
    else:
        print(raw_file,' is not a raw file')
        return
    if os.path.isfile(parsed_file):
        os.remove(parsed_file)
        print('deleting old ',parsed_file)
        
    df = pd.read_csv(raw_file,parse_dates=['click_time'])
    print('parsing click_time...')
    df['click_time']=pd.to_datetime(df['click_time']) #convert the click_time 
    print('parsing click_date...')
    df['click_date'] = df['click_time'].dt.date
    print('parsing hour...')
    df['hour'] = pd.to_datetime(df.click_time).dt.hour.astype('uint8')
    print('parsing day')
    df['day'] = pd.to_datetime(df.click_time).dt.day.astype('uint8')
    print('parsing minute')
    df['minute'] = pd.to_datetime(df.click_time).dt.minute.astype('uint8')
    return df

In [9]:
def parse_file(file_name):
    if file_name is None:
        return 'NO FILE NAME'
    print('reading raw file...',file_name)
    df_parsed=parse_raw_file(file_name)
    print('head of file')
    df_parsed.head()
    print('writing file',parsed_file_name(file_name))
    df_parsed.to_csv(parsed_file_name(file_name),index=False)
    return parsed_file_name(file_name),df_parsed

In [5]:
def get_sample_filename(file):
    return file.replace('parsed','sample'+comment_for_file_name)
def get_sample(df_parsed,file):
    if(df_parsed) is None or (file is None):
        return 'GOT NONE, please assign'
    n=None
    if EDA_CONFIG:
        n=EDA_NROWS
        path='eda/'+get_sample_filename(file)
    print('sampling...')
    df_sample=df_parsed.sample(n=EDA_NROWS)
    df_sample_to_write=df_sample.reset_index(drop=True)
    print('writing...')
    df_sample_to_write.to_csv(path,index=False)
    return path,df_sample_to_write

In [12]:
###############
file= TEST_FILE_RAW #TRAIN_FILE_RAW


In [13]:
parsed_file,df_parsed = parse_file(file)
sample_file,df_sample=get_sample(df_parsed,parsed_file)
print('done')

reading raw file... input/test_raw.csv
deleting old  input/test_parsed.csv
parsing click_time...
parsing click_date...
parsing hour...
parsing day
parsing minute
head of file
writing file input/test_parsed.csv
sampling...
writing...
done


In [8]:
print('done')

done


In [10]:
df_sample.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,click_date,hour,day,minute
0,46371,12,1,14,481,2017-11-08 04:15:09,,0,2017-11-08,4,8,15
1,46680,11,1,19,325,2017-11-08 11:05:23,,0,2017-11-08,11,8,5
2,28564,18,1,13,107,2017-11-06 17:35:13,,0,2017-11-06,17,6,35
3,44527,6,1,15,459,2017-11-08 13:01:18,,0,2017-11-08,13,8,1
4,12505,2,1,25,452,2017-11-08 06:18:29,,0,2017-11-08,6,8,18


In [16]:
df_parsed.hour.unique()

array([ 4,  5,  6,  9, 10, 11, 13, 14, 15], dtype=uint64)

In [17]:
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 11 columns):
click_id      1000000 non-null int64
ip            1000000 non-null int64
app           1000000 non-null int64
device        1000000 non-null int64
os            1000000 non-null int64
channel       1000000 non-null int64
click_time    1000000 non-null datetime64[ns]
click_date    1000000 non-null object
hour          1000000 non-null uint8
day           1000000 non-null uint8
minute        1000000 non-null uint8
dtypes: datetime64[ns](1), int64(6), object(1), uint8(3)
memory usage: 63.9+ MB


In [18]:
df_parsed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18790469 entries, 0 to 18790468
Data columns (total 11 columns):
click_id      int64
ip            int64
app           int64
device        int64
os            int64
channel       int64
click_time    datetime64[ns]
click_date    object
hour          uint8
day           uint8
minute        uint8
dtypes: datetime64[ns](1), int64(6), object(1), uint8(3)
memory usage: 1.2+ GB
