## Preprocessing
1. Loading files
2. Concatenation of train and test data
3. Deduplication (removal of duplicate columns)

In [None]:
import sys
import time
import numpy as np
import pandas as pd
from scipy.stats import skew, boxcox
from sklearn.preprocessing import StandardScaler
from pandas.core.common import array_equivalent
import pickle
import gc
from datetime import datetime

def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since it was started.

    Called without a (truthy) argument, returns the current ``datetime`` to
    be kept as the starting mark.  Called with such a mark, prints the
    elapsed time as whole minutes plus seconds rounded to two decimals and
    returns ``None``.
    """
    if start_time:
        elapsed = (datetime.now() - start_time).total_seconds()
        tmin, tsec = divmod(elapsed, 60)
        print(' Time taken: %i minutes and %s seconds.' %
              (tmin, round(tsec, 2)))
        return None

    # No starting mark supplied: hand back "now" so the caller can pass it
    # in again later.
    return datetime.now()

def duplicate_columns(frame):
    """Return the labels of columns that duplicate a later column.

    Columns are only compared with other columns of the same dtype.  A
    column is reported when some column after it (within the same dtype
    group) holds exactly the same values, with NaNs in the same position
    counting as equal.  For a set of identical columns every member except
    the last one is therefore reported, so dropping the returned labels
    keeps one representative of each set.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose columns are checked.

    Returns
    -------
    list
        Labels of the redundant columns, listed dtype group by dtype group.
        Empty when the frame has no duplicated columns (or no columns).
    """
    # Bucket column *positions* by dtype.  Working with positions rather
    # than labels means each physical column is examined exactly once, even
    # if a label occurs more than once.
    positions_by_dtype = {}
    for position, dtype in enumerate(frame.dtypes):
        positions_by_dtype.setdefault(dtype, []).append(position)

    labels = frame.columns
    dups = []
    for positions in positions_by_dtype.values():
        # Extract every column once instead of once per comparison.
        columns = [frame.iloc[:, position] for position in positions]
        for i, candidate in enumerate(columns):
            # Series.equals is public pandas API (no reliance on the
            # private ``array_equivalent`` helper) and treats NaNs in the
            # same location as equal.  ``any`` stops at the first match.
            if any(candidate.equals(other) for other in columns[i + 1:]):
                dups.append(labels[positions[i]])

    return dups

print('Starting!')

#################################################################################
####  Numeric data ###
################################
# Load the numeric training table and separate the 'Response' column (the
# labels) from the feature columns.
train_chunks = pd.read_csv("train_numeric.csv", index_col=0)
train_labels = np.array(train_chunks['Response'])
train = train_chunks.drop(['Response'], axis=1)
# Labels and features are now held separately; release the labelled copy
# before more data is loaded.
del train_chunks

test = pd.read_csv("test_numeric.csv", index_col=0)
# Stack train on top of test.
train_test = pd.concat([train, test])
print('Concatenation OK!')
ntrain = train.shape[0]
ntest = test.shape[0]
del test, train


# The first ``ntrain`` rows of the stacked frame are the training rows,
# the remainder are the test rows.
x_train = train_test.iloc[:ntrain, :]
x_test = train_test.iloc[ntrain:, :]

train_ids = x_train.index
test_ids = x_test.index
col_train = x_train.columns
col_test = x_test.columns
train_df = pd.DataFrame(data=x_train, index=train_ids, columns=col_train)
test_df = pd.DataFrame(data=x_test, index=test_ids, columns=col_test)
# Labels are indexed by the training row ids so they line up with train_df.
labels_y = pd.DataFrame(data=train_labels,index=train_ids,columns=['Response'])

print(train_df.shape,test_df.shape,labels_y.shape)

train_df.to_pickle('As_is/train_num_df.pkl')
test_df.to_pickle('As_is/test_num_df.pkl')
labels_y.to_pickle('As_is/labels_y.pkl')

# Release every large frame of this section -- including the stacked frame
# the slices were taken from -- before the next section loads its data.
del train_df, test_df, labels_y, x_train, x_test, train_test
gc.collect()

print('ok!')
#################################################################################
#### date data ###
################################

train = pd.read_csv("train_date.csv", index_col=0)
test = pd.read_csv("test_date.csv", index_col=0)
# Stack train on top of test.
train_test = pd.concat([train, test])
print('Concatenation OK!')

print(train_test.shape)
ntrain = train.shape[0]
ntest = test.shape[0]
# The stacked frame holds all rows from here on; the per-file frames can go.
del test, train

# Column de-duplication is deliberately disabled for the date data because
# it takes a long time.  The output file names below still carry the
# "unduplicated" suffix.
# takes long time!
#dups = duplicate_columns(train_test)
#train_test = train_test.drop(dups, axis=1)
#print('Unduplication OK!')
#print(train_test.shape)

# The first ``ntrain`` rows are the training rows, the remainder the test rows.
x_train = train_test.iloc[:ntrain, :]
x_test = train_test.iloc[ntrain:, :]

train_ids = x_train.index
test_ids = x_test.index
col_train = x_train.columns
col_test = x_test.columns
train_df = pd.DataFrame(data=x_train, index=train_ids, columns=col_train)
test_df = pd.DataFrame(data=x_test, index=test_ids, columns=col_test)

train_df.to_pickle('As_is/train_date_unduplicated.pkl')
test_df.to_pickle('As_is/test_date_unduplicated.pkl')
# train_df and test_df are exactly the two halves of train_test, so the
# stacked frame is written as it is rather than concatenated a second time.
train_test.to_pickle('As_is/train_test_date_unduplicated.pkl')

# Release every large frame of this section before the next one starts.
del train_df, test_df, x_train, x_test, train_test
gc.collect()

#######################################################################
### Now categoricals

train = pd.read_csv("train_categorical_withoutT.csv", index_col=0)
# Every underscore in the column labels is doubled (same for test below).
train.columns = train.columns.str.replace('_','__')
test = pd.read_csv("test_categorical_withoutT.csv", index_col=0)
test.columns = test.columns.str.replace('_','__')
# Stack train on top of test so duplicates are detected over all rows.
train_test = pd.concat([train, test])
print('Concatenation OK!')

print(train_test.shape)
ntrain = train.shape[0]
ntest = test.shape[0]
del test, train


# Drop every column that is an exact copy of a later column of the same dtype.
dups = duplicate_columns(train_test)
train_test = train_test.drop(dups, axis=1)
print('Unduplication OK!')
print(train_test.shape)


# The first ``ntrain`` rows are the training rows, the remainder the test rows.
train = train_test.iloc[:ntrain, :]
test = train_test.iloc[ntrain:, :]
train_ids = train.index
test_ids = test.index
col_train = train.columns
col_test = test.columns
train_df = pd.DataFrame(data=train, index=train_ids, columns=col_train)
test_df = pd.DataFrame(data=test, index=test_ids, columns=col_test)

train_df.to_pickle('As_is/train_cat_df.pkl')
test_df.to_pickle('As_is/test_cat_df.pkl')

# Release the stacked frame and its slices as well, so they are not kept in
# memory while the pickled frames are reloaded and merged afterwards.
del train_df, test_df, train, test, train_test
gc.collect()
print('ok!')
###########################
##Now make train all

# Reload the three per-type training frames written by the sections above.
# pd.read_pickle opens and closes each file itself, so no file handle is
# left open.
train_cat = pd.read_pickle('As_is/train_cat_df.pkl')
train_date = pd.read_pickle('As_is/train_date_unduplicated.pkl')
train_num = pd.read_pickle('As_is/train_num_df.pkl')
# Join the frames side by side (numeric, date, categorical), aligned on the
# row index.
X = pd.concat([train_num, train_date, train_cat], axis=1)
X.to_pickle('As_is/train_all_df.pkl')
print('Shape of the train')
print(X.shape)
del train_date, train_cat, train_num, X
gc.collect()

######################
## Now test all

# Reload the three per-type test frames written by the sections above.
# pd.read_pickle opens and closes each file itself, so no file handle is
# left open.
test_cat = pd.read_pickle('As_is/test_cat_df.pkl')
test_date = pd.read_pickle('As_is/test_date_unduplicated.pkl')
test_num = pd.read_pickle('As_is/test_num_df.pkl')
# Join the frames side by side (numeric, date, categorical), aligned on the
# row index.
X = pd.concat([test_num, test_date, test_cat], axis=1)
X.to_pickle('As_is/test_all_df.pkl') # Whole test
print('Shape of the test')
print(X.shape)
del test_date, test_cat, test_num, X
gc.collect()

print('Finished')