In [74]:
import pandas as pd
import numpy as np
import time
import matplotlib
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import missingno as msno
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import gc
%matplotlib inline

In [2]:
# Load the raw train and test splits with identical parser settings.
train, test = (
    pd.read_csv(csv_path, encoding='utf-8', sep=',')
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
# Stack train on top of test so both splits go through the same preprocessing.
# ignore_index=True gives the combined frame a unique 0..N-1 row index; without
# it the row labels of `train` and `test` repeat, and any later label-aligned
# column assignment would write training-row values into the test rows.
all_df = pd.concat((train, test), axis=0, ignore_index=True)

In [4]:
# Peek at the raw target values before the log1p transform in the next cell.
all_df['target'].head()

0    38000000.0
1      600000.0
2    10000000.0
3     2000000.0
4    14400000.0
Name: target, dtype: float64

In [5]:
# Overwrite the target column with log(1 + target).
# NOTE: this cell is not idempotent - it reads and writes the same column, so
# running it a second time applies log1p to the already-transformed values.
all_df['target'] = np.log1p(all_df['target'])

In [6]:
# Same first rows as before, now on the log1p scale.
all_df['target'].head()

0    17.453097
1    13.304687
2    16.118096
3    14.508658
4    16.482739
Name: target, dtype: float64

In [7]:
# Dtype-only scan: names of every column stored as float64.
float_cols = [
    name
    for name, col_dtype in all_df.dtypes.items()
    if col_dtype == 'float64'
]

In [8]:
# Number of float64 columns found by the dtype-only scan.
len(float_cols)

4992

In [58]:
# Select the feature columns that need scaling: float64 columns (excluding the
# 'ID' and 'target' columns) holding at least one value that is not a whole
# number.
# The fractional-part test is vectorised per column instead of walking every
# value in a Python loop. NaN % 1 is NaN, which compares != 0, so a column
# containing NaN is selected as well - the same outcome as a per-value check.
float_cols = [
    col
    for col in all_df.columns
    if col not in ('ID', 'target')
    and all_df[col].dtype == 'float64'
    and ((all_df[col] % 1) != 0).any()
]

In [60]:
# Compare how many columns were selected against the total column count.
len(float_cols), len(all_df.columns)

(4991, 4993)

In [61]:
# Single module-level scaler instance; scaleColumns() re-fits it on every
# column it processes, so after scaling it only holds the last column's fit.
ss = StandardScaler()

In [68]:
def scaleColumns(df, cols_to_scale, scaler=None):
    """Scale the given columns of ``df`` in place, one column at a time.

    Each column is passed on its own to ``scaler.fit_transform`` and the result
    is written back by row position. Writing a plain array (rather than a new
    DataFrame with a fresh 0..N-1 index) avoids pandas' label alignment on
    assignment. That alignment scrambles or NaN-fills values whenever
    ``df.index`` is not a unique 0..N-1 range, e.g. after concatenating two
    frames without ``ignore_index=True``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated in place.
    cols_to_scale : iterable of str
        Names of the columns to scale.
    scaler : object with ``fit_transform``, optional
        Scaler to use. Defaults to the module-level ``ss`` instance. The scaler
        is re-fitted for every column, so afterwards it only holds the
        parameters of the last column processed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if scaler is None:
        scaler = ss
    for col in cols_to_scale:
        # df[[col]] keeps the 2-D (n_rows, 1) shape that fit_transform expects.
        scaled = scaler.fit_transform(df[[col]])
        # Flatten to 1-D so the assignment is positional, not label-aligned.
        df[col] = np.asarray(scaled).ravel()
    return df

In [69]:
# Scale the selected feature columns. scaleColumns modifies the frame it is
# given and returns that same frame, so `scaled_all_df` and `all_df` are two
# names for one object rather than an independent scaled copy.
scaled_all_df = scaleColumns(all_df, float_cols)

Unnamed: 0,001476ffa,0019109c4,0022de2b3,0024cd760,002d634dc,00302fe51,003da5628,006e72749,007d71f12,007ee91d1,...,ffa6b80e2,ffa903344,ffb34b926,ffca57b7b,ffcec956f,ffd2f9409,ffd50f0bf,ffdc4bcf8,ffec49dae,target
0,-0.056086,-0.032696,-0.025481,-0.048058,-0.041888,-0.036611,-0.052255,-0.025899,-0.029453,-0.032681,...,-0.071927,-0.036566,-0.054832,-0.031014,-0.052007,-0.033871,-0.023249,-0.058426,-0.031970,17.453097
1,-0.056086,-0.032696,-0.025481,-0.048058,-0.041888,-0.036611,-0.052255,-0.025899,-0.029453,-0.032681,...,-0.071927,-0.036566,-0.054832,-0.031014,-0.052007,-0.033871,-0.023249,-0.058426,-0.031970,13.304687
2,-0.056086,-0.032696,-0.025481,-0.048058,-0.041888,-0.036611,-0.052255,-0.025899,-0.029453,-0.032681,...,-0.071927,-0.036566,-0.054832,-0.031014,-0.052007,-0.033871,-0.023249,-0.058426,-0.031970,16.118096
3,-0.056086,-0.032696,-0.025481,-0.048058,-0.041888,-0.036611,-0.052255,-0.025899,-0.029453,-0.032681,...,-0.071927,-0.036566,-0.054832,-0.031014,-0.052007,-0.033871,-0.023249,-0.058426,-0.031970,14.508658
4,-0.056086,-0.032696,-0.025481,-0.048058,-0.041888,-0.036611,-0.052255,-0.025899,-0.029453,-0.032681,...,-0.071927,-0.036566,-0.054832,-0.031014,-0.052007,-0.033871,-0.023249,-0.058426,-0.031970,16.482739
5,0.502765,-0.032696,-0.025481,-0.048058,-0.041888,-0.036611,-0.052255,-0.025899,-0.029453,-0.032681,...,-0.071927,-0.036566,-0.054832,-0.031014,-0.052007,-0.033871,-0.023249,0.568259,-0.026248,14.845130
6,-0.056086,-0.032696,-0.025481,-0.048058,-0.041888,-0.036611,-0.052255,-0.025899,-0.029453,-0.032681,...,-0.071927,-0.036566,-0.054832,-0.031014,-0.052007,-0.033871,-0.023249,-0.058426,-0.031970,12.007628
7,-0.056086,-0.032696,-0.025481,-0.048058,-0.041888,-0.036611,-0.052255,-0.025899,-0.029453,-0.032681,...,-0.071927,-0.036566,-0.054832,-0.031014,-0.052007,-0.033871,-0.023249,-0.058426,-0.031970,13.304687
8,-0.056086,-0.032696,-0.025481,-0.048058,-0.041888,-0.036611,-0.052255,-0.025899,-0.029453,-0.032681,...,-0.071927,-0.036566,-0.054832,-0.031014,-0.052007,-0.033871,-0.023249,-0.058426,-0.031970,13.794288
9,-0.056086,-0.032696,-0.025481,-0.048058,-0.041888,-0.036611,-0.052255,-0.025899,-0.029453,-0.032681,...,-0.071927,-0.036566,-0.054832,-0.031014,-0.052007,-0.033871,-0.023249,-0.058426,-0.031970,13.038984


In [78]:
# Preview the first rows of the scaled frame.
scaled_all_df.head()

Unnamed: 0,001476ffa,0019109c4,0022de2b3,0024cd760,002d634dc,00302fe51,003da5628,006e72749,007d71f12,007ee91d1,...,ffa6b80e2,ffa903344,ffb34b926,ffca57b7b,ffcec956f,ffd2f9409,ffd50f0bf,ffdc4bcf8,ffec49dae,target
0,-0.056086,-0.032696,-0.025481,-0.048058,-0.041888,-0.036611,-0.052255,-0.025899,-0.029453,-0.032681,...,-0.071927,-0.036566,-0.054832,-0.031014,-0.052007,-0.033871,-0.023249,-0.058426,-0.03197,17.453097
1,-0.056086,-0.032696,-0.025481,-0.048058,-0.041888,-0.036611,-0.052255,-0.025899,-0.029453,-0.032681,...,-0.071927,-0.036566,-0.054832,-0.031014,-0.052007,-0.033871,-0.023249,-0.058426,-0.03197,13.304687
2,-0.056086,-0.032696,-0.025481,-0.048058,-0.041888,-0.036611,-0.052255,-0.025899,-0.029453,-0.032681,...,-0.071927,-0.036566,-0.054832,-0.031014,-0.052007,-0.033871,-0.023249,-0.058426,-0.03197,16.118096
3,-0.056086,-0.032696,-0.025481,-0.048058,-0.041888,-0.036611,-0.052255,-0.025899,-0.029453,-0.032681,...,-0.071927,-0.036566,-0.054832,-0.031014,-0.052007,-0.033871,-0.023249,-0.058426,-0.03197,14.508658
4,-0.056086,-0.032696,-0.025481,-0.048058,-0.041888,-0.036611,-0.052255,-0.025899,-0.029453,-0.032681,...,-0.071927,-0.036566,-0.054832,-0.031014,-0.052007,-0.033871,-0.023249,-0.058426,-0.03197,16.482739


In [73]:
# Spot-check a single scaled feature column.
scaled_all_df['0022de2b3'].head()

0   -0.025481
1   -0.025481
2   -0.025481
3   -0.025481
4   -0.025481
Name: 0022de2b3, dtype: float64

In [75]:
# Drop the `all_df` name and run a garbage-collection pass.
# NOTE(review): `scaled_all_df` was produced by scaleColumns(all_df, ...), which
# returns the frame it was given, so the data presumably stays alive through
# that second name and little memory is released here - confirm.
del all_df
gc.collect()

235

In [79]:
# Undo the earlier concat by position: the first len(train) rows are the
# training rows, everything after them is the test rows.
train_df = scaled_all_df.iloc[:len(train)]
test_df = scaled_all_df.iloc[len(train):]
# Drop the combined-frame name and run a garbage-collection pass.
del scaled_all_df
gc.collect()

75

In [81]:
# Persist both scaled splits as comma-separated files without the row index.
for frame, out_path in (
    (train_df, 'data/scaled_train.csv'),
    (test_df, 'data/scaled_test.csv'),
):
    frame.to_csv(out_path, sep=',', index=False)

In [82]:
def rmsle(y, pred):
    """Root mean squared logarithmic error between ``y`` and ``pred``.

    Computes ``sqrt(mean((log1p(y) - log1p(pred)) ** 2))``.

    Both inputs are converted to float NumPy arrays first, so values are
    compared by position. Without that, two pandas Series carrying different
    indexes would be aligned by label when subtracted, producing NaNs (or a
    mean over only the overlapping labels) instead of the intended error.

    Parameters
    ----------
    y : array-like
        True values on the original (non-log) scale.
    pred : array-like
        Predicted values on the original (non-log) scale, broadcastable
        against ``y``.

    Returns
    -------
    numpy.float64
        The RMSLE. It is NaN if any input is below -1 (log1p is undefined
        there) or if the inputs are empty.
    """
    log_true = np.log1p(np.asarray(y, dtype=float))
    log_pred = np.log1p(np.asarray(pred, dtype=float))
    return np.sqrt(np.mean(np.square(log_true - log_pred)))