## Getting the dataframe ready for Machine Learning

## 1. Data Preprocessing and EDA

In [1]:
# Jupyter Notebook with Matplotlib Inline
%matplotlib notebook
# Import required modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import math

In [15]:
import pandas as pd
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

height has been deprecated.



In [2]:
# Read the cleaned datasets
train = pd.read_csv("train.csv", delimiter = ',') 
test = pd.read_csv("test.csv", delimiter = ',')

In [3]:
# Creating a copy of the 'train' dataframe to work on 
z = train.copy(deep = True)
df = train.copy(deep = True)

In [4]:
test.head().transpose()

Unnamed: 0,0,1,2,3,4
age,36,60,45,27,38
job,blue-collar,admin.,blue-collar,admin.,admin.
marital,married,married,married,married,married
education,basic.9y,high.school,basic.4y,high.school,university.degree
default,no,no,no,no,no
housing,unknown,no,no,no,no
personal,unknown,no,no,no,no
contact_type,telephone,telephone,cellular,cellular,cellular
month,jun,may,may,may,aug
day,tue,mon,tue,thu,thu


## 2. Data Preperation

After the EDA, here are the transformations that will be applied to the data:

2.1 Data Transformation

    1. 'job', 'education' and 'month' -- Consolidating these variables on Percentage of positive and negative responses.
    2. 'age' -- Binning the high outliers (> 75) and low outliers (< 23) into a single bin respectively; and bins of 4/5 for values with in 24 and 75.
    3. 'day' -- Replace day with 'weekday_1', 'weekday_2' and 'weekend' categories.
    4. 'duration' -- Create two dataframes (one with 'duration' column another without). This is covered at the end of this section.
    5. 'marital' variable -- Come back to this step while using Machine Leanring for variable or feature importance


2.2 Treating Outliers
    
    1. Applying Upper and Lower bounds to 'duration' and 'employees' variable  <br>
    2. Applying 90 percentiles and 5 percentiles for the lower and upper outliers  <br>
    3. Apply Logarithmic transformations to invalid outliers (Not to the outliers but to all the entries in the numeric columns). <br>

<u>Note:</u> `marital` variable - Come back to this step while using Machine Leanring for variable or feature importance

## 2.1 Data Transformation

### Transformation 1 : 'job', 'education' and 'month' 

Consolidating 'job', 'education' and 'month' variables (Percentage of positive and negative responses)

<img src="Category Consolidation.png">

In [5]:
# Consolidate 'job', 'education' and 'month' variables based on percentage of positive and negative responses.
z['job'].replace(['blue-collar', 'services', 'entrepreneur', 'housemaid', 'self-employed', 'technician', 
                  'management', 'unknown', 'admin.', 'unemployed', 'retired', 'student'],
                   ['j1l4', 'j1l4', 'j1l4', 'j1l3', 'j1l3', 'j1l3', 'j1l3', 'j1l2', 'j1l2', 'j1l2', 'j1l1', 'j1l1'], 
                   inplace=True)

z['education'].replace(['basic.9y','basic.6y','basic.4y','high.school','professional.course','university.degree','unknown','illiterate'],
                      ['e1l4','e1l4','e1l3','e1l3','e1l3','e1l2','e1l2','e1l1'], 
                      inplace=True)

z['month'].replace(['may','jul','nov','aug','jun','apr','oct','sep','dec','mar'],
                      ['m1l3','m1l3','m1l3','m1l3','m1l3','m1l2','m1l1','m1l1','m1l1','m1l1'], 
                      inplace=True)

### Transformation 2 :  'age' Variable

Given the data is highly imbalanced, 'age' is categorised into bins based using 'qcut' rather than 'cut'

In [17]:
age_groups = ['young_adult', 'adult', 'senior']

In [19]:
z['age_group'] = pd.qcut(df['age'], 3, labels = age_groups)

In [20]:
pd.crosstab(z.age_group, df.age).transpose()

age_group,adult,senior,young_adult
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17,0,0,2
18,0,0,22
19,0,0,36
20,0,0,54
21,0,0,90
22,0,0,109
23,0,0,173
24,0,0,359
25,0,0,482
26,0,0,550


In [21]:
z['age_group'].value_counts()

young_adult    11809
adult          10761
senior         10380
Name: age_group, dtype: int64

In [None]:
----------------------------------------------------------------------------------------------------------------------------

<div class="alert alert-warning">
Binning the high outliers (> 75) and low outliers (< 23) into a single bin respectively; and bins of 4/5 for values with in 24 and 75

# Single bin[0-20]....bins of 4 [24-60]....bins of 5 [60-75]....single bin [75-100]  <br>
def final_test(a): <br>
    if(a>0 and a<=24):<br>
        return "Cat 1"<br>
    elif (a>24 and a<=31):<br>
        return "Cat 2"<br>
    elif (a>31 and a<=35):<br>
        return "Cat 3"<br>
    elif (a>35 and a<=41):<br>
        return "Cat 4"<br><br>
    elif (a>41 and a<=49):<br>
        return "Cat 5"<br>
    elif (a>49 and a<=60):<br>
        return "Cat 6"<br>
    elif (a>60 and a<=100):<br>
        return "Cat 7"<br>
        <br>
z['age_cat'] = z.apply(lambda row: final_test(row['age']), axis=1)<br>
df['age_cat'] = df.apply(lambda row: final_test(row['age']), axis=1)<br>
# df_duration_bounds['age_cat'] = df_duration_bounds.apply(lambda row: final_test(row['age']), axis=1) <br>
# df_employees_bounds['age_cat'] = df_duration_bounds.apply(lambda row: final_test(row['age']), axis=1) <br>

</div>

In [None]:
----------------------------------------------------------------------------------------------------------------------------

### Transformation 3: Merging 'marital' and 'age'

In [34]:
pd.crosstab(z.marital, z.age)

age,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,91,92,94,95,98
marital,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1
divorced,0,0,0,0,0,0,0,3,11,9,13,15,51,73,121,102,106,109,141,115,134,135,119,113,143,135,88,120,155,121,123,112,120,101,105,119,107,98,98,107,96,78,72,35,5,3,4,4,3,7,5,5,3,9,2,7,6,9,6,3,7,8,0,10,8,6,4,3,3,2,1,13,1,0,1,0,1,0
married,0,0,0,1,7,13,25,67,121,153,209,296,437,604,722,745,852,828,865,878,722,708,784,625,683,648,630,556,626,587,534,611,502,539,478,465,442,409,382,464,379,359,290,171,54,45,39,44,27,42,17,18,22,31,35,18,21,17,14,25,11,14,11,14,7,7,9,2,9,5,0,3,0,1,2,1,0,2
single,2,22,36,53,83,96,148,289,349,388,466,488,671,681,730,643,525,418,418,384,315,290,231,182,180,135,129,143,98,87,83,74,60,55,34,43,35,29,32,18,45,18,14,9,3,1,0,0,1,4,0,4,0,0,4,2,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
unknown,0,0,0,0,0,0,0,0,1,0,1,0,3,4,6,5,3,5,3,1,4,2,0,1,1,0,0,0,2,0,0,0,3,3,0,2,1,1,2,1,2,2,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [62]:
z['age_marital'] = z.apply(lambda x: x['age_group'] + ' & ' + x['marital'], axis = 1)

In [64]:
z['age_marital'].value_counts()

senior & married          7805
adult & married           7099
young_adult & single      6088
young_adult & married     5080
adult & single            2407
senior & divorced         1796
adult & divorced          1243
senior & single            757
young_adult & divorced     613
young_adult & unknown       28
senior & unknown            22
adult & unknown             12
Name: age_marital, dtype: int64

In [65]:
z.head().transpose()

Unnamed: 0,0,1,2,3,4
age,41,55,55,36,32
job,admin.,management,admin.,admin.,entrepreneur
marital,single,married,married,divorced,single
education,professional.course,basic.9y,high.school,university.degree,university.degree
default,no,no,no,no,no
housing,yes,yes,yes,no,yes
personal,yes,no,no,yes,no
contact_type,cellular,cellular,cellular,cellular,telephone
month,apr,jul,apr,nov,may
day,fri,wed,wed,thu,tue


### Transformation 4 - 'day' variable 

Replace day with 'weekday_1', 'weekday_2' and 'weekend' categories.

In [67]:
# Replaced day with 'weekday_1', 'weekday_2' and 'weekend' categories.
for dataframe in (z, df):
    dataframe['day_cat'] = dataframe['day'].copy(deep=True)
    dataframe['day_cat'].replace(['sun', 'sat', 'mon', 'tue', 'wed', 'thu', 'fri'],
                      ['weekend', 'weekend', 'weekday_1', 'weekday_1', 'weekday_1', 'weekday_2', 'weekday_2'], 
                      inplace=True)

### Transformation 5 - 'duration' Variable
<u>Important note:</u> this attribute highly affects the output target (e.g., if duration=0 then y='no'). 

Yet, the duration is not known before a call is performed. 
Also, after the end of the call y is obviously known. 
Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

 <u>Notes</u>: del df['column_name']
 
 Advantage of drop over del is that drop allows you to drop multiple columns at once, 
 perform the operation inplace or not, and also delete records along any axis (especially useful for a 3-D matrix

<center> <b> More on this at the end of this section </center>

### Transformation 6 - 'Marital' variable

All classes in this variable are similarly distributed.

<u>Note:</u>  Come back to this step while using Machine Leanring for variable or feature importance

In [70]:
# All categories are almost similarly distributed (83%-90% for NOs)............
pd.crosstab(z.marital, z.y, normalize='index')

y,no,yes
marital,Unnamed: 1_level_1,Unnamed: 2_level_1
divorced,0.893209,0.106791
married,0.897718,0.102282
single,0.859706,0.140294
unknown,0.83871,0.16129


## 2.2 Treating Outliers 

1. Replace valid outliers with logarithmic transformation
2. Replace invalid outliers (human-error) with 90th percentile or upper bounds OR exclude the record from the dataframe. 

IQR - https://www.youtube.com/watch?v=dNHGVLXBTgI

#### (a) Applying Upper and Lower bounds to 'duration' and 'employees' variable 

In [71]:
# Upper and Lower bounds for 'duration' column
z['duration'] = z['duration'].apply(lambda x: int(math.floor(x / 10.0)) * 10 if(x%10<5) else int(math.ceil(x / 10.0)) * 10 )
z['employees'] = z['employees'].apply(lambda x: int(math.floor(x / 10.0)) * 10 if(x%10<5) else int(math.ceil(x / 10.0)) * 10 ) 

#### (b) Applying 90 percentiles and 10 percentiles for the lower and upper outliers 

In [72]:
uq = 0.95
lq = 0.05

In [73]:
colz = ['duration', 'dcontacts', 'pdays', 'evr', 'cpi', 'cci', 'euribor', 'employees']

In [74]:
for col in colz:
    z[col] = z[col].clip_upper(int(z[col].quantile(uq)))
    z[col] = z[col].clip_lower(int(z[col].quantile(lq)))

#### (c) Apply Logarithmic transformations (Not just to the outliers but to all the samples (observations) in the numeric columns). 

Creating a new dataframe to apply logarithm transformations.
From all the numerical columns, logarithmic transformations can be applied to only a few since others have '0' and negative values

Excluded

1. cci - (negative)
2. pcontacts - (zero)
3. evr (negative)

Included but not sure
    
1. duration - 0
2. pdays - 0
3. pcontacts - 0
4. pdays - (value 999 - means client was not contacted previously)


In [206]:
# z.astype(bool).sum(axis=0)      # Count of zeros in a columns
# z[z<0].count()                  # Count of negative values in each column

In [75]:
# num = ['age','dcontacts','cpi','euribor','employees','duration_outliers','dcontacts_outliers','pdays_outliers','euribor_outliers','employees_outliers']
num = ['age','dcontacts', 'cpi', 'euribor','employees']

In [76]:
z_log = z.copy(deep=True)
for n in num:
    z_log[n] = np.log(z_log[n])

In [77]:
z_log.head().transpose()

Unnamed: 0,0,1,2,3,4
age,3.71357,4.00733,4.00733,3.58352,3.46574
job,admin.,management,admin.,admin.,entrepreneur
marital,single,married,married,divorced,single
education,professional.course,basic.9y,high.school,university.degree,university.degree
default,no,no,no,no,no
housing,yes,yes,yes,no,yes
personal,yes,no,no,yes,no
contact_type,cellular,cellular,cellular,cellular,telephone
month,apr,jul,apr,nov,may
day,fri,wed,wed,thu,tue


## 3. Data Rescaling

Preprocessed data may contain attributes with a mixtures of scales for various quantities such as dollars, kilograms 
and sales volume.
Many machine learning methods expect or are more effective if the data attributes have the same scale. 

Two popular data scaling methods are normalization and standardization.

1. Data Normalization
2. Data Standardization

** Normalization **:
It refers to rescaling real valued numeric attributes into the range 0 and 1.
It is useful to scale the input attributes for a model that relies on the magnitude of values, 
such as distance measures used in k-nearest neighbors and in the preparation of coefficients in regression. <br>

<u>Advantages</u>: ML algorithms such as Linear Regression and SVM perform faster on normalized data. 

** Standardization **:
Standardization refers to shifting the distribution of each attribute to have a mean of zero and a 
standard deviation of one (unit variance).
It is useful to standardize attributes for a model that relies on the distribution of attributes such as Gaussian processes.

**Which Method To Use**:
It is hard to know whether rescaling your data will improve the performance of your algorithms before you apply them. 
If often can, but not always.

A good tip is to create rescaled copies of your dataset and race them against each other using your test harness 
and a handful of algorithms you want to spot check. This can quickly highlight the benefits (or lack there of) of 
rescaling your data with given models, and which rescaling method may be worthy of further investigation.

### Normalizing and Standardizing the data

In [78]:
z_normalized = z.copy(deep = True)   # All changes applied
z_standardized = z.copy(deep = True) # All changes applied

df_duration_yes = z.copy(deep=True)
df_duration_no = z.copy(deep=True)

df_duration_yes_normalized = df_duration_yes.copy(deep = True)
df_duration_yes_standardized = df_duration_yes.copy(deep = True)

df_duration_no_normalized = df_duration_no.copy(deep = True)
df_duration_no_standardized = df_duration_no.copy(deep = True)

z_log_normalized = z_log.copy(deep=True)
z_log_standardized = z_log.copy(deep=True)

In [79]:
z_log.head(1).transpose()

Unnamed: 0,0
age,3.71357
job,admin.
marital,single
education,professional.course
default,no
housing,yes
personal,yes
contact_type,cellular
month,apr
day,fri


In [80]:
numerical = ['age','duration','dcontacts','pdays','pcontacts','evr','cpi','cci','euribor','employees']

In [81]:
for dataframe in (z_normalized, df_duration_yes_normalized, df_duration_no_normalized, z_log_normalized):
    for n in numerical:
        col = dataframe[[n]].values.astype(float)
        col_transformed = (preprocessing.MinMaxScaler()).fit_transform(col)
        dataframe[n+'_normalized'] = pd.DataFrame(col_transformed)

for dataframe in (z_standardized, df_duration_yes_standardized, df_duration_no_standardized, z_log_standardized):
    for n in numerical:
        col = dataframe[[n]].values.astype(float)
        col_transformed = (preprocessing.StandardScaler()).fit_transform(col)
        dataframe[n+'_standardized'] = pd.DataFrame(col_transformed)

Delete 'duration' column

In [None]:
# df_duration_no_standardized.drop('duration',axis=1, inplace=True) #axis = 1 deletes column; axis = 0 delets rows
# df_duration_no_normalized.drop('duration',axis=1, inplace=True)
# df_duration_no.drop('duration',axis=1, inplace=True)

In [82]:
df_duration_no_normalized.drop('duration',axis=1, inplace=True)
df_duration_no_standardized.drop('duration',axis=1, inplace=True)

## 4. Data Resampling (Upsample and Downsample)

In [83]:
from sklearn.utils import resample

In [84]:
z.head().transpose()

Unnamed: 0,0,1,2,3,4
age,41,55,55,36,32
job,admin.,management,admin.,admin.,entrepreneur
marital,single,married,married,divorced,single
education,professional.course,basic.9y,high.school,university.degree,university.degree
default,no,no,no,no,no
housing,yes,yes,yes,no,yes
personal,yes,no,no,yes,no
contact_type,cellular,cellular,cellular,cellular,telephone
month,apr,jul,apr,nov,may
day,fri,wed,wed,thu,tue


In [88]:
# Upsampling Data - z_upsample
major_class = z[z.y == 'no']
minor_class = z[z.y == 'yes']
    
z_minor_upsample = resample(minor_class, replace = True, n_samples = len(major_class), random_state = 42)
z_upsample = pd.concat([major_class, z_minor_upsample])
    
print(z_upsample.y.value_counts())

no     29208
yes    29208
Name: y, dtype: int64


In [37]:
z_resample = train.copy(deep = True)

In [39]:
z_resample_major = z_resample[z_resample.y == 'no']
z_resample_minor = z_resample[z_resample.y == 'yes']

In [40]:
print(len(z_resample_major), len(z_resample_minor))

29208 3742


In [41]:
z_minor_upsampled = resample(z_resample_minor, 
                                 replace = True,     # sample with replacement
                                 n_samples = len(z_resample_major),    # to match majority class
                                 random_state = 42) # reproducible results

In [42]:
len(z_minor_upsampled)

29208

In [43]:
z_upsampled = pd.concat([z_resample_major, z_minor_upsampled])

In [44]:
z_upsampled.y.value_counts()

yes    29208
no     29208
Name: y, dtype: int64

In [91]:
# Downsampling Data - z_downsample
major_class = z[z.y == 'no']
minor_class = z[z.y == 'yes']
    
z_major_downsample = resample(major_class, replace = False, n_samples = len(minor_class), random_state = 42)
z_downsample = pd.concat([z_major_downsample, minor_class])

print(z_downsample.y.value_counts())

yes    3742
no     3742
Name: y, dtype: int64


# 5. Dummy Variables

In [96]:
z_age_marital = z.copy(deep = True)

In [113]:
z = z.drop(['age', 'marital', 'age_group', 'day'], 1)
z_age_martial = z_age_marital.drop(['age', 'marital', 'day', 'age_group'], 1)
z_upsample = z_upsample.drop(['age', 'marital', 'day', 'age_group'], 1)
z_downsample = z_downsample.drop(['age', 'marital', 'day', 'age_group'], 1)

In [118]:
all_df = [z, z_upsample, z_downsample, z_age_martial]

In [119]:
# Verifying if the dataframes have any 'categorical' columns
for dataframe in all_df:
    print(dataframe.select_dtypes(include=['O']).columns.values)

['job' 'education' 'default' 'housing' 'personal' 'contact_type' 'month'
 'poutcome' 'y' 'age_marital' 'day_cat']
['job' 'education' 'default' 'housing' 'personal' 'contact_type' 'month'
 'poutcome' 'y' 'age_marital' 'day_cat']
['job' 'education' 'default' 'housing' 'personal' 'contact_type' 'month'
 'poutcome' 'y' 'age_marital' 'day_cat']
['job' 'education' 'default' 'housing' 'personal' 'contact_type' 'month'
 'poutcome' 'y' 'age_marital' 'day_cat']


In [125]:
categorical_fields = ['job', 'education', 'default', 'housing', 'personal', 'contact_type',
 'month', 'poutcome', 'age_marital', 'day_cat']

In [126]:
all_df = [pd.get_dummies(df, columns = categorical_fields) for df in all_df]

In [127]:
# Verifying if the dataframes have any 'categorical' columns
for dataframe in all_df:
    print(dataframe.select_dtypes(include=['O']).columns.values)

['y']
['y']
['y']
['y']


## 6. Machine Learning

In [45]:
train.head()

Unnamed: 0,age,job,marital,education,default,housing,personal,contact_type,month,day,...,dcontacts,pdays,pcontacts,poutcome,evr,cpi,cci,euribor,employees,y
0,41,admin.,single,professional.course,no,yes,yes,cellular,apr,fri,...,1,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,no
1,55,management,married,basic.9y,no,yes,no,cellular,jul,wed,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1,no
2,55,admin.,married,high.school,no,yes,no,cellular,apr,wed,...,3,999,1,failure,-1.8,93.075,-47.1,1.415,5099.1,no
3,36,admin.,divorced,university.degree,no,no,yes,cellular,nov,thu,...,2,999,0,nonexistent,-0.1,93.2,-42.0,4.076,5195.8,no
4,32,entrepreneur,single,university.degree,no,yes,no,telephone,may,tue,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score

In [63]:
df.head()

Unnamed: 0,age,duration,dcontacts,pdays,pcontacts,evr,cpi,cci,euribor,employees,...,y_yes,age_cat_Cat 1,age_cat_Cat 2,age_cat_Cat 3,age_cat_Cat 4,age_cat_Cat 5,age_cat_Cat 6,age_cat_Cat 7,day_cat_weekday_1,day_cat_weekday_2
0,41,41,1,999,0,-1.8,93.075,-47.1,1.405,5099.1,...,0,0,0,0,1,0,0,0,0,1
1,55,421,2,999,0,1.4,93.918,-42.7,4.963,5228.1,...,0,0,0,0,0,0,1,0,1,0
2,55,140,3,999,1,-1.8,93.075,-47.1,1.415,5099.1,...,0,0,0,0,0,0,1,0,1,0
3,36,56,2,999,0,-0.1,93.2,-42.0,4.076,5195.8,...,0,0,0,0,1,0,0,0,0,1
4,32,242,2,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,0,0,1,0,0,0,0,1,0


In [None]:
X_train = df.drop('y', 1)
y_train = df[['y']]
X_test = df.drop('y', 1)
y_test = df[['y']]