In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from sklearn.preprocessing import OneHotEncoder
from sklearn.cross_validation import train_test_split

# Step 1: Data Cleaning

After collecting the data from the website, we need to clean it before fitting any models. In the original data, each row holds one person's information together with his/her evaluation of another person and of himself/herself. We first combine the personal information of a person *A* and a person *B*. Then we clean the features that have either empty or inconsistent entries. Finally, we create some new features for model fitting.

We first load the Speed Dating data and drop every row that has NaN in the 'piid' column.

In [2]:
# Read the raw export and keep only rows whose partner id ('piid') is present.
df = pd.read_csv("SpeedDating.csv").dropna(subset=['piid'])
df.shape

(8289, 82)

In [37]:
# 'piid' is cast to integers so it can be matched against 'iid' when merging.
df['piid'] = df['piid'].astype(int)

In the "from" column, convert the lower-case entry "india" to "India", so that both spellings are treated as the same country.

In [38]:
# Boolean-mask form: rewrite every lower-case 'india' entry as 'India'.
df.loc[df['from'] == 'india', 'from'] = 'India'

Delete the columns that are not needed for modelling.

In [39]:
# Remove, in place, every column that is not used in the rest of the notebook.
for _col in ('id', 'idg', 'condtn', 'wave', 'pid', 'age_o', 'race_o',
             'pf_o_att', 'pf_o_sin', 'pf_o_int', 'pf_o_fun', 'pf_o_amb',
             'pf_o_sha', 'dec', 'met', 'satis_2', 'length', 'numdat_2',
             'mn_sat', 'tuition', 'round', 'int_corr', 'income', 'exphappy',
             'samerace'):
    del df[_col]

In [40]:
# Number of columns left after the drops, followed by a preview of the frame.
# The parenthesized call prints the same text on Python 2 and also parses on Python 3.
print(len(df.columns))
df.head()

57


Unnamed: 0,iid,gender,piid,match,age,field_cd,race,imprace,imprelig,from,goal,date,go_out,career_c,sports,tvsports,exercise,dining,museums,art,hiking,gaming,clubbing,reading,tv,theater,movies,concerts,music,shopping,yoga,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr2_1,sinc2_1,intel2_1,fun2_1,amb2_1,shar2_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1,attr,sinc,intel,fun,amb,shar,like,prob,undergra
0,2,0,11,0,24,1,2,2,5,AL,1,5,1,1,3,2,7,10,8,6,3,5,8,10,1,9,8,7,8,3,1,45,5,25,20,0,5,65,0,10,25,0,0,7,5,10,8,3,5,7,8,4,6,3,6,4,
1,2,0,12,0,24,1,2,2,5,AL,1,5,1,1,3,2,7,10,8,6,3,5,8,10,1,9,8,7,8,3,1,45,5,25,20,0,5,65,0,10,25,0,0,7,5,10,8,3,8,5,6,6,9,6,7,3,
2,2,0,13,0,24,1,2,2,5,AL,1,5,1,1,3,2,7,10,8,6,3,5,8,10,1,9,8,7,8,3,1,45,5,25,20,0,5,65,0,10,25,0,0,7,5,10,8,3,5,8,9,6,3,4,6,7,
3,2,0,14,1,24,1,2,2,5,AL,1,5,1,1,3,2,7,10,8,6,3,5,8,10,1,9,8,7,8,3,1,45,5,25,20,0,5,65,0,10,25,0,0,7,5,10,8,3,7,9,7,6,5,7,7,8,
4,2,0,15,0,24,1,2,2,5,AL,1,5,1,1,3,2,7,10,8,6,3,5,8,10,1,9,8,7,8,3,1,45,5,25,20,0,5,65,0,10,25,0,0,7,5,10,8,3,6,8,7,9,7,8,8,6,


Convert the columns 'field_cd','race','career_c' to binary indicators

In [41]:
# One-hot encode the three categorical code columns in a single fit/transform pass.
dftest = df[['field_cd', 'race', 'career_c']]
enc = OneHotEncoder()
new_cols_data = enc.fit_transform(dftest).toarray()

In [42]:
# Hand-written labels for the one-hot columns, listed block by block in the same
# order the columns were passed to the encoder: field_cd, then race, then career_c.
# NOTE(review): within each block the labels presumably follow ascending code
# value -- confirm against the survey codebook.
new_cols_f=['f1law', 'f2math', 'f3social_sci', 'f4medical_sci', 'f5engineering', 'f6english', 'f7hist', 'f8business', 'f9edu', 'f10bio_sci', 'f11social_work', 'f12undergrad', 'f13poli_sci', 'f14film', 'f15arts', 'f16languages', 'f17architecture', 'f18other']
new_cols_r=['r1black', 'r2european', 'r3latino', 'r4asian', 'r6other']
new_cols_c=['c1lawyer', 'c2academic', 'c3psycho', 'c4doctor', 'c5engineer', 'c6entertainment', 'c7banking', 'c8real_estate', 'c9inter_affairs', 'c10undeci', 'c11social_work','c12speech', 'c13politics', 'c14sports', 'c15other', 'c16journalism', 'c17architecture']
new_cols=new_cols_f+new_cols_r+new_cols_c

# Labels are matched to encoder output purely by position, so a width mismatch
# would silently drop or mislabel indicator columns. Fail loudly instead.
assert new_cols_data.shape[1] == len(new_cols), (
    "one-hot output has %d columns but %d labels were provided"
    % (new_cols_data.shape[1], len(new_cols)))

for i, col_name in enumerate(new_cols):
    df[col_name] = new_cols_data[:, i]

Convert 'goal' to a binary indicator 'goal_s': "1" when the goal code is 3 or 4 ("serious purpose") and "0" for every other code ("for fun").

In [43]:
# 1 where the stated goal code is 3 or 4, 0 for every other value (including missing).
Y = np.where(df['goal'].isin([3, 4]), 1, 0)
df['goal_s'] = Y

In [44]:
# Preview the first five rows of df, now including the indicator columns and 'goal_s'.
df.head(5)

Unnamed: 0,iid,gender,piid,match,age,field_cd,race,imprace,imprelig,from,goal,date,go_out,career_c,sports,tvsports,exercise,dining,museums,art,hiking,gaming,clubbing,reading,tv,theater,movies,concerts,music,shopping,yoga,attr1_1,sinc1_1,intel1_1,fun1_1,amb1_1,shar1_1,attr2_1,sinc2_1,intel2_1,fun2_1,amb2_1,shar2_1,attr3_1,sinc3_1,fun3_1,intel3_1,amb3_1,attr,sinc,intel,fun,amb,shar,like,prob,undergra,f1law,f2math,f3social_sci,f4medical_sci,f5engineering,f6english,f7hist,f8business,f9edu,f10bio_sci,f11social_work,f12undergrad,f13poli_sci,f14film,f15arts,f16languages,f17architecture,f18other,r1black,r2european,r3latino,r4asian,r6other,c1lawyer,c2academic,c3psycho,c4doctor,c5engineer,c6entertainment,c7banking,c8real_estate,c9inter_affairs,c10undeci,c11social_work,c12speech,c13politics,c14sports,c15other,c16journalism,c17architecture,goal_s
0,2,0,11,0,24,1,2,2,5,AL,1,5,1,1,3,2,7,10,8,6,3,5,8,10,1,9,8,7,8,3,1,45,5,25,20,0,5,65,0,10,25,0,0,7,5,10,8,3,5,7,8,4,6,3,6,4,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,12,0,24,1,2,2,5,AL,1,5,1,1,3,2,7,10,8,6,3,5,8,10,1,9,8,7,8,3,1,45,5,25,20,0,5,65,0,10,25,0,0,7,5,10,8,3,8,5,6,6,9,6,7,3,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,0,13,0,24,1,2,2,5,AL,1,5,1,1,3,2,7,10,8,6,3,5,8,10,1,9,8,7,8,3,1,45,5,25,20,0,5,65,0,10,25,0,0,7,5,10,8,3,5,8,9,6,3,4,6,7,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2,0,14,1,24,1,2,2,5,AL,1,5,1,1,3,2,7,10,8,6,3,5,8,10,1,9,8,7,8,3,1,45,5,25,20,0,5,65,0,10,25,0,0,7,5,10,8,3,7,9,7,6,5,7,7,8,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2,0,15,0,24,1,2,2,5,AL,1,5,1,1,3,2,7,10,8,6,3,5,8,10,1,9,8,7,8,3,1,45,5,25,20,0,5,65,0,10,25,0,0,7,5,10,8,3,6,8,7,9,7,8,8,6,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [45]:
# (rows, columns) of df at this point in the notebook.
df.shape

(8289, 98)

First separate the dataset into two datasets, wdf and mdf.

wdf includes all the rows whose participant ('iid') is a woman ('gender == 0').

mdf includes all the rows whose participant ('iid') is a man ('gender == 1').

Rename 'iid' and 'piid' with respect to gender.

In [46]:
# Split rows by the participant's gender and rename the id columns so that
# 'id_w' always refers to the woman and 'id_m' to the man in both frames.
mdf = df.loc[df['gender'] == 1].rename(columns={'iid': 'id_m', 'piid': 'id_w'})
wdf = df.loc[df['gender'] == 0].rename(columns={'iid': 'id_w', 'piid': 'id_m'})

Merge wdf and mdf on 'id_m' and 'id_w' so that each row contains the information of both the participant and the partner.

After merging, every column that describes the participant ('iid') ends with '_x', while every column that describes the partner ('piid') ends with '_y'.

In [47]:
# Pair every (man, woman) meeting twice: once with the man as the participant
# (df1, his columns get the '_x' suffix) and once with the woman as the
# participant (df2, her columns get the '_x' suffix).
df1 = pd.merge(mdf, wdf, how='inner', on=['id_m', 'id_w'])
df1 = df1.rename(columns={'id_m': 'iid', 'id_w': 'piid'})
df2 = pd.merge(wdf, mdf, how='inner', on=['id_m', 'id_w'])
df2 = df2.rename(columns={'id_w': 'iid', 'id_m': 'piid'})

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the same rows
# in the same order on every pandas version. The original row labels are kept,
# so labels repeat across the two halves, exactly as with `append`.
df_new = pd.concat([df1, df2])

Since we will make predictions with respect to 'iid', we delete the partner-side ('piid') columns 'gender_y', 'match_y', 'like_y' and 'prob_y'.

In [48]:
# Remove the partner-side copies of these four columns in place.
for _col in ('gender_y', 'match_y', 'like_y', 'prob_y'):
    del df_new[_col]

Separate the data into training and test sets

In [53]:
# Split row *positions* 70/30 into train and test.
# - np.arange replaces the Python 2-only `xrange`.
# - A fixed random_state makes the split (and everything fit on it) reproducible
#   across kernel restarts.
itrain, itest = train_test_split(np.arange(df_new.shape[0]), train_size=0.7, random_state=42)

# Boolean mask over row positions: True = training row, False = test row.
mask = np.ones(df_new.shape[0], dtype=bool)
mask[itest] = False

In [54]:
# Shapes of the training and test partitions. The parenthesized call prints the
# same text on Python 2 and also parses on Python 3.
print(df_new[mask].shape)
print(df_new[~mask].shape)

(5094, 190)
(2184, 190)


Standardize all numerical variables (the scaler is fit on the training rows only and then applied to every row)

In [55]:
# Numeric per-person columns that exist once for the participant ('_x') and
# once for the partner ('_y').
_PER_PERSON_NUMERIC = [
    'age', 'imprace', 'imprelig', 'date', 'go_out',
    'sports', 'tvsports', 'exercise', 'dining', 'museums', 'art', 'hiking',
    'gaming', 'clubbing', 'reading', 'tv', 'theater', 'movies', 'concerts',
    'music', 'shopping', 'yoga',
    'attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1',
    'attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1', 'amb2_1', 'shar2_1',
    'attr3_1', 'sinc3_1', 'fun3_1', 'intel3_1', 'amb3_1',
    'attr', 'sinc', 'intel', 'fun', 'amb', 'shar',
]

# All '_x' columns, then all '_y' columns, then the two participant-only ratings.
STANDARDIZABLE = (
    [_name + '_x' for _name in _PER_PERSON_NUMERIC]
    + [_name + '_y' for _name in _PER_PERSON_NUMERIC]
    + ['like_x', 'prob_x']
)

In [56]:
# Number of column names listed in STANDARDIZABLE.
len(STANDARDIZABLE)

92

In [57]:
from sklearn import preprocessing

# Learn the per-column mean/std from the training rows only, then apply that
# same transformation to every row (train and test alike).
scaler = preprocessing.StandardScaler()
scaler.fit(df_new[STANDARDIZABLE][mask])
df_new[STANDARDIZABLE] = scaler.transform(df_new[STANDARDIZABLE])

In [58]:
# Preview the first rows of df_new after the STANDARDIZABLE columns were rescaled.
df_new.head()

Unnamed: 0,iid,gender_x,piid,match_x,age_x,field_cd_x,race_x,imprace_x,imprelig_x,from_x,goal_x,date_x,go_out_x,career_c_x,sports_x,tvsports_x,exercise_x,dining_x,museums_x,art_x,hiking_x,gaming_x,clubbing_x,reading_x,tv_x,theater_x,movies_x,concerts_x,music_x,shopping_x,yoga_x,attr1_1_x,sinc1_1_x,intel1_1_x,fun1_1_x,amb1_1_x,shar1_1_x,attr2_1_x,sinc2_1_x,intel2_1_x,fun2_1_x,amb2_1_x,shar2_1_x,attr3_1_x,sinc3_1_x,fun3_1_x,intel3_1_x,amb3_1_x,attr_x,sinc_x,...,intel3_1_y,amb3_1_y,attr_y,sinc_y,intel_y,fun_y,amb_y,shar_y,undergra_y,f1law_y,f2math_y,f3social_sci_y,f4medical_sci_y,f5engineering_y,f6english_y,f7hist_y,f8business_y,f9edu_y,f10bio_sci_y,f11social_work_y,f12undergrad_y,f13poli_sci_y,f14film_y,f15arts_y,f16languages_y,f17architecture_y,f18other_y,r1black_y,r2european_y,r3latino_y,r4asian_y,r6other_y,c1lawyer_y,c2academic_y,c3psycho_y,c4doctor_y,c5engineer_y,c6entertainment_y,c7banking_y,c8real_estate_y,c9inter_affairs_y,c10undeci_y,c11social_work_y,c12speech_y,c13politics_y,c14sports_y,c15other_y,c16journalism_y,c17architecture_y,goal_s_y
0,11,1,1,0,0.16801,8,2,1.112494,-0.246625,Argentina,1,0.000811,1.628324,2,0.618207,0.868264,-1.753154,-0.992693,0.000579,-0.770313,-0.281336,0.397651,-0.69091,0.65867,-1.305903,-1.276804,0.04145,0.051963,0.081866,-0.244435,-1.236412,0.956555,0.385355,-0.041372,0.411631,-1.734237,-1.048468,-0.333824,-1.183653,0.895831,0.276668,1.919426,-1.093664,0.661365,0.506465,-0.454609,-0.37245,-1.479011,-0.086418,0.482822,...,-0.364839,-0.334812,-0.084502,1.07874,-0.225632,0.339016,-0.462392,-0.191561,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,11,1,2,0,0.16801,8,2,1.112494,-0.246625,Argentina,1,0.000811,1.628324,2,0.618207,0.868264,-1.753154,-0.992693,0.000579,-0.770313,-0.281336,0.397651,-0.69091,0.65867,-1.305903,-1.276804,0.04145,0.051963,0.081866,-0.244435,-1.236412,0.956555,0.385355,-0.041372,0.411631,-1.734237,-1.048468,-0.333824,-1.183653,0.895831,0.276668,1.919426,-1.093664,0.661365,0.506465,-0.454609,-0.37245,-1.479011,0.953656,-0.102563,...,-0.364839,-2.588942,-0.601874,-0.100403,0.428162,-1.230936,-0.462392,-1.194969,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,11,1,3,0,0.16801,8,2,1.112494,-0.246625,Argentina,1,0.000811,1.628324,2,0.618207,0.868264,-1.753154,-0.992693,0.000579,-0.770313,-0.281336,0.397651,-0.69091,0.65867,-1.305903,-1.276804,0.04145,0.051963,0.081866,-0.244435,-1.236412,0.956555,0.385355,-0.041372,0.411631,-1.734237,-1.048468,-0.333824,-1.183653,0.895831,0.276668,1.919426,-1.093664,0.661365,0.506465,-0.454609,-0.37245,-1.479011,0.433619,0.482822,...,0.57593,0.228721,0.43287,1.07874,1.735749,0.339016,0.705684,1.815253,,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,11,1,4,0,0.16801,8,2,1.112494,-0.246625,Argentina,1,0.000811,1.628324,2,0.618207,0.868264,-1.753154,-0.992693,0.000579,-0.770313,-0.281336,0.397651,-0.69091,0.65867,-1.305903,-1.276804,0.04145,0.051963,0.081866,-0.244435,-1.236412,0.956555,0.385355,-0.041372,0.411631,-1.734237,-1.048468,-0.333824,-1.183653,0.895831,0.276668,1.919426,-1.093664,0.661365,0.506465,-0.454609,-0.37245,-1.479011,-0.086418,-0.102563,...,-1.305608,0.228721,-1.119247,1.668311,0.428162,-0.707618,0.705684,0.811846,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,11,1,5,0,0.16801,8,2,1.112494,-0.246625,Argentina,1,0.000811,1.628324,2,0.618207,0.868264,-1.753154,-0.992693,0.000579,-0.770313,-0.281336,0.397651,-0.69091,0.65867,-1.305903,-1.276804,0.04145,0.051963,0.081866,-0.244435,-1.236412,0.956555,0.385355,-0.041372,0.411631,-1.734237,-1.048468,-0.333824,-1.183653,0.895831,0.276668,1.919426,-1.093664,0.661365,0.506465,-0.454609,-0.37245,-1.479011,-0.086418,0.482822,...,1.516699,0.228721,-0.601874,0.489168,0.428162,-2.27757,-2.798545,-1.696672,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Let's create some features which might be useful in the future. 

- create inner product of interests
- create new features concerning difference between 'iid' and 'piid' for 'age', 'imprace', 'imprelig', 'date', 'go_out'. 
- create binary "same value" indicators (1 if 'iid' and 'piid' agree, 0 otherwise) for goal, career, field, race, from

In [59]:
# Create the inner product of the two people's interest ratings: for each row,
# the sum over all interests of (participant rating * partner rating).
col_in=['sports', 'tvsports', 'exercise', 'dining', 'museums', 'art', 'hiking', 'gaming', 'clubbing', 'reading', 'tv', 'theater', 'movies', 'concerts', 'music', 'shopping', 'yoga']
col_in_x=[i+'_x' for i in col_in]
col_in_y=[i+'_y' for i in col_in]
num_in=len(col_in)
sum_in=0

# Accumulate in the same left-to-right order as before so the floating-point
# result is unchanged.
for x_col, y_col in zip(col_in_x, col_in_y):
    sum_in += df_new[x_col] * df_new[y_col]

# Store the finished sum once; previously the column was re-assigned on every
# loop iteration, which only repeated work without changing the final values.
df_new['sum_in_z'] = sum_in

# Signed difference (iid minus piid) for age, imprace, imprelig, date, go_out.
for _feat in ('age', 'imprace', 'imprelig', 'date', 'go_out'):
    df_new[_feat + '_d_z'] = df_new[_feat + '_x'] - df_new[_feat + '_y']

# Equality indicators for goal, career, field, race, from:
# 1 when iid and piid have the same value, 0 otherwise.
for _out, _base in (('goal', 'goal_s'), ('career', 'career_c'),
                    ('field', 'field_cd'), ('race', 'race'), ('from', 'from')):
    df_new[_out + '_d_z'] = 1 * (df_new[_base + '_x'] == df_new[_base + '_y'])

In [60]:
# Delete field_cd, race, career_c and goal for both the '_x' and the '_y' side.
for _side in ('_x', '_y'):
    for _base in ('field_cd', 'race', 'career_c', 'goal'):
        del df_new[_base + _side]

In [63]:
# Delete 'from' and 'undergra' for both the '_x' and the '_y' side.
for _side in ('_x', '_y'):
    for _base in ('from', 'undergra'):
        del df_new[_base + _side]

In [64]:
# Preview df_new after the '_z' pair features were added and the raw columns removed.
df_new.head()

Unnamed: 0,iid,gender_x,piid,match_x,age_x,imprace_x,imprelig_x,date_x,go_out_x,sports_x,tvsports_x,exercise_x,dining_x,museums_x,art_x,hiking_x,gaming_x,clubbing_x,reading_x,tv_x,theater_x,movies_x,concerts_x,music_x,shopping_x,yoga_x,attr1_1_x,sinc1_1_x,intel1_1_x,fun1_1_x,amb1_1_x,shar1_1_x,attr2_1_x,sinc2_1_x,intel2_1_x,fun2_1_x,amb2_1_x,shar2_1_x,attr3_1_x,sinc3_1_x,fun3_1_x,intel3_1_x,amb3_1_x,attr_x,sinc_x,intel_x,fun_x,amb_x,shar_x,like_x,...,f3social_sci_y,f4medical_sci_y,f5engineering_y,f6english_y,f7hist_y,f8business_y,f9edu_y,f10bio_sci_y,f11social_work_y,f12undergrad_y,f13poli_sci_y,f14film_y,f15arts_y,f16languages_y,f17architecture_y,f18other_y,r1black_y,r2european_y,r3latino_y,r4asian_y,r6other_y,c1lawyer_y,c2academic_y,c3psycho_y,c4doctor_y,c5engineer_y,c6entertainment_y,c7banking_y,c8real_estate_y,c9inter_affairs_y,c10undeci_y,c11social_work_y,c12speech_y,c13politics_y,c14sports_y,c15other_y,c16journalism_y,c17architecture_y,goal_s_y,sum_in_z,age_d_z,imprace_d_z,imprelig_d_z,date_d_z,go_out_d_z,goal_d_z,career_d_z,field_d_z,race_d_z,from_d_z
0,11,1,1,0,0.16801,1.112494,-0.246625,0.000811,1.628324,0.618207,0.868264,-1.753154,-0.992693,0.000579,-0.770313,-0.281336,0.397651,-0.69091,0.65867,-1.305903,-1.276804,0.04145,0.051963,0.081866,-0.244435,-1.236412,0.956555,0.385355,-0.041372,0.411631,-1.734237,-1.048468,-0.333824,-1.183653,0.895831,0.276668,1.919426,-1.093664,0.661365,0.506465,-0.454609,-0.37245,-1.479011,-0.086418,0.482822,0.428537,0.862506,0.700388,0.294916,0.490433,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.071589,1.67202,1.748867,-0.351741,-1.369453,2.679321,1,0,0,0,0
1,11,1,2,0,0.16801,1.112494,-0.246625,0.000811,1.628324,0.618207,0.868264,-1.753154,-0.992693,0.000579,-0.770313,-0.281336,0.397651,-0.69091,0.65867,-1.305903,-1.276804,0.04145,0.051963,0.081866,-0.244435,-1.236412,0.956555,0.385355,-0.041372,0.411631,-1.734237,-1.048468,-0.333824,-1.183653,0.895831,0.276668,1.919426,-1.093664,0.661365,0.506465,-0.454609,-0.37245,-1.479011,0.953656,-0.102563,-0.872202,1.38957,0.122142,-0.702463,0.490433,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.254698,0.840611,1.748867,-0.707999,0.016455,2.679321,1,0,0,1,0
2,11,1,3,0,0.16801,1.112494,-0.246625,0.000811,1.628324,0.618207,0.868264,-1.753154,-0.992693,0.000579,-0.770313,-0.281336,0.397651,-0.69091,0.65867,-1.305903,-1.276804,0.04145,0.051963,0.081866,-0.244435,-1.236412,0.956555,0.385355,-0.041372,0.411631,-1.734237,-1.048468,-0.333824,-1.183653,0.895831,0.276668,1.919426,-1.093664,0.661365,0.506465,-0.454609,-0.37245,-1.479011,0.433619,0.482822,-0.872202,-0.718686,0.700388,-0.702463,-0.621637,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,-3.146662,0.563475,-0.376827,-0.351741,1.402363,2.679321,1,0,0,1,0
3,11,1,4,0,0.16801,1.112494,-0.246625,0.000811,1.628324,0.618207,0.868264,-1.753154,-0.992693,0.000579,-0.770313,-0.281336,0.397651,-0.69091,0.65867,-1.305903,-1.276804,0.04145,0.051963,0.081866,-0.244435,-1.236412,0.956555,0.385355,-0.041372,0.411631,-1.734237,-1.048468,-0.333824,-1.183653,0.895831,0.276668,1.919426,-1.093664,0.661365,0.506465,-0.454609,-0.37245,-1.479011,-0.086418,-0.102563,0.428537,0.335442,0.122142,-0.203774,-0.065602,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-5.794991,1.117747,2.103149,0.717034,0.016455,2.679321,1,0,0,1,0
4,11,1,5,0,0.16801,1.112494,-0.246625,0.000811,1.628324,0.618207,0.868264,-1.753154,-0.992693,0.000579,-0.770313,-0.281336,0.397651,-0.69091,0.65867,-1.305903,-1.276804,0.04145,0.051963,0.081866,-0.244435,-1.236412,0.956555,0.385355,-0.041372,0.411631,-1.734237,-1.048468,-0.333824,-1.183653,0.895831,0.276668,1.919426,-1.093664,0.661365,0.506465,-0.454609,-0.37245,-1.479011,-0.086418,0.482822,0.428537,0.862506,0.122142,0.294916,-0.065602,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-2.167924,1.67202,-0.376827,0.717034,0.709409,2.679321,1,0,0,1,0


In [65]:
# Materialize the two partitions: rows where mask is True form the training set,
# the remaining rows form the test set.
dftrain, dftest = df_new[mask], df_new[~mask]