# Import Pandas and Rename the Columns

In [14]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt

benchmark_data = pd.read_csv('sklearn-benchmark-data.tsv.gz', sep='\t')
benchmark_data.head()

Unnamed: 0,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.421052631579
0,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.578947
1,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.5
2,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.473684
3,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.5
4,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.578947


In [15]:
benchmark_data.rename(columns={'tae':'Dataset_Name',
                               'LinearSVC':'Method_Name',
                               'C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1':'Parameters',
                               '0.421052631579':'Test_Score'},inplace=True)
benchmark_data.head()

Unnamed: 0,Dataset_Name,Method_Name,Parameters,Test_Score
0,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.578947
1,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.5
2,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.473684
3,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.5
4,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.578947


# Preliminary Analysis

In [4]:
benchmark_data.describe()

Unnamed: 0,Test_Score
count,27471660.0
mean,0.725256
std,0.229029
min,0.0
25%,0.575658
50%,0.769231
75%,0.920455
max,1.0


In [4]:
benchmark_data.shape

(27471660, 4)

# List all the methods so as to divide the dataset into multiple dataframes

In [16]:
names_list=benchmark_data['Dataset_Name'].unique().tolist()
methods_list=benchmark_data['Method_Name'].unique().tolist()
names_list
methods_list

['LinearSVC',
 'ExtraTreesClassifier',
 'SVC',
 'DecisionTreeClassifier',
 'GradientBoostingClassifier',
 'AdaBoostClassifier',
 'KNeighborsClassifier',
 'MultinomialNB',
 'XGBClassifier',
 'LogisticRegression',
 'SGDClassifier',
 'RandomForestClassifier']

In [6]:
len(methods_list)

12

In [17]:
#split the dataframe into multiple dataframes by the method name
methods_list=benchmark_data['Method_Name'].unique().tolist()
MethodWiseData={}
for method_name in methods_list:
    MethodWiseData[method_name] = benchmark_data[benchmark_data.Method_Name==method_name]

# Make a folder to save the file

In [18]:
# make a folder and save a file
import os
if not os.path.isdir('Benchmark_Results_6thMay2016'):
    os.mkdir('Benchmark_Results_6thMay2016') 

MethodWiseData['LinearSVC'].to_pickle('Benchmark_Results_6thMay2016/LinearSVC.tsv.gz')

In [19]:
# Read the file
Method_Type = pd.read_pickle('Benchmark_Results_6thMay2016/LinearSVC.tsv.gz')
print (Method_Type)
Method_Type.head()

            Dataset_Name Method_Name  \
0                    tae   LinearSVC   
1                    tae   LinearSVC   
2                    tae   LinearSVC   
3                    tae   LinearSVC   
4                    tae   LinearSVC   
5                    tae   LinearSVC   
6                    tae   LinearSVC   
7                    tae   LinearSVC   
8                    tae   LinearSVC   
9                    tae   LinearSVC   
10                   tae   LinearSVC   
11                   tae   LinearSVC   
12                   tae   LinearSVC   
13                   tae   LinearSVC   
14                   tae   LinearSVC   
15                   tae   LinearSVC   
16                   tae   LinearSVC   
17                   tae   LinearSVC   
18                   tae   LinearSVC   
19                   tae   LinearSVC   
20                   tae   LinearSVC   
21                   tae   LinearSVC   
22                   tae   LinearSVC   
23                   tae   LinearSVC   


Unnamed: 0,Dataset_Name,Method_Name,Parameters,Test_Score
0,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.578947
1,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.5
2,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.473684
3,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.5
4,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.578947


Note: To run the code for all parameters, it is required to change in the above section the method name appropriately. Also below the number of parameters have to be inputted approprietly and the parameter name has to be changed as well to achieve the data cleaning for all methods

# Split the Parameters columns (Every method has different number of parameters)

In [20]:
Param_Split = pd.DataFrame(Method_Type.Parameters.str.split(',').tolist(),
                                   columns = ['Param1','Param2', 'Param3', 'Param4', 'Param5'])
Param_Split.head

<bound method NDFrame.head of          Param1              Param2      Param3      Param4     Param5
0        C=0.01          loss=hinge  penalty=l2   dual=True    tol=0.1
1        C=0.01          loss=hinge  penalty=l2   dual=True    tol=0.1
2        C=0.01          loss=hinge  penalty=l2   dual=True    tol=0.1
3        C=0.01          loss=hinge  penalty=l2   dual=True    tol=0.1
4        C=0.01          loss=hinge  penalty=l2   dual=True    tol=0.1
5        C=0.01          loss=hinge  penalty=l2   dual=True    tol=0.1
6        C=0.01          loss=hinge  penalty=l2   dual=True    tol=0.1
7        C=0.01          loss=hinge  penalty=l2   dual=True    tol=0.1
8        C=0.01          loss=hinge  penalty=l2   dual=True    tol=0.1
9        C=0.01          loss=hinge  penalty=l2   dual=True    tol=0.1
10       C=0.01          loss=hinge  penalty=l2   dual=True    tol=0.1
11       C=0.01          loss=hinge  penalty=l2   dual=True    tol=0.1
12       C=0.01          loss=hinge  penalty=l2

In [21]:
Method_Type1 = Method_Type.drop('Parameters', 1)    #delete the Parameters column from the original dataframe
index = Param_Split.index.get_values()              #get the index of the parameter dataframe  
Method_Type2 = Method_Type1.set_index(index)          #set the index of method dataframe same as parameter dataframe
result = pd.concat([Method_Type2, Param_Split], axis = 1)    #finally add the parameter columns to get the result (desired format)
result.head()

Unnamed: 0,Dataset_Name,Method_Name,Test_Score,Param1,Param2,Param3,Param4,Param5
0,tae,LinearSVC,0.578947,C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1
1,tae,LinearSVC,0.5,C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1
2,tae,LinearSVC,0.473684,C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1
3,tae,LinearSVC,0.5,C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1
4,tae,LinearSVC,0.578947,C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1


In [22]:
# Split the Parameter Column to process it further and have only numeric values in it
data_split1 = pd.DataFrame(result.Param1.str.split('=').tolist(),
                                   columns = ['Param_Name','C'])
data_split2 = pd.DataFrame(result.Param2.str.split('=').tolist(),
                                   columns = ['Param_Name','loss'])
data_split3 = pd.DataFrame(result.Param3.str.split('=').tolist(),
                                   columns = ['Param_Name','penalty'])
data_split4 = pd.DataFrame(result.Param4.str.split('=').tolist(),
                                   columns = ['Param_Name','dual'])
data_split5 = pd.DataFrame(result.Param5.str.split('=').tolist(),
                                   columns = ['Param_Name','tol'])
#data_split6 = pd.DataFrame(result.Param5.str.split('=').tolist(),
#                                   columns = ['Param_Name','warm_start'])

# Delete the Parameters column from the original dataframe
method_data1 = result.drop('Param1',1)  
method_data1 = method_data1.drop('Param2',1)
method_data1 = method_data1.drop('Param3',1)
method_data1 = method_data1.drop('Param4',1)
method_data1 = method_data1.drop('Param5',1)
#method_data1 = method_data1.drop('Param6',1)

data_split1 = data_split1.drop('Param_Name',1)
data_split2 = data_split2.drop('Param_Name',1)
data_split3 = data_split3.drop('Param_Name',1)
data_split4 = data_split4.drop('Param_Name',1)
data_split5 = data_split5.drop('Param_Name',1)
#data_split6 = data_split6.drop('Param_Name',1)

idx = data_split1.index.get_values()               #get the index of the parameter dataframe  
method_data2 = method_data1.set_index(idx)         #set the index of method dataframe same as parameter dataframe
cleaned_data = pd.concat([method_data2, data_split1, data_split2, data_split3, data_split4, data_split5], axis = 1)
#finally add the parameter columns to get the result (desired format)

# You must cast the data as a float type -- it was parsed into a string type
cleaned_data['C'] = cleaned_data['C'].astype(float)
cleaned_data['tol'] = cleaned_data['tol'].astype(float)
#cleaned_data['warm_start'] = cleaned_data['warm_start'].astype(float)
#cleaned_data['criterion'] = cleaned_data['criterion'].astype(float)
#cleaned_data['warm_start'] = cleaned_data['warm_start'].astype(float)

cleaned_data.head()

Unnamed: 0,Dataset_Name,Method_Name,Test_Score,C,loss,penalty,dual,tol
0,tae,LinearSVC,0.578947,0.01,hinge,l2,True,0.1
1,tae,LinearSVC,0.5,0.01,hinge,l2,True,0.1
2,tae,LinearSVC,0.473684,0.01,hinge,l2,True,0.1
3,tae,LinearSVC,0.5,0.01,hinge,l2,True,0.1
4,tae,LinearSVC,0.578947,0.01,hinge,l2,True,0.1


# Save this file of a method and parameters organized in columns

In [23]:
import os
if not os.path.isdir('Cleaned_Method_Wise_Data_6thMay2016'):
    os.mkdir('Cleaned_Method_Wise_Data_6thMay2016')
cleaned_data.to_pickle('Cleaned_Method_Wise_Data_6thMay2016/LinearSVC_cleaned.tsv.gz')

In [24]:
# Read the file
Cleaned_Data = pd.read_pickle('Cleaned_Method_Wise_Data_6thMay2016/LinearSVC_cleaned.tsv.gz')
Cleaned_Data.head()

Unnamed: 0,Dataset_Name,Method_Name,Test_Score,C,loss,penalty,dual,tol
0,tae,LinearSVC,0.578947,0.01,hinge,l2,True,0.1
1,tae,LinearSVC,0.5,0.01,hinge,l2,True,0.1
2,tae,LinearSVC,0.473684,0.01,hinge,l2,True,0.1
3,tae,LinearSVC,0.5,0.01,hinge,l2,True,0.1
4,tae,LinearSVC,0.578947,0.01,hinge,l2,True,0.1
