# Import Pandas and Rename the Columns 

In [None]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt

benchmark_data = pd.read_csv('sklearn-benchmark-data.tsv.gz', sep='\t')
benchmark_data.head()


In [5]:
benchmark_data.rename(columns={'tae':'Dataset_Name',
                               'LinearSVC':'Method_Name',
                               'C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1':'Parameters',
                               '0.421052631579':'Test_Score'},inplace=True)
benchmark_data.head()

Unnamed: 0,Dataset_Name,Method_Name,Parameters,Test_Score
0,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.578947
1,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.5
2,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.473684
3,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.5
4,tae,LinearSVC,"C=0.01,loss=hinge,penalty=l2,dual=True,tol=0.1",0.578947


# Preliminary Analysis

In [None]:
benchmark_data.describe()

In [None]:
benchmark_data.shape

# List all the methods so as to divide the dataset into multiple dataframes


In [None]:
names_list=benchmark_data['Dataset_Name'].unique().tolist()
methods_list=benchmark_data['Method_Name'].unique().tolist()
names_list
methods_list


In [None]:
len(methods_list)

In [12]:
#split the dataframe into multiple dataframes by the method name
methods_list=benchmark_data['Method_Name'].unique().tolist()
MethodWiseData={}
for method_name in methods_list:
    MethodWiseData[method_name] = benchmark_data[benchmark_data.Method_Name==method_name]
    

# Make a folder to save the file

In [14]:
# make a folder and save a file
import os
if not os.path.isdir('Benchmark_Results_18thApril2016'):
    os.mkdir('Benchmark_Results_18thApril2016') 

MethodWiseData['LogisticRegression'].to_pickle('Benchmark_Results_18thApril2016/LogisticRegression.tsv.gz')


In [17]:
# Read the file
Method_Type = pd.read_pickle('Benchmark_Results_18thApril2016/LogisticRegression.tsv.gz')
print (Method_Type)
Method_Type.head()

         Dataset_Name         Method_Name          Parameters  Test_Score
342419           wine  LogisticRegression   C=0.01,penalty=l1    0.333333
342420           wine  LogisticRegression   C=0.01,penalty=l1    0.333333
342421           wine  LogisticRegression   C=0.01,penalty=l1    0.333333
342422           wine  LogisticRegression   C=0.01,penalty=l1    0.333333
342423           wine  LogisticRegression   C=0.01,penalty=l1    0.333333
342424           wine  LogisticRegression   C=0.01,penalty=l1    0.333333
342425           wine  LogisticRegression   C=0.01,penalty=l1    0.333333
342426           wine  LogisticRegression   C=0.01,penalty=l1    0.333333
342427           wine  LogisticRegression   C=0.01,penalty=l1    0.333333
342428           wine  LogisticRegression   C=0.01,penalty=l1    0.333333
342429           wine  LogisticRegression   C=0.01,penalty=l1    0.333333
342430           wine  LogisticRegression   C=0.01,penalty=l1    0.333333
342431           wine  LogisticRegress

Unnamed: 0,Dataset_Name,Method_Name,Parameters,Test_Score
342419,wine,LogisticRegression,"C=0.01,penalty=l1",0.333333
342420,wine,LogisticRegression,"C=0.01,penalty=l1",0.333333
342421,wine,LogisticRegression,"C=0.01,penalty=l1",0.333333
342422,wine,LogisticRegression,"C=0.01,penalty=l1",0.333333
342423,wine,LogisticRegression,"C=0.01,penalty=l1",0.333333


Note: To run the code for all parameters, it is required to change in the above section the method name appropriately. Also below the number of parameters have to be inputted approprietly and the parameter name has to be changed as well to achieve the data cleaning for all methods.

# Split the Parameters columns (Every method has different number of parameters)

In [18]:
Param_Split = pd.DataFrame(Method_Type.Parameters.str.split(',').tolist(),
                                   columns = ['Param1','Param2'])
#Param_Split

In [19]:
Method_Type1 = Method_Type.drop('Parameters', 1)    #delete the Parameters column from the original dataframe
index = Param_Split.index.get_values()              #get the index of the parameter dataframe  
Method_Type2 = Method_Type1.set_index(index)          #set the index of method dataframe same as parameter dataframe
result = pd.concat([Method_Type2, Param_Split], axis = 1)    #finally add the parameter columns to get the result (desired format)
#result

In [20]:
# Split the Parameter Column to process it further and have only numeric values in it
data_split1 = pd.DataFrame(result.Param1.str.split('=').tolist(),
                                   columns = ['Param_Name','C'])
data_split2 = pd.DataFrame(result.Param2.str.split('=').tolist(),
                                   columns = ['Param_Name','penalty'])
#data_split3 = pd.DataFrame(result.Param3.str.split('=').tolist(),
#                                   columns = ['Param_Name','criterion'])
#data_split4 = pd.DataFrame(result.Param4.str.split('=').tolist(),
#                                   columns = ['Param_Name','kernel'])
#data_split5 = pd.DataFrame(result.Param5.str.split('=').tolist(),
#                                   columns = ['Param_Name','degree'])
#data_split6 = pd.DataFrame(result.Param5.str.split('=').tolist(),
#                                   columns = ['Param_Name','warm_start'])

# Delete the Parameters column from the original dataframe
method_data1 = result.drop('Param1',1)  
method_data1 = method_data1.drop('Param2',1)
#method_data1 = method_data1.drop('Param3',1)
#method_data1 = method_data1.drop('Param4',1)
#method_data1 = method_data1.drop('Param5',1)
#method_data1 = method_data1.drop('Param6',1)

data_split1 = data_split1.drop('Param_Name',1)
data_split2 = data_split2.drop('Param_Name',1)
#data_split3 = data_split3.drop('Param_Name',1)
#data_split4 = data_split4.drop('Param_Name',1)
#data_split5 = data_split5.drop('Param_Name',1)
#data_split6 = data_split6.drop('Param_Name',1)

idx = data_split1.index.get_values()               #get the index of the parameter dataframe  
method_data2 = method_data1.set_index(idx)         #set the index of method dataframe same as parameter dataframe
cleaned_data = pd.concat([method_data2, data_split1, data_split2], axis = 1)
#finally add the parameter columns to get the result (desired format)

# You must cast the data as a float type -- it was parsed into a string type
#cleaned_data['n_estimators'] = cleaned_data['n_estimators'].astype(float)
#cleaned_data['learning_rate'] = cleaned_data['learning_rate'].astype(float)
#cleaned_data['warm_start'] = cleaned_data['warm_start'].astype(float)
#cleaned_data['criterion'] = cleaned_data['criterion'].astype(float)
#cleaned_data['warm_start'] = cleaned_data['warm_start'].astype(float)

#cleaned_data

# Save this file of a method and parameters organized in columns


In [21]:
import os
if not os.path.isdir('Cleaned_Method_Wise_Data_18thApril2016'):
    os.mkdir('Cleaned_Method_Wise_Data_18thApril2016')
cleaned_data.to_pickle('Cleaned_Method_Wise_Data_18thApril2016/LogisticRegression_cleaned.tsv.gz')

In [23]:
# Read the file
Cleaned_Data = pd.read_pickle('Cleaned_Method_Wise_Data_18thApril2016/LogisticRegression_cleaned.tsv.gz')
Cleaned_Data

Unnamed: 0,Dataset_Name,Method_Name,Test_Score,C,penalty
0,wine,LogisticRegression,0.333333,0.01,l1
1,wine,LogisticRegression,0.333333,0.01,l1
2,wine,LogisticRegression,0.333333,0.01,l1
3,wine,LogisticRegression,0.333333,0.01,l1
4,wine,LogisticRegression,0.333333,0.01,l1
5,wine,LogisticRegression,0.333333,0.01,l1
6,wine,LogisticRegression,0.333333,0.01,l1
7,wine,LogisticRegression,0.333333,0.01,l1
8,wine,LogisticRegression,0.333333,0.01,l1
9,wine,LogisticRegression,0.333333,0.01,l1
