# Datathon

## Pablo Lázaro Herrasti y Manuel Jesús Galán Moreu

### Imports

In [66]:
import os
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings("ignore")

### SKLEARN

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC ### Support Vector Machine
from sklearn.ensemble import RandomForestClassifier ### Random forest

### Reading data

In [2]:
# Folder holding the three CSV files. Set the DATATHON_DATA_DIR environment
# variable to point somewhere else; without it the original location is used.
# os.path.join(..., '') guarantees a trailing separator, which is required
# because the file names below are appended by plain string concatenation.
dir_data = os.path.join(
    os.environ.get(
        'DATATHON_DATA_DIR',
        'C:/Users/Pablo.lazaro.herras1/Documents/Datathon/Datasets/',
    ),
    '',
)
# Despite the `_dir` suffix these are file names, not directories.
elephant_dir = 'elephant.csv'
ring_dir = 'ring.csv'
yeast1_dir = 'yeast1.csv'

In [3]:
# Load the three raw tables with identical parsing options: ';' as the field
# separator and whitespace after each separator skipped.
df_elephant, df_ring, df_yeast1 = (
    pd.read_csv(dir_data + file_name, sep=';', skipinitialspace=True)
    for file_name in (elephant_dir, ring_dir, yeast1_dir)
)

### Showing data

In [4]:
# Size of the raw elephant table, then a preview of its first rows.
print('Number of rows: ' + str(df_elephant.shape[0]))
print('Number of columns: ' + str(df_elephant.shape[1]))
df_elephant.head()

Number of rows: 1391
Number of columns: 231


Unnamed: 0,Atr-1,Atr-2,Atr-3,Atr-4,Atr-5,Atr-6,Atr-7,Atr-8,Atr-9,Atr-10,...,Atr-222,Atr-223,Atr-224,Atr-225,Atr-226,Atr-227,Atr-228,Atr-229,Atr-230,CLASS
0,-28698.0,-624297.0,-679333.0,-455715.0,371213,-415471.0,-549112,-4722.0,363383,33419,...,0,-49855,120.049,-78862,-21452,0,0,-14952,-21097,1
1,363763.0,146879.0,-83422.0,819986.0,504474,167.231,-774634,76555.0,355781,215197,...,0,-49855,129.994,-78862,-21452,0,0,-14952,-21097,1
2,-15592.0,-139025.0,-115.844,246.346,-463596,-138.827,-945256,260.892,-49767,846782,...,0,-49855,31608.0,182763,-21452,0,0,-14952,-21097,1
3,-122.972,-119.093,-364069.0,-132.571,-5645,-740226.0,-468026,-131.328,-82819,857917,...,0,-49855,591019.0,-78862,-21452,0,0,-14952,-21097,1
4,9162.0,265188.0,-252377.0,256.115,-128087,-105623.0,-302654,259.287,-167432,-626084,...,0,-49855,-114025.0,167061,-21452,0,0,-14952,-21097,1


In [13]:
# Size of the raw ring table, then a preview of its first rows.
print('Number of rows: ' + str(df_ring.shape[0]))
print('Number of columns: ' + str(df_ring.shape[1]))
df_ring.head()

Number of rows: 7400
Number of columns: 21


Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A12,A13,A14,A15,A16,A17,A18,A19,A20,Class
0,849,2177,598,1689,3114,-3406,3799,-2642,1578,-181,...,1633,4664,1081,-1172,-166,578,1664,309,-3028,0
1,947,543,782,-449,-8,1316,756,-61,1604,1686,...,-382,1259,608,-2292,1008,2687,-1422,-606,2374,1
2,759,-314,898,-1743,2003,1117,911,136,-489,-144,...,112,598,474,2026,1431,-336,79,1123,302,1
3,531,1374,517,1226,1122,-4,-1227,2277,1083,888,...,1063,2454,587,-744,1216,916,-476,6,-477,1
4,-1443,1065,2071,207,-865,-786,2655,-366,971,-341,...,4195,-117,-2694,14,4097,1356,-944,-602,2348,0


In [14]:
# Size of the raw yeast1 table, then a preview of its first rows.
print('Number of rows: ' + str(df_yeast1.shape[0]))
print('Number of columns: ' + str(df_yeast1.shape[1]))
df_yeast1.head()

Number of rows: 1484
Number of columns: 9


Unnamed: 0,Mcg,Gvh,Alm,Mit,Erl,Pox,Vac,Nuc,Class
0,58,61,47,13,5,0,48,22,negative
1,43,67,48,27,5,0,53,22,negative
2,64,62,49,15,5,0,53,22,negative
3,58,44,57,13,5,0,54,22,positive
4,42,44,48,54,5,0,48,22,negative


### First cleaning

#### Funciones útiles

In [15]:
def obtain_labels(df):
    """Return the class labels stored in the last column of ``df`` as a list.

    Numeric labels are returned unchanged. Non-numeric labels are binarised:
    the value ``'negative'`` becomes 0 and every other value becomes 1.

    The encoding is detected from the dtype of the label column instead of
    from the number of columns in the frame, so any table with numeric labels
    is handled correctly regardless of its width.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the class label.

    Returns
    -------
    list
        One label per row, in row order.
    """
    labels = df.iloc[:, -1]
    if pd.api.types.is_numeric_dtype(labels):
        return list(labels)
    return [0 if label == 'negative' else 1 for label in labels]

In [16]:
def preprocess_dataframe(df):
    """Convert text columns to numbers and drop constant columns.

    Text columns are parsed as numbers after replacing the decimal comma
    with a dot; values that cannot be parsed become NaN. Any column that
    ends up holding a single distinct value (or no rows at all) is removed.

    The input frame is never modified: all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to clean.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        The cleaned frame and the list of column labels that were dropped.
    """
    # Work on a copy so the caller's frame (possibly a slice of a bigger
    # frame) is left untouched.
    df = df.copy()

    eliminate_columns = []
    for column in df.columns:
        values = df[column]
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            # regex=False: the comma is a literal character, not a pattern.
            values = pd.to_numeric(
                values.str.replace(',', '.', regex=False), errors='coerce'
            )
            df[column] = values
        # NaN counts as a value here, so an all-NaN column is also constant.
        if values.nunique(dropna=False) <= 1:
            eliminate_columns.append(column)

    return df.drop(eliminate_columns, axis=1), eliminate_columns

In [26]:
# For each dataset: clean every column except the last one (the class
# column) and extract the labels from the full frame.

# Elephant
data_elephant, eliminate_elephant = preprocess_dataframe(df_elephant.iloc[:, :-1])
labels_elephant = obtain_labels(df_elephant)

# Ring
data_ring, eliminate_ring = preprocess_dataframe(df_ring.iloc[:, :-1])
labels_ring = obtain_labels(df_ring)

# Yeast1
data_yeast1, eliminate_yeast1 = preprocess_dataframe(df_yeast1.iloc[:, :-1])
labels_yeast1 = obtain_labels(df_yeast1)

### Showing data preprocessed

In [18]:
# Width of the cleaned elephant features, then a preview of its first rows.
print('Number of columns: ' + str(data_elephant.shape[1]))
data_elephant.head()

Number of columns: 110


Unnamed: 0,Atr-1,Atr-2,Atr-3,Atr-4,Atr-5,Atr-6,Atr-7,Atr-8,Atr-9,Atr-10,...,Atr-211,Atr-215,Atr-216,Atr-217,Atr-218,Atr-219,Atr-223,Atr-224,Atr-225,Atr-226
0,-0.28698,-0.624297,-0.679333,-0.455715,0.371213,-0.415471,-0.549112,-0.4722,0.363383,0.33419,...,-0.037709,-0.242745,-0.173706,-0.308654,165.218,-0.017565,-0.049855,120.049,-0.078862,-0.021452
1,0.363763,0.146879,-0.83422,0.819986,0.504474,167.231,-0.774634,0.76555,0.355781,0.215197,...,-0.037709,-0.265739,-0.149831,-0.321915,0.817139,-0.017565,-0.049855,129.994,-0.078862,-0.021452
2,-0.15592,-0.139025,-115.844,246.346,-0.463596,-138.827,-0.945256,260.892,-0.049767,0.846782,...,-0.037709,-0.265739,0.502075,-0.399635,-0.102856,-0.017565,-0.049855,0.031608,0.182763,-0.021452
3,-122.972,-119.093,-0.364069,-132.571,-0.005645,-0.740226,-0.468026,-131.328,-0.082819,0.857917,...,-0.037709,-0.265739,-0.337459,-0.311496,11.019,-0.017565,-0.049855,0.591019,-0.078862,-0.021452
4,0.9162,0.265188,-0.252377,256.115,-0.128087,-0.105623,-0.302654,259.287,-0.167432,-0.626084,...,-0.037709,-0.265739,133.835,-0.378855,-0.102856,-0.017565,-0.049855,-0.114025,0.167061,-0.021452


In [19]:
# Width of the cleaned ring features, then a preview of its first rows.
print('Number of columns: ' + str(data_ring.shape[1]))
data_ring.head()

Number of columns: 20


Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,A17,A18,A19,A20
0,849,2177,598,1689,3114,-3406,3799,-2642,1578,-181,-212,1633,4664,1081,-1172,-166,578,1664,309,-3028
1,947,543,782,-449,-8,1316,756,-61,1604,1686,264,-382,1259,608,-2292,1008,2687,-1422,-606,2374
2,759,-314,898,-1743,2003,1117,911,136,-489,-144,982,112,598,474,2026,1431,-336,79,1123,302
3,531,1374,517,1226,1122,-4,-1227,2277,1083,888,-111,1063,2454,587,-744,1216,916,-476,6,-477
4,-1443,1065,2071,207,-865,-786,2655,-366,971,-341,221,4195,-117,-2694,14,4097,1356,-944,-602,2348


In [20]:
# Width of the cleaned yeast1 features, then a preview of its first rows.
print('Number of columns: ' + str(data_yeast1.shape[1]))
data_yeast1.head()

Number of columns: 8


Unnamed: 0,Mcg,Gvh,Alm,Mit,Erl,Pox,Vac,Nuc
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22


### Correlation data

`topn_corr` devuelve un diccionario cuya clave es cada atributo y cuyo valor es una tupla con dos listas: los `top_n` atributos más correlacionados con él y sus respectivos coeficientes de correlación.

In [53]:
def calculate_corr(df):
    """Return the pairwise correlation matrix of the columns of ``df``.

    Thin wrapper around ``DataFrame.corr`` called with its default arguments.
    """
    correlation_matrix = df.corr()
    return correlation_matrix

In [54]:
def topn_corr(df, top_n):
    """Map every attribute to its ``top_n`` most correlated attributes.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; correlations are computed with ``calculate_corr``.
    top_n : int
        How many other attributes to report per attribute.

    Returns
    -------
    dict
        ``{attribute: (names, coefficients)}`` where ``names`` lists the other
        attributes with the largest correlation coefficients, in descending
        order, and ``coefficients`` lists the matching values.
    """
    correlation = calculate_corr(df)
    all_corr = {}
    for column in correlation.columns:
        # Remove the attribute itself by label before ranking. Skipping the
        # first ranked entry instead would discard the wrong attribute when
        # another one is tied with the self-correlation of 1.0.
        top = correlation[column].drop(column).nlargest(top_n)
        all_corr[column] = (list(top.index), list(top))

    return all_corr

In [55]:
# Correlation matrix of the cleaned features of each dataset.
corr_elephant, corr_ring, corr_yeast1 = map(
    calculate_corr, (data_elephant, data_ring, data_yeast1)
)

In [56]:
def eliminate_corr_features(df, corr, threshold):
    """Drop features that are strongly correlated with an earlier feature.

    A column is removed when the absolute value of its correlation with any
    column that precedes it in ``corr`` is at least ``threshold``. Using the
    absolute value means strongly negative correlations are treated as
    redundant too.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to reduce.
    corr : pandas.DataFrame
        Square correlation matrix whose column labels name columns of ``df``.
    threshold : float
        Minimum absolute correlation at which a feature counts as redundant.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the retained columns, in their original order.
    """
    strength = corr.abs().to_numpy()
    # Strict upper triangle: entry (i, j) with i < j compares column j with
    # an earlier column i, so the first column of a correlated pair is kept.
    exceeds = np.triu(strength >= threshold, k=1)
    is_redundant = exceeds.any(axis=0)

    # Select by label rather than by position so the result stays correct
    # even if `df` has columns that do not appear in `corr`.
    redundant = set(corr.columns[is_redundant])
    kept = [name for name in df.columns if name not in redundant]
    return df[kept]

In [57]:
# Drop redundant features from each dataset using a 0.8 correlation threshold.
data_elephant_nocorr, data_ring_nocorr, data_yeast1_nocorr = (
    eliminate_corr_features(features, correlation, 0.8)
    for features, correlation in (
        (data_elephant, corr_elephant),
        (data_ring, corr_ring),
        (data_yeast1, corr_yeast1),
    )
)

### Data split

In [58]:
# Features (X) and labels (Y) per dataset: 1 = elephant, 2 = ring, 3 = yeast1.
X1, Y1 = data_elephant_nocorr, labels_elephant
X2, Y2 = data_ring_nocorr, labels_ring
X3, Y3 = data_yeast1_nocorr, labels_yeast1

In [59]:
# 80/20 train/test split per dataset, stratified on the labels and seeded
# so the partition is repeatable.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1, Y1,
    test_size=0.2, random_state=30, stratify=Y1,
)
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, Y2,
    test_size=0.2, random_state=30, stratify=Y2,
)
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X3, Y3,
    test_size=0.2, random_state=30, stratify=Y3,
)

### Support Vector Machine

In [45]:
# Standardise the features, then classify with a support vector machine.
steps = [
    ('scaler', StandardScaler()),
    ('SVM', SVC()),
]
pipeline = Pipeline(steps=steps)

In [60]:
# Hyper-parameter grid for the 'SVM' pipeline step, searched with 5-fold
# cross-validation.
parameters = {
    # NOTE(review): 10e5 evaluates to 1,000,000 -- confirm this is the
    # intended upper bound for C.
    'SVM__C': [0.001, 0.1, 10, 100, 10e5],
    'SVM__gamma': [0.1, 0.01],
}
grid = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=5)

In [61]:
# Run the SVM grid search on the elephant training split, then report the
# score on the held-out split and the selected hyper-parameters.
grid.fit(X_train1, y_train1)
print("Elephant score = %3.2f" % grid.score(X_test1, y_test1))
print(grid.best_params_)

Elephant score = 0.75
{'SVM__C': 10, 'SVM__gamma': 0.01}


In [62]:
# Run the SVM grid search on the ring training split, then report the score
# on the held-out split and the selected hyper-parameters.
grid.fit(X_train2, y_train2)
print("Ring score = %3.2f" % grid.score(X_test2, y_test2))
print(grid.best_params_)

Ring score = 0.98
{'SVM__C': 0.1, 'SVM__gamma': 0.1}


In [63]:
# Run the SVM grid search on the yeast1 training split, then report the
# score on the held-out split and the selected hyper-parameters.
grid.fit(X_train3, y_train3)
print("Yeast1 score = %3.2f" % grid.score(X_test3, y_test3))
print(grid.best_params_)

KeyboardInterrupt: 

### Random forest

In [68]:
# Standardise the features, then classify with a random forest.
# The forest is seeded (same seed as the train/test split) so that repeated
# runs of the notebook give the same model and scores.
steps = [
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(random_state=30)),
]
pipeline = Pipeline(steps)

In [82]:
# Hyper-parameter grid for the 'clf' pipeline step.
# 5 * 5 * 5 * 4 = 500 combinations in total.
parameters = {
    'clf__n_estimators': [100, 300, 500, 800, 1200],
    'clf__max_depth': [5, 8, 15, 25, 30],
    'clf__min_samples_split': [2, 5, 10, 15, 100],
    'clf__min_samples_leaf': [1, 2, 5, 10],
}

In [83]:
# Exhaustive search over `parameters` with 5-fold cross-validation.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
)

In [None]:
# Run the grid search on the elephant training split, then report the score
# on the held-out split and the selected hyper-parameters.
grid.fit(X_train1, y_train1)
print("Elephant score = %3.2f" % grid.score(X_test1, y_test1))
print(grid.best_params_)

In [None]:
# `clf` was never defined in this notebook, so the cell failed on a fresh
# kernel. Use the grid search object, as the elephant cell does.
grid.fit(X_train2, y_train2)
y_pred2 = grid.predict(X_test2)
print("Accuracy:", metrics.accuracy_score(y_test2, y_pred2))

In [None]:
# `clf` was never defined in this notebook, so the cell failed on a fresh
# kernel. Use the grid search object, as the elephant cell does.
grid.fit(X_train3, y_train3)
y_pred3 = grid.predict(X_test3)
print("Accuracy:", metrics.accuracy_score(y_test3, y_pred3))