# A5 - A Machine Learning Competition

# GET DATA

In [1]:
from pandas import read_csv

# CHANGER L'INPUT #
train_df = read_csv("ML-A5-2022_train.csv", index_col=0)
test_df = read_csv("ML-A5-2022_test.csv", index_col=0)

# Prendre toutes les columns de données (1000 row x 34979 col)
X_train = train_df.iloc[:,:-1]
X_test = test_df

# Prendre derniere column label (1 cellule tumeur) ou (-1 cellule en santé)
y_train = train_df['label']

# Prendre les index du testset
labels = X_test.index.to_list()

%store X_train
%store y_train
%store X_test

Stored 'X_train' (DataFrame)
Stored 'y_train' (Series)
Stored 'X_test' (DataFrame)


# PREPROCESS

In [None]:
%store -r train_df
%store -r X
%store -r y

print(X_train.shape)
print(X_test.shape)
print(X_train.head())
print(X_test.head())

In [2]:
from sklearn.preprocessing import OrdinalEncoder

%store -r X_train
%store -r X_test

#Remplacer les NaN par la mediane
X_train.fillna(X_train.median(numeric_only=True), inplace=True)
X_test.fillna(X_test.median(numeric_only=True), inplace=True)

# Remplacer les valeurs texte par des int
transformer = OrdinalEncoder()
X_train = transformer.fit_transform(X_train)

transformer_test = OrdinalEncoder()
X_test = transformer_test.fit_transform(X_test)

%store X_train
%store X_test

Stored 'X_train' (ndarray)
Stored 'X_test' (ndarray)


# FIND BEST PARAMS FOR MODEL

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDClassifier 

%store -r X_train
%store -r y_train

loss = ['hinge', 'log', 'modified_huber', 'squared_hinge',  
'perceptron'] 
penalty = ['l1', 'l2', 'elasticnet'] 
alpha = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100] 
learning_rate = ['constant', 'optimal', 'invscaling', 'adaptive'] 
eta0 = [1, 10, 100] 
param_distributions = dict(loss=loss, 
                            penalty=penalty, 
                            alpha=alpha, 
                            learning_rate=learning_rate, 
                            eta0=eta0) 

sgd = SGDClassifier(loss="hinge", penalty="l2", max_iter=5) 
random = RandomizedSearchCV(estimator=sgd, 
                            param_distributions=param_distributions, 
                            verbose=1, n_jobs=-1, 
                            n_iter=10, error_score='raise') 


random_result = random.fit(X_train, y_train) 
print('Best Score: ', random_result.best_score_) 
print('Best Params: ', random_result.best_params_) 

# Fitting 5 folds for each of 10 candidates, totalling 50 fits
# Best Score:  0.6852563793860644
# Best Params:  {'penalty': 'elasticnet', 'loss': 'modified_huber', 'learning_rate': 'optimal', 'eta0': 10, 'alpha': 0.0001}

# Fitting 5 folds for each of 100 candidates, totalling 500 fits
# Best Score:  0.696127216694683
# Best Params:  {'penalty': 'l1', 'loss': 'squared_hinge', 'learning_rate': 'optimal', 'eta0': 100, 'alpha': 0.001}

# Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
# Best Score:  0.7007915966259295
# Best Params:  {'penalty': 'l2', 'loss': 'log', 'learning_rate': 'optimal', 'eta0': 1, 'alpha': 100}

# Fitting 5 folds for each of 10 candidates, totalling 50 fits
# Best Score:  0.73
# Best Params:  {'penalty': 'l2', 'loss': 'modified_huber', 'learning_rate': 'invscaling', 'eta0': 1, 'alpha': 0.001} BEST

# Fitting 5 folds for each of 10 candidates, totalling 50 fits
# Best Score:  0.72
# Best Params:  {'penalty': 'l2', 'loss': 'perceptron', 'learning_rate': 'adaptive', 'eta0': 100, 'alpha': 0.01}

# Fitting 5 folds for each of 10 candidates, totalling 50 fits
# Best Score:  0.731
# Best Params:  {'penalty': 'l1', 'loss': 'squared_hinge', 'learning_rate': 'constant', 'eta0': 10, 'alpha': 0.001}

# Fitting 5 folds for each of 480 candidates, totalling 2400 fits
# Best Score:  0.728
# Best Params:  {'alpha': 0.01, 'eta0': 100, 'learning_rate': 'constant', 'loss': 'hinge', 'penalty': 'l2'}

# DEFINE THE MODEL

In [3]:
import pandas as pd
from sklearn.metrics import balanced_accuracy_score
from sklearn.linear_model import SGDClassifier 


# Hyperparameters of the final classifier (they match the search run marked
# BEST in the notes of the hyperparameter-search cell).
sgd_params = dict(
    loss="modified_huber",
    penalty="l2",
    learning_rate='invscaling',
    eta0=1,
    alpha=0.001,
    max_iter=100,
    random_state=0,
)
model = SGDClassifier(**sgd_params)
model.fit(X_train, y_train)

# Accuracy measured on the training data itself, not a generalisation estimate.
train_accuracy = model.score(X_train, y_train)
print(train_accuracy)

y_pred = model.predict(X_test)

# Number of test samples predicted in each class: 1 first, then -1.
for target_class in (1, -1):
    print((y_pred == target_class).sum())

# Write one row per test sample: its identifier (unnamed column) and the
# predicted class.
df = pd.DataFrame({'': labels, 'Prediction': y_pred})
df.to_csv('prediction.csv', index=False)

1.0
103
397


# K-FOLD BCR (BALANCED CLASSIFICATION RATE)

In [4]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier 
import numpy as np

# Stratified K-fold estimate of the balanced accuracy of the chosen model.
# Expects X_train as a 2-D array and y_train as a pandas Series.
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True,
                     random_state=0)

acc = []  # balanced accuracy on each held-out fold
sco = []  # balanced accuracy on each training fold
for i, (train_index, test_index) in enumerate(kf.split(X_train, y_train), start=1):
    # Progress counter, 1-based so that it ends on "5/5".
    print(f"{i}/{n_splits} split", end='\r')

    # A dedicated name keeps the model fitted on the full training set
    # (`model`) from being overwritten by a fold model.
    fold_model = SGDClassifier(loss="modified_huber", penalty="l2", max_iter=100,
                               learning_rate='invscaling', eta0=1, alpha=0.001,
                               random_state=0)

    X_traink, X_testk = X_train[train_index, :], X_train[test_index, :]
    # The split yields positions, not index labels: y_train keeps the index
    # read from the CSV, so it must be indexed positionally with .iloc.
    y_traink, y_testk = y_train.iloc[train_index], y_train.iloc[test_index]

    fold_model.fit(X_traink, y_traink)
    predtrain = fold_model.predict(X_traink)
    pred = fold_model.predict(X_testk)
    sco.append(balanced_accuracy_score(y_traink, predtrain))
    ac = balanced_accuracy_score(y_testk, pred)
    acc.append(ac)


# This is a plain (non-nested) stratified K-fold, so the banner says so.
header = "=== Stratified K-Fold Cross-Validation Scores ==="
print(header)
print("Mean balanced accuracy: "+ str(round(np.mean(acc), 2)))
print("Mean balanced accuracy train: "+ str(round(np.mean(sco), 2)))
print("Std balanced accuracy: "+ str(round(np.std(acc), 2)))
print('=' * len(header))

# Mean balanced accuracy: 0.64 BEST
# Std balanced accuracy: 0.03

=== Nested K-Fold Cross-Validation Scores ===
Mean balanced accuracy: 0.64
Mean balanced accuracy train: 1.0
Std balanced accuracy: 0.03


# FINAL CODE

In [None]:
from pandas import read_csv
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDClassifier 
import pandas as pd
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

# LOAD THE DATA
train_df = read_csv("ML-A5-2022_train.csv", index_col=0)
test_df = read_csv("ML-A5-2022_test.csv", index_col=0)

# Features: every training column except the last one, which holds the label
# (1000 rows x 34979 columns). The test frame is used as-is.
X_train = train_df.iloc[:, :-1]
X_test = test_df

# Target: the 'label' column (1 = tumour cell, -1 = healthy cell).
y_train = train_df['label']

# Row identifiers of the test set, reused when the predictions are written.
labels = X_test.index.to_list()

# PREPROCESS

# Columns holding text (non-numeric) values: the only ones that need encoding.
text_columns = X_train.select_dtypes(exclude="number").columns

# Replace NaN with the per-column median. The medians are computed on the
# training set only and reused for the test set, so both sets go through the
# exact same transformation. fillna returns new frames, which leaves train_df
# and test_df untouched.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Replace text values with integer codes. The encoder is fitted on the training
# set and then applied to the test set, so a given value maps to the same code
# in both (fitting a second encoder on the test set would assign unrelated
# codes). Categories that only appear in the test set are encoded as -1.
transformer = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
if len(text_columns) > 0:
    X_train[text_columns] = transformer.fit_transform(X_train[text_columns])
    X_test[text_columns] = transformer.transform(X_test[text_columns])

# Kept for backward compatibility: the test set shares the train-fitted encoder.
transformer_test = transformer

# The cross-validation code below indexes the features positionally
# (X_train[rows, :]), so hand the data over as plain float arrays, as before.
X_train = X_train.to_numpy(dtype=float)
X_test = X_test.to_numpy(dtype=float)

# FIND HYPERPARAMETERS
def get_param(n_iter=10, random_state=0):
    """Run a randomized hyperparameter search for ``SGDClassifier``.

    The search is fitted on the module-level ``X_train`` / ``y_train`` and
    prints the best cross-validated score and the matching parameters.

    Candidates are ranked on balanced accuracy, the metric used by the K-fold
    evaluation in this file, and the base estimator uses the same iteration
    budget as the final model (``max_iter=100``).

    Parameters
    ----------
    n_iter : int, default=10
        Number of parameter combinations sampled by the search.
    random_state : int or None, default=0
        Seed shared by the base estimator and the candidate sampling, so that
        repeated calls give the same result.

    Returns
    -------
    dict
        The best parameter combination found (``best_params_``).
    """
    # NOTE: scikit-learn >= 1.1 names the logistic loss 'log_loss' ('log' was
    # removed in 1.3); rename the entry when running on a recent version.
    loss = ['hinge', 'log', 'modified_huber', 'squared_hinge',
            'perceptron']
    penalty = ['l1', 'l2', 'elasticnet']
    alpha = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
    learning_rate = ['constant', 'optimal', 'invscaling', 'adaptive']
    eta0 = [1, 10, 100]
    param_distributions = dict(loss=loss,
                               penalty=penalty,
                               alpha=alpha,
                               learning_rate=learning_rate,
                               eta0=eta0)

    sgd = SGDClassifier(loss="hinge", penalty="l2", max_iter=100,
                        random_state=random_state)
    random = RandomizedSearchCV(estimator=sgd,
                                param_distributions=param_distributions,
                                scoring='balanced_accuracy',
                                verbose=1, n_jobs=-1,
                                n_iter=n_iter, random_state=random_state,
                                error_score='raise')

    random_result = random.fit(X_train, y_train)
    print('Best Score: ', random_result.best_score_)
    print('Best Params: ', random_result.best_params_)

    # Returning the parameters lets a caller feed them straight into a model.
    return random_result.best_params_

# PREDICT THE TEST SET

# Hyperparameters of the final classifier.
sgd_params = dict(
    loss="modified_huber",
    penalty="l2",
    learning_rate='invscaling',
    eta0=1,
    alpha=0.001,
    max_iter=100,
    random_state=0,
)
model = SGDClassifier(**sgd_params)
model.fit(X_train, y_train)

# Accuracy measured on the training data itself, not a generalisation estimate.
train_accuracy = model.score(X_train, y_train)
print(train_accuracy)

y_pred = model.predict(X_test)

# Number of test samples predicted in each class: 1 first, then -1.
for target_class in (1, -1):
    print((y_pred == target_class).sum())

# Write one row per test sample: its identifier (unnamed column) and the
# predicted class.
df = pd.DataFrame({'': labels, 'Prediction': y_pred})
df.to_csv('prediction.csv', index=False)


# PERFORM BCR
# Stratified K-fold estimate of the balanced accuracy of the chosen model.
# Expects X_train as a 2-D array and y_train as a pandas Series.

n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True,
                     random_state=0)

acc = []  # balanced accuracy on each held-out fold
sco = []  # balanced accuracy on each training fold
for i, (train_index, test_index) in enumerate(kf.split(X_train, y_train), start=1):
    # Progress counter, 1-based so that it ends on "5/5".
    print(f"{i}/{n_splits} split", end='\r')

    # A dedicated name keeps the model fitted on the full training set
    # (`model`) from being overwritten by a fold model.
    fold_model = SGDClassifier(loss="modified_huber", penalty="l2", max_iter=100,
                               learning_rate='invscaling', eta0=1, alpha=0.001,
                               random_state=0)

    X_traink, X_testk = X_train[train_index, :], X_train[test_index, :]
    # The split yields positions, not index labels: y_train keeps the index
    # read from the CSV, so it must be indexed positionally with .iloc.
    y_traink, y_testk = y_train.iloc[train_index], y_train.iloc[test_index]

    fold_model.fit(X_traink, y_traink)

    predtrain = fold_model.predict(X_traink)
    pred = fold_model.predict(X_testk)

    sco.append(balanced_accuracy_score(y_traink, predtrain))
    ac = balanced_accuracy_score(y_testk, pred)

    acc.append(ac)

# Class counts of the predictions made on the last held-out fold only.
print((pred==1).sum())
print((pred==-1).sum())
print("Mean balanced accuracy: "+ str(round(np.mean(acc), 2)))
print("Mean balanced accuracy train: "+ str(round(np.mean(sco), 2)))
print("Std balanced accuracy: "+ str(round(np.std(acc), 2)))