In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Data

In [None]:
# Load the prostate-cancer dataset from the Kaggle input directory.
DATA_PATH = '../input/prostate-cancer/Prostate_Cancer.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Display the frame exactly as it was loaded from the CSV.
df

In [None]:
# List the distinct labels present in the target column.
df['diagnosis_result'].unique()

## Encodage de la target

In [None]:
def encodage(df):
    """Encode every object-dtype column of ``df`` in place and return ``df``.

    Each object column is replaced by its values mapped through
    ``{'M': 0, 'B': 1}``; values absent from that mapping become NaN
    (behaviour of ``Series.map``). Non-object columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    mapping = {'M': 0, 'B': 1}
    # Snapshot the column names first so the frame is not inspected while it is rewritten.
    object_columns = list(df.select_dtypes('object').columns)
    for name in object_columns:
        df[name] = df[name].map(mapping)
    return df

In [None]:
# Encodes df in place (object columns mapped M -> 0, B -> 1); the returned frame is displayed.
encodage(df)

In [None]:
#df.shape

### On drop la colonne id qui n'est pas nécessaire

In [None]:
# Remove the identifier column, which carries no predictive information.
# errors='ignore' keeps the cell idempotent: re-running it after 'id' is already
# gone no longer raises a KeyError.
df = df.drop(columns=['id'], errors='ignore')

## Heatmap corrélation

In [None]:
# Clustered heatmap of the pairwise column correlations, annotated with the coefficients.
correlations = df.corr()
sns.clustermap(correlations, annot=True)

# Model KNN


## Sans sélection de feature

### On définit les features et la cible

In [None]:
# Target: the encoded diagnosis column.
y = df['diagnosis_result']
# Features: every column except the target (diagnosis_result).
X = df.drop(columns=['diagnosis_result'])

### On sépare les données d'entrainement et de test

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

for label, part in (('Train set:', X_train), ('Test set:', X_test)):
    print(label, part.shape)

### On normalise nos données (mise à l'échelle min-max)

In [None]:
# Min-max scale the features: the ranges are learned on the training split only
# and then re-applied to the test split, so no test information leaks into the fit.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### On fait un grid search

In [None]:
# Candidate hyperparameters explored by the exhaustive search.
parameters = {
    'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
    'weights': ('uniform', 'distance'),
    'metric': ('minkowski', 'euclidean', 'manhattan'),
}
# The number of neighbours is not searched; the disabled alternative was:
#param_grid = {'n_neighbors': np.arange(1, 25)}

# 5-fold cross-validated grid search over a default KNN classifier.
grid = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=parameters, cv=5)
grid.fit(X_train, y_train)

### Modèle KNN avec optimisation des hyperparamètres

In [None]:
# Display the best estimator selected by the grid search.
grid.best_estimator_

In [None]:
# Build the classifier from the hyperparameters actually selected by the grid search
# rather than from a hand-copied snapshot, which silently goes stale whenever the
# grid or the data changes.
# n_neighbors, leaf_size and p are not part of the grid, so the values previously
# spelled out are kept; any key the grid does provide overrides them.
knn_params = {'n_neighbors': 5, 'leaf_size': 30, 'p': 2, **grid.best_params_}
model = KNeighborsClassifier(**knn_params)

In [None]:
# Fit on the scaled training split, then report accuracy on both splits.
model.fit(X_train, y_train)
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print('train score:', train_accuracy)
print('test score:', test_accuracy)

In [None]:
# Per-class precision / recall / F1 on the held-out split.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
from sklearn.pipeline import make_pipeline

# KNN is distance based, so cross-validating on the raw, unscaled X is not comparable
# with the model fitted above on min-max-scaled data. Wrapping the scaler and the
# classifier in a pipeline re-fits the scaler inside every fold (no leakage) and
# evaluates the same preprocessing + model combination.
# cross_val_score clones the estimator, so `model` itself is left untouched.
cv_pipeline = make_pipeline(MinMaxScaler(), model)
cv_scores = cross_val_score(cv_pipeline, X, y, cv=5)

# Print each fold's accuracy, then their average (a bare expression in the middle
# of a cell is not displayed).
print('cv_scores:', cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

## Sélection de features

### On ajuste une régression logistique avec statsmodels

In [None]:
import statsmodels.api as sm 

In [None]:
# Fit a logistic regression by maximum likelihood: find the coefficients that
# maximise the likelihood of the diagnosis given three of the features.
import statsmodels.formula.api as smf

logit_formula = "diagnosis_result ~ perimeter + area + compactness"
result = smf.logit(logit_formula, data=df).fit()
result.summary()

### On check la colinéarité

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor    

def calculate_vif_(X, thresh=100):
    """Iteratively drop the column with the highest variance inflation factor.

    At each pass the VIF of every remaining column is computed; if the largest
    one is strictly greater than ``thresh`` that column is removed and the
    process repeats. It stops when no VIF exceeds ``thresh`` or when no column
    is left (previously the latter case crashed on ``max()`` of an empty list).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; it is not modified.
    thresh : float, default 100
        VIF value above which a column is dropped.

    Returns
    -------
    pandas.DataFrame
        ``X`` restricted to the columns that were kept (possibly none).
    """
    cols = X.columns
    variables = np.arange(X.shape[1])

    # Guard on the remaining column count: with nothing left there is no VIF to compare.
    while len(variables) > 0:
        c = X[cols[variables]].values
        vif = [variance_inflation_factor(c, ix) for ix in range(c.shape[1])]

        max_vif = max(vif)
        # Written as "not >" so a NaN maximum stops the loop, as the original comparison did.
        if not max_vif > thresh:
            break

        maxloc = vif.index(max_vif)
        # f-string formatting also works when column labels are not strings.
        print(f"Supprime cette feature '{cols[variables[maxloc]]}' at index: {maxloc}")
        variables = np.delete(variables, maxloc)

    print('Garde ces variables:')
    print(X.columns[variables])
    return X[cols[variables]]

In [None]:
# Keep only the features whose VIF does not exceed the threshold.
# (The 'id' column was already removed from df earlier, so no extra drop is needed.)
VIF_THRESHOLD = 100
df_final = calculate_vif_(X, thresh=VIF_THRESHOLD)

### On redéfinit notre X avec les features sélectionnées

In [None]:
# Target: the encoded diagnosis column (same as before).
y = df['diagnosis_result']
# Features: only the columns retained by the VIF-based selection.
X_final = df_final

In [None]:
# Same 70/30 split and seed as for the full feature set, applied to the selected features.
X_train_final, X_test_final, y_train, y_test = train_test_split(
    X_final, y, test_size=0.3, random_state=0
)

for label, part in (('Train set:', X_train_final), ('Test set:', X_test_final)):
    print(label, part.shape)

In [None]:
# Min-max scale the selected features: ranges learned on the training split only,
# then re-applied to the test split.
scaler = MinMaxScaler()
scaler.fit(X_train_final)
X_train_final = scaler.transform(X_train_final)
X_test_final = scaler.transform(X_test_final)

In [None]:
# Reuse the hyperparameters of the first model instead of a second hand-copied block,
# so both experiments (all features vs. selected features) always compare the same
# classifier configuration. get_params() returns constructor arguments only, so this
# is a fresh, unfitted estimator.
model_final = KNeighborsClassifier(**model.get_params())

In [None]:
# Fit on the scaled, feature-selected training split, then report accuracy on both splits.
model_final.fit(X_train_final, y_train)
train_accuracy_final = model_final.score(X_train_final, y_train)
test_accuracy_final = model_final.score(X_test_final, y_test)
print('train score:', train_accuracy_final)
print('test score:', test_accuracy_final)

In [None]:
# Per-class precision / recall / F1 of the feature-selected model on the held-out split.
y_pred_final = model_final.predict(X_test_final)
report_final = classification_report(y_test, y_pred_final)
print(report_final)

In [None]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer ConfusionMatrixDisplay.from_estimator when the installed version provides it,
# and fall back to the legacy helper on older releases.
# NOTE(review): the top-of-notebook `from sklearn.metrics import plot_confusion_matrix`
# still has to be removed for the notebook to import on scikit-learn >= 1.2.
from sklearn.metrics import ConfusionMatrixDisplay

if hasattr(ConfusionMatrixDisplay, 'from_estimator'):
    ConfusionMatrixDisplay.from_estimator(model_final, X_test_final, y_test)
else:
    plot_confusion_matrix(model_final, X_test_final, y_test)