# Notebook en français.

## <p style="font-family:newtimeroman; font-size:200%; text-align:center">Sommaire</p>

* [1. Téléchargement des données](#1)
* [2. Analyse du fichier](#2)
     * [2.1 Forme du fichier](#2.1)
     * [2.2 Déclaration des variables](#2.2)
* [3. Chiffres clés](#3)
    * [3.1 pandas_profiling](#3.1)
    * [3.2 Méthode classique](#3.2)
    * [3.3 Variables cibles](#3.3)
    * [3.4 Variables numériques](#3.4)
    * [3.5 Variables caractères](#3.5)
* [4. Modélisation](#4)
* [5. Optimisation](#5)
* [6. Fichier soumission](#6)

 
   


In [None]:
import numpy as np 
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier,GradientBoostingClassifier,StackingClassifier,VotingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeClassifier
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

<a id='1'></a>
# <p style="font-family:newtimeroman; font-size:150%; text-align:center">1. Téléchargement des données </p>

In [None]:
# Load the competition files. The train and test frames are indexed by their
# "id" column; the sample submission keeps its default integer index.
train = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/train.csv').set_index("id")
test = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/test.csv').set_index("id")
sub = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv')

<a id='2'></a>
## <p style=" font-family:newtimeroman; font-size:150%; text-align:center">2. Analyse du fichier</p>

<a id='2.1'></a>
## <p style=" font-family:newtimeroman; font-size:110%; text-align:center">2.1 Forme du fichier</p>

In [None]:
print(train.shape)
print(test.shape)

In [None]:
train.columns

In [None]:
train.dtypes.value_counts().plot.pie(title='Répartition des variables par type')

In [None]:
print('Nombre de valeures manquante Train {}'.format(train.isna().sum().sum()))
print('Nombre de valeures manquante Test  {}'.format(test.isna().sum().sum()))

In [None]:
train.head()

<a id='2.2'></a>
## <p style=" font-family:newtimeroman; font-size:110%; text-align:center">2.2 Déclaration des variables</p>

In [None]:
# Column groups reused by the encoding / modelling cells further down.
# Integer columns are selected together with the float ones: encodage() keeps
# only df[numeric_columns] plus the encoded columns, and preprocessing() then
# reads 'target' from that result, so 'target' has to be part of this group.
numeric_columns = train.select_dtypes(['float','int']).columns
# Object-dtype columns, handled as categorical features.
Cat_columns=train.select_dtypes('object').columns

<a id='3'></a>
## <p style=" font-family:newtimeroman; font-size:150%; text-align:center">3. Chiffres clés</p>

<a id='3.1'></a>
## <p style=" font-family:newtimeroman; font-size:110%; text-align:center">3.1 pandas_profiling</p>

In [None]:
import pandas_profiling as pp
pp.ProfileReport(train)

<a id='3.2'></a>
## <p style=" font-family:newtimeroman; font-size:110%; text-align:center">3.2 Méthode classique</p>

In [None]:
train.describe()

In [None]:
train.describe(include=['O'])

<a id='3.3'></a>
## <p style=" font-family:newtimeroman; font-size:110%; text-align:center">3.3 Variables cibles</p>

In [None]:
train['target'].value_counts(normalize=True)*100 

In [None]:
sns.countplot(x="target", data=train,
                   facecolor=(0, 0, 0, 0),
                   linewidth=5,
                   edgecolor=sns.color_palette("dark", 3))

La variable cible `target` vaut 1 dans environ 26 % des cas.

<a id='3.4'></a>
## <p style=" font-family:newtimeroman; font-size:110%; text-align:center">3.4 Variables numériques</p>

In [None]:
train.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
positive_train = train[train['target'] == 1]
negative_train = train[train['target'] == 0]

In [None]:
# For every float column, overlay the distribution of the rows with
# target == 1 ("positive") and target == 0 ("negative"), one small figure
# per column.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases
# (histplot / kdeplot are the documented replacements) — confirm against the
# installed seaborn version before upgrading.
for col in train.select_dtypes('float'):
    plt.figure(figsize=(3,3))
    sns.distplot(positive_train[col], label='positive')
    sns.distplot(negative_train[col], label='negative')
    plt.legend()

In [None]:
for col in train.select_dtypes('float') :
    Chiffre = train.groupby('target').agg({
        col : ['median']
    })
    print( f'{col :-<5} {Chiffre} ')


In [None]:
def cp(n, b=220):
    """Return a seaborn diverging palette of ``n`` colours between hues 1 and ``b``."""
    return sns.diverging_palette(1, b, n=n)


from pylab import rcParams
rcParams['figure.figsize'] = (12,8)

# Correlation matrix of the numeric columns, computed once and reused for
# both the mask and the heatmap.
correlations = train[numeric_columns].corr()

# Hide the upper triangle (diagonal included): the matrix is symmetric, so
# only the lower half carries information.
upper_triangle = np.zeros_like(correlations)
upper_triangle[np.triu_indices_from(upper_triangle)] = True

sns.heatmap(correlations, cmap=cp(200), annot=True, mask=upper_triangle, center=0)

<a id='3.5'></a>
## <p style=" font-family:newtimeroman; font-size:110%; text-align:center">3.5 Variables caractères</p>

In [None]:
for col in train.select_dtypes('object'):
    print(f'{col :-<5} {train[col].nunique():-<5} {train[col].unique()}')

In [None]:
# Split the object-dtype columns by cardinality: those with at most five
# distinct values and those with more than five.
cardinality = train.select_dtypes('object').nunique()

Cinq_modalite_moins = [col for col, n_values in cardinality.items() if n_values <= 5]
Cinq_modalite_plus = [col for col, n_values in cardinality.items() if n_values > 5]

In [None]:
for col in Cinq_modalite_moins:
    plt.figure(figsize=(3,3))
    train[col].value_counts().plot.pie()

In [None]:
for col in Cat_columns:
    if set(train[col].unique()) != set(test[col].unique()):
        print(f"La liste des variables avec des modalitées différents entre le test et le train: {col}")

In [None]:
train_cat10 = set(train['cat10'].unique())
test_cat10 = set(test['cat10'].unique())

print(f'Modalités dans le train mais pas dans le test: {train_cat10.difference(test_cat10)}.')
print(f'Modalités dans le test mais pas dans le train: {test_cat10.difference(train_cat10)}.')

In [None]:
# Remove cat10 from both frames (the preceding cells compare its modalities
# between train and test).
# drop(..., errors='ignore') keeps this cell idempotent: running it again once
# the column is gone is a no-op, whereas `del df['cat10']` raises KeyError.
train = train.drop(columns='cat10', errors='ignore')
test = test.drop(columns='cat10', errors='ignore')

In [None]:
Cat_columns=train.select_dtypes('object').columns

In [None]:
for col in Cinq_modalite_moins:
    plt.figure(figsize=(3,3))
    sns.countplot(x=col, hue="target", data=train,palette="Set3")


<a id='4'></a>
## <p style=" font-family:newtimeroman; font-size:150%; text-align:center">4. Modélisation</p>

In [None]:
train_alea = train.sample(n=130000,random_state=0)

In [None]:
train.shape

In [None]:
train['target'].value_counts(normalize=True)*100 

In [None]:
trainset, testset = train_test_split(train_alea, test_size=0.2, random_state=0)

In [None]:
print(trainset['target'].value_counts())
print(testset['target'].value_counts())

In [None]:
for col in Cat_columns:
    if set(trainset[col].unique()) != set(testset[col].unique()):
        print(f"La liste des variables avec des modalitées différents entre le test et le train: {col}")

In [None]:
def imputation(df):
    """Return a cleaned copy of ``df``.

    Rows holding a missing value are removed first, then any column that
    still holds one, and finally duplicated rows (the first occurrence is
    kept). The frame passed in is not modified.
    """
    without_missing = df.dropna(axis=0).dropna(axis=1)
    return without_missing.drop_duplicates(keep='first')

In [None]:
def encodage(df):
    """One-hot encode the categorical columns of ``df`` and keep the numeric ones.

    Reads the module-level ``Cat_columns`` and ``numeric_columns``.

    Returns a new frame, indexed like ``df``, made of ``df[numeric_columns]``
    followed by one indicator column per (categorical column, category) pair,
    named ``x<i>_<category>``.

    The encoder is fitted on ``df`` itself, so two frames only end up with the
    same encoded columns when they contain the same categories.
    """
    ohe = OneHotEncoder()

    # One fit_transform is enough; the previous code fitted the encoder twice.
    # The values are passed as a plain array so that the generated names keep
    # the positional "x<i>_<category>" form on every scikit-learn version.
    encoded = ohe.fit_transform(df[Cat_columns].to_numpy())

    # Densify explicitly rather than through the `sparse=False` constructor
    # keyword, which was renamed to `sparse_output` and later removed.
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()

    # get_feature_names() was replaced by get_feature_names_out(); support both.
    if hasattr(ohe, "get_feature_names_out"):
        encoded_names = list(ohe.get_feature_names_out())
    else:
        encoded_names = list(ohe.get_feature_names())

    encoded_df = pd.DataFrame(encoded, columns=encoded_names, index=df.index)
    return pd.merge(df[numeric_columns], encoded_df,
                    left_index=True, right_index=True)

In [None]:
def preprocessing(df):
    """Encode and clean ``df``, then split it into features and target.

    ``df`` goes through ``encodage`` and then ``imputation``. The normalised
    class frequencies of the target are printed, and ``(X, y)`` is returned
    where ``y`` is the 'target' column and ``X`` is every other column.
    """
    prepared = imputation(encodage(df))

    target = prepared['target']
    features = prepared.drop('target', axis=1)

    print(target.value_counts(normalize=True))

    return features, target

In [None]:
X_train, y_train = preprocessing(trainset)

In [None]:
X_test, y_test = preprocessing(testset)

In [None]:
def evaluation(model):
    """Fit ``model`` on the training split and print its hold-out metrics.

    Uses the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Prints, in order, the confusion matrix, the classification report and the
    ROC AUC. Returns nothing.

    The ROC AUC is computed from continuous scores whenever the model exposes
    them (``predict_proba`` for the positive class, otherwise
    ``decision_function``). Hard 0/1 predictions are only the last resort,
    because an AUC computed on labels collapses the ROC curve to a single
    operating point.
    """
    model.fit(X_train, y_train)
    ypred = model.predict(X_test)

    if hasattr(model, 'predict_proba'):
        # Second column = probability of the positive class of a binary target.
        scores = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        scores = model.decision_function(X_test)
    else:
        scores = ypred

    print(confusion_matrix(y_test, ypred))
    print(classification_report(y_test, ypred))
    print(roc_auc_score(y_test, scores))

    # Optional learning-curve diagnostic, left disabled:
    #N, train_score, val_score = learning_curve(model, X_train, y_train,
    #                                          cv=2, scoring='accuracy',
    #                                           train_sizes=np.linspace(0.1, 1, 10))
    #plt.figure(figsize=(12, 8))
    #plt.plot(N, train_score.mean(axis=1), label='train score')
    #plt.plot(N, val_score.mean(axis=1), label='validation score')
    #plt.legend()

In [None]:
preprocessor = make_pipeline(SelectKBest(f_classif, k=10))

In [None]:
RandomForest = make_pipeline(RandomForestClassifier(random_state=0))
AdaBoost = make_pipeline( AdaBoostClassifier(random_state=0))
SVM = make_pipeline(preprocessor, StandardScaler(), SVC(random_state=0))
ridge = make_pipeline(RidgeClassifier(random_state=0))
Bagging = make_pipeline(BaggingClassifier(random_state=0))
Gradient= make_pipeline(GradientBoostingClassifier(random_state=0))

#VotingClassifier =make_pipeline(VotingClassifier(random_state=0))

In [None]:
dict_of_models = {'Bagging' : Bagging,
                  'Gradient' : Gradient,
                  'ridge' : ridge, 
                  'AdaBoost' : AdaBoost,
                  #'SVM': SVM,
                  'RandomForest': RandomForest
                  
                 }

In [None]:
for name, model in dict_of_models.items():
    print(name)
    evaluation(model)

In [None]:
estimators = [
('RandomForest' ,make_pipeline(RandomForestClassifier(random_state=0))),
('AdaBoost' ,make_pipeline( AdaBoostClassifier(random_state=0))),
('ridge' , make_pipeline(RidgeClassifier(random_state=0))),
('Gradient', make_pipeline(GradientBoostingClassifier(random_state=0)))
 ]

In [None]:
#StackingClassifier =make_pipeline(StackingClassifier(random_state=0))
clf = StackingClassifier(estimators=estimators)
clf.fit(X_train, y_train).score(X_test, y_test)

In [None]:
# predict() needs the feature matrix; calling it without arguments raises
# TypeError.
clf.predict(X_test)

In [None]:
# Score the stacking classifier on the hold-out split. The metric must be
# computed on the predictions made just above (y_pred); the name `ypred` is
# not defined at this point on a fresh kernel.
y_pred = clf.predict(X_test)

print(roc_auc_score(y_test, y_pred))


In [None]:
evaluation(clf)

<a id='5'></a>
## <p style=" font-family:newtimeroman; font-size:150%; text-align:center">5. Optimisation</p>

In [None]:
RandomForest.get_params().keys()

In [None]:
hyper_params = {'randomforestclassifier__n_estimators':[1, 5,100,20,30 ],
               'randomforestclassifier__max_depth' : [1,2,3,4,5],
               'randomforestclassifier__n_jobs' : [-1,1]}

In [None]:
# RandomizedSearchCV is imported in this cell because this is its first use;
# the notebook's other import of it sits in a later cell, so a fresh
# "Restart & Run All" would otherwise stop here with a NameError.
from sklearn.model_selection import RandomizedSearchCV

# random_state makes the sampled parameter combinations reproducible, in line
# with the random_state=0 used by the estimators of this notebook.
grid = RandomizedSearchCV(RandomForest, hyper_params, scoring='accuracy', cv=4,
                          n_iter=10, random_state=0)

grid.fit(X_train, y_train)


In [None]:
print(grid.best_params_)

y_pred = grid.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
evaluation(grid.best_estimator_)

In [None]:
AdaBoost = make_pipeline( AdaBoostClassifier(random_state=0,n_estimators=500,learning_rate=1.3))

In [None]:
evaluation(AdaBoost)

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
hyper_params = {'adaboostclassifier__n_estimators':[1, 50,100,150,500],
               'adaboostclassifier__learning_rate' : [1.1,1.2,1.3,1.4,1.5],
               'adaboostclassifier__algorithm' : ['SAMME','SAMME.R']}

In [None]:
AdaBoost.get_params().keys()

In [None]:
grid = RandomizedSearchCV(AdaBoost, hyper_params, scoring='recall', cv=4,
                          n_iter=10)

grid.fit(X_train, y_train)


In [None]:
RandomForest.get_params().keys()

In [None]:
hyper_params = {'adaboostclassifier__n_estimators':[1, 50,100,150,500],
               'adaboostclassifier__learning_rate' : [1.1,1.2,1.3,1.4,1.5],
               'adaboostclassifier__algorithm' : ['SAMME','SAMME.R']}

In [None]:
grid = RandomizedSearchCV(AdaBoost, hyper_params, scoring='recall', cv=4,
                          n_iter=10)

grid.fit(X_train, y_train)

In [None]:
print(grid.best_params_)

y_pred = grid.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
evaluation(grid.best_estimator_)

In [None]:
from sklearn.metrics import precision_recall_curve

In [None]:
precision, recall, threshold = precision_recall_curve(y_test, grid.best_estimator_.decision_function(X_test))

In [None]:
plt.plot(threshold, precision[:-1], label='precision')
plt.plot(threshold, recall[:-1], label='recall')
plt.legend()

In [None]:
def model_final(model, X, threshold=0):
    """Return ``model.decision_function(X) > threshold``.

    The comparison is strict: scores equal to ``threshold`` are not flagged.
    """
    scores = model.decision_function(X)
    return scores > threshold

In [None]:
y_pred = model_final(grid.best_estimator_, X_test, threshold=0.5)

In [None]:
f1_score(y_test, y_pred)

In [None]:
print(confusion_matrix(y_test, y_pred))

In [None]:
test = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/test.csv')

In [None]:
test.set_index('id',inplace=True)

In [None]:
numeric_columns = train.select_dtypes(['float']).columns

In [None]:
# Encode the test features the same way as the training features.
# imputation() is deliberately not applied here: it removes rows (missing
# values, duplicated feature vectors), and the submission needs exactly one
# prediction per test id.
test = encodage(test)

In [None]:
ypred = model_final(grid.best_estimator_, test, threshold=0.5)

In [None]:
ypred = pd.DataFrame(data=ypred, columns=['target2'])

In [None]:
ypred['target']= np.where(ypred['target2']==True,1,0)

In [None]:
test.reset_index(inplace=True)

In [None]:
test=pd.merge(test,ypred,how='left',left_index=True,right_index=True)

In [None]:
test['target'].value_counts(normalize=True)

In [None]:
sub = test[['id','target']]

In [None]:
sub.to_csv('submission.csv', index=False)

In [None]:
# Number of folds, and one seed per integer from 2000 to 2021 inclusive.
n_folds = 10
seed_list = list(range(2000, 2022))

In [None]:
import random
def set_seed(seed=42):
    """Seed the global random generators with ``seed``.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
set_seed(seed_list[0])

In [None]:
# Maps a model label to the .npy file holding that model's predictions on the
# training set; each file is loaded from TRAIN_PATH into the column of oof_df
# named after the label.
# NOTE(review): judging by the "oof" file names these are presumably
# out-of-fold predictions — confirm with whoever produced the files.
train_oof_dict = {
    'trans_1': 'train_rgr_epoch2000_probas8_params0_batch512.npy',
    'trans_2': 'train_rgr_epoch2000_probas8_params1_batch512.npy',
    'trans_3': 'train_rgr_epoch2000_probas8_params2_batch512.npy',
    'trans_4': 'train_rgr_epoch2000_probas8_params3_batch512.npy',
    'trans_5': 'train_rgr_epoch2000_probas8_params4_batch512.npy',
    'trans_6': 'train_rgr_epoch2000_probas8_params5_batch512.npy',
    'trans_7': 'train_rgr_epoch2000_probas8_params6_batch512.npy',
    'lightgbm1': 'train_lgb.npy',
    'lightgbm2': 'train_oof_lgbm_0.npy',
    'lightgbm3': 'train_oof_lgbm_1.npy',
    'xgboost': 'train_xgb.npy',
    'catboost': 'train_cbt.npy',
    'logistic_regression1': 'train_lr.npy',
    'logistic_regression2': 'train_oof_lr_0.npy',
    'random_forest': 'train_rf.npy',
    'tabnet1': 'train_tabnet_0.npy',
    'tabnet2': 'train_tabnet_1.npy',
    'histgradient1': 'train_oof_hgb_0.npy',
    'histgradient2': 'train_oof_hgb_1.npy',
    'keras1': 'train_keras_0.npy',
    'keras2': 'train_keras_1.npy'
}

# Same labels as train_oof_dict, pointing at the matching test-set prediction
# files; each file is loaded from TEST_PATH into the column of preds_df named
# after the label.
test_pred_dict = {
    'trans_1': 'test_rgr_epoch2000_probas8_params0_batch512.npy',
    'trans_2': 'test_rgr_epoch2000_probas8_params1_batch512.npy',
    'trans_3': 'test_rgr_epoch2000_probas8_params2_batch512.npy',
    'trans_4': 'test_rgr_epoch2000_probas8_params3_batch512.npy',
    'trans_5': 'test_rgr_epoch2000_probas8_params4_batch512.npy',
    'trans_6': 'test_rgr_epoch2000_probas8_params5_batch512.npy',
    'trans_7': 'test_rgr_epoch2000_probas8_params6_batch512.npy',
    'lightgbm1': 'test_lgb.npy',
    'lightgbm2': 'test_preds_lgbm_0.npy',
    'lightgbm3': 'test_preds_lgbm_1.npy',
    'xgboost': 'test_xgb.npy',
    'catboost': 'test_cbt.npy',
    'logistic_regression1': 'test_lr.npy',
    'logistic_regression2': 'test_preds_lr_0.npy',
    'random_forest': 'test_rf.npy',
    'tabnet1': 'test_tabnet_0.npy',
    'tabnet2': 'test_tabnet_1.npy',
    'histgradient1': 'test_preds_hgb_0.npy',
    'histgradient2': 'test_preds_hgb_1.npy',
    'keras1': 'test_keras_0.npy',
    'keras2': 'test_keras_1.npy'
}

In [None]:
from pathlib import Path
# Directory the competition CSV files (train, test, sample submission) are
# read from.
INPUT_PATH = Path("../input/tabular-playground-series-mar-2021")

# Directories the prediction .npy files are loaded from.
# NOTE(review): TRAIN_PATH is relative to the current working directory while
# TEST_PATH points into an input dataset — confirm this asymmetry is intended.
TRAIN_PATH = Path("train")
TEST_PATH = Path("../input/tps-mar-2021-preprocessed-data/preprocessed-data/test")


In [None]:
TRAIN_PATH

In [None]:
train_df = pd.read_csv(INPUT_PATH / "train.csv")
test_df = pd.read_csv(INPUT_PATH / "test.csv")
sub_df = pd.read_csv(INPUT_PATH / 'sample_submission.csv')

In [None]:
def _load_predictions(directory, files):
    """Load the ``.npy`` files listed in ``files`` into one DataFrame.

    ``files`` maps a column name to a file name located in ``directory``.
    Returns a frame with one column per entry, in mapping order; an empty
    mapping gives an empty DataFrame.
    """
    columns = [pd.Series(np.load(directory / file_name), name=name)
               for name, file_name in files.items()]
    # A single concat at the end, instead of re-copying a growing frame on
    # every iteration of a loop.
    return pd.concat(columns, axis=1) if columns else pd.DataFrame()


oof_df = _load_predictions(TRAIN_PATH, train_oof_dict)
preds_df = _load_predictions(TEST_PATH, test_pred_dict)