In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load the raw patient records from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/covid19-patient-precondition-dataset/covid.csv')
# Show every column when a frame is displayed. The fully-qualified option name
# is required: the bare 'max_columns' pattern matches more than one option in
# recent pandas releases and raises an OptionError.
pd.set_option('display.max_columns', None)

In [None]:
# Work on a copy so the raw frame `data` stays available for a later restart.
df = data.copy()
# Preview the first five rows.
display(df.head(5))

**Notes on the dataset:**
* The categorical variables are already encoded.
* Column 'sex': 1 for **female** and 2 for **male**.
* Column 'patient_type': 1 for **Outpatient** and 2 for **Inpatient**.
* Column 'date_died': there is no precise indication for the value '9999-99-99'. We can probably assume that the patient is still alive for this value.
* Remaining columns: 1 for **yes**, 2 for **no**, and 97, 98, 99 for not applicable or missing data.

# Data exploration: getting as much information as possible

* Observations: 566602
* 23 columns with:
    * id column
    * Features: 21
    * Target: 'covid_res'
* Variable types: 19 of type integer, 4 of type object
* Type: Classification.

In [None]:
# Shape of the dataset
print("Shape of dataframe:", df.shape)
print('_'*50)
# Number of columns per dtype: printed as a table and drawn as a pie chart.
# The plot call is kept out of print() so the Axes repr is not written to the output.
dtype_counts = df.dtypes.value_counts()
print("\nVariable types:\n", dtype_counts)
dtype_counts.plot.pie()
plt.draw()

In [None]:
# Column names, non-null counts and dtypes.
print("\nData info:")
# DataFrame.info() writes its report itself and returns None, so it is not
# wrapped in print() (that would append a stray "None" line).
df.info()

In [None]:
# Number of NaN entries in each column.
df.isnull().sum()

**Visualisation of the data**
* Here we treat 97, 98, 99 and '9999-99-99' as not applicable or missing values, to get a better overview.
* Note that in the 'age' column, the values 97, 98 and 99 are simply the age of the patient.

In [None]:
plt.figure(figsize = (16, 9))
# Flag the placeholder codes in every column. 'age' is cleared afterwards
# because 97, 98 and 99 are genuine ages there, not missing-value codes.
placeholder_mask = df.isin([97, 98, 99, '9999-99-99'])
placeholder_mask['age'] = False
sns.heatmap(placeholder_mask, cbar = False)
plt.title('Visualisation of 97, 98, 99, and \'9999-99-99\' values', size=20)
plt.draw()

In [None]:
# Share of placeholder codes per column, in percent, largest first.
# 'age' is cleared because 97, 98 and 99 are genuine ages there.
placeholder_mask = df.isin([97, 98, 99, '9999-99-99'])
placeholder_mask['age'] = False
print("Percentages:\n")
print((placeholder_mask.sum() / df.shape[0] * 100).round(3).sort_values(ascending = False))

**In summary**:
* Presence of NaN: False.
* Column **'date_died'**: 93.6 % of the values are '9999-99-99'. We should really think about transforming this column: for '9999-99-99' the patient is still alive; otherwise, unfortunately, the patient has died.
* Columns **'intubed'** and **'icu'** (Intensive Care Unit) reach 78.5%! The missing values also seem to be at the same places on the heatmap. So, is an intubated patient necessarily in intensive care? We should use a crosstab to verify.
* **'pregnancy'** reaches 51%! Interesting. We should check by gender.
* **'contact_other_covid'** has 30% of missing or not applicable values. The rest seems to be fine.

**Let's start with categorical comparison: crosstab**

In [None]:
# Contingency table: intubation status (rows) against intensive-care status (columns).
pd.crosstab(index=df['intubed'], columns=df['icu'])

* We can see that **'icu'** and **'intubed'** have the same number of values encoded as 97 and almost the same number for 99. This explains why the patterns of the **'icu'** and **'intubed'** columns are almost identical on the heatmap.
* We can also see that an intubated patient is not necessarily in intensive care.

In [None]:
# Same table, broken down further by patient type (rows).
pd.crosstab(index=df['patient_type'], columns=[df['icu'], df['intubed']])

* Note that for **'icu'** and **'intubed'** the 97 values are all encoded for 'patient_type = 1', i.e. not hospitalized. Interesting: 97 is probably used for not-applicable values rather than for missing values. We could replace these values by 2 (no) during preprocessing and model development, but this would strongly bias the dataset.

In [None]:
# Same breakdown expressed as a percentage of all rows.
patient_type_share = pd.crosstab(df['patient_type'], [df['icu'], df['intubed']], normalize = True)
(patient_type_share * 100).round(3)

* 78.48% (the 444689) are not hospitalized.
* Also, 0.02% are actually hospitalized but with missing value for intubed and icu.
* 21.52% of patients in the data are hospitalized and 87.54% of them are neither intubated nor in intensive care.

In [None]:
# Test result against contact with another covid case, as a percentage of all rows.
contact_share = pd.crosstab(df['covid_res'], df['contact_other_covid'], normalize = True)
(contact_share * 100).round(3)

* 21.65% of people have been in contact with a covid case and tested negative, against 13% who tested positive!

In [None]:
# Columns 'sex' and 'pregnancy': share of women (code 1) and men (code 2).
sex_total = df['sex'].count()
print('percentage of female:', round((df['sex'] == 1).sum() / sex_total * 100, 3), '%')
print('percentage of male:', round((df['sex'] == 2).sum() / sex_total * 100, 3), '%')

* Good for us, the data is well balanced regarding the gender.

In [None]:
# Pregnancy codes (columns) per sex (rows).
pd.crosstab(index=df['sex'], columns=df['pregnancy'])

* For **'pregnancy'** the value 97 is used for males. So this is not a missing value but actually **not applicable**.
* During preprocessing and model development we could replace this value by no (2). However, this will bias the data towards not being pregnant.

In [None]:
# Real percentage of missing values (code 98) in 'pregnancy', computed among women only.
is_female = df.sex == 1
n_missing = int((is_female & (df.pregnancy == 98)).sum())
n_female = int(is_female.sum())
real_pregnancy_null = round(n_missing / n_female * 100, 3)
print('percentage of missing values in pregnancy:' , real_pregnancy_null, '%')

In [None]:
# Summary of the target: distinct classes and their relative frequencies.
target_values = df['covid_res']
print('Target classes: covid_res\n', target_values.unique())
print("-"*50)
display(target_values.value_counts(normalize = True).to_frame())

**Notes**:
* covid_res has 3 classes: 1, 2, 3
* 49% tested negative for covid-19 against 39% positive. The classes are relatively well balanced.
* 12% are waiting for results.

In [None]:
# Descriptive statistics of every variable except the target.
df.drop(columns=['covid_res']).describe()

* **Note**: Except for age, these values have no particular meaning.
* 50% of the patients are between 31 and 53 years old.
* Note **max_age** = 120 years!

# Visualization of features

In [None]:
# In these panels the missing / not-applicable codes are hidden (they become NaN).
df_cleaned = df[~df.isin([97, 98, 99])]

# Names of the non-object columns to plot, without the target and the age.
# `axis` is passed by keyword: the positional form drop(labels, 1) is rejected
# by pandas 2.0 and later.
feature_cols = df_cleaned.drop(['covid_res', 'age'], axis=1).select_dtypes(exclude = 'object').columns

plt.figure(figsize=(20,20))
for position, col in enumerate(feature_cols, start=1):
    plt.subplot(6, 3, position)
    sns.countplot(data=df_cleaned, x=col, hue='covid_res')
# Lay the grid out once, after every panel has been drawn.
plt.tight_layout()

**Note**
* The majority of answers are negative, especially for 'pneumonia', 'diabetes', 'copd', 'asthma', 'inmsupr', 'hypertension', 'other_disease', 'cardiovascular', 'obesity', 'renal_chronic' and 'tobacco'.
* We will make two subsets: **healthy** for patients who are negative for all diseases (other than covid-19), and **sick** for patients who have at least one disease.

In [None]:
# A patient is flagged healthy when every listed condition is answered "no" (code 2).
disease_cols = ['pneumonia', 'diabetes', 'copd', 'asthma', 'inmsupr',
                'hypertension', 'cardiovascular', 'obesity', 'renal_chronic']
df['healthy'] = (df[disease_cols] == 2).all(axis=1)

display(df.head())
healthy_counts = df['healthy'].value_counts()
display(healthy_counts)
print('Percentage:\n',round(healthy_counts / df['healthy'].count(), 3))

* **Note**:
    * 57.2% of patients do not present any of the diseases listed above.
    * This may cause some trouble for our model, especially if we want it to be based on the different clinical signs.

**Subsets healthy and sick**

In [None]:
# Split the rows on the boolean 'healthy' flag.
is_healthy = df['healthy']
healthy = df.loc[is_healthy]
sick    = df.loc[~is_healthy]

In [None]:
# Size of the healthy subset and distribution of the test result inside it.
print('shape de healthy:', healthy.shape)
healthy.loc[:, 'covid_res'].value_counts(normalize = True)

* **Note**:
    * 55.2% of the healthy patients tested negative for covid and 12% are waiting for results.

In [None]:
# Distribution of the test result inside the sick subset (pending results included).
sick.loc[:, 'covid_res'].value_counts(normalize = True)

In [None]:
# Size of the sick subset, then the result distribution without pending tests (covid_res = 3).
print('shape de sick:', sick.shape)
sick.loc[sick['covid_res'] != 3, 'covid_res'].value_counts(normalize = True)

* **Note**: here the patients have at least one disease and we do not consider those who are waiting for the covid result.
    * 53.4% of patients are positive for covid against 46.6% negative. This subset is relatively well balanced.
    * We should consider the sick data for our model.

**Serie Age**

In [None]:
# Subsets by sex (1 = female, 2 = male) ...
female = df.loc[df['sex'] == 1]
male   = df.loc[df['sex'] == 2]
# ... and, inside each, by test result (1 = positive, 2 = negative).
female_positif = female.loc[female['covid_res'] == 1]
female_negatif = female.loc[female['covid_res'] == 2]
male_positif = male.loc[male['covid_res'] == 1]
male_negatif = male.loc[male['covid_res'] == 2]

In [None]:
# Age distributions on a 3x2 grid.
plt.figure(figsize = (20,20))

# Panels 1-2: histogram + KDE, by sex and by test result.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept here unchanged.
hist_panels = [
    (1, [(female['age'], 'female', 'orange'),
         (male['age'], '  male', 'blue')]),
    (2, [(df[df['covid_res']==1]['age'], 'positif covid', 'red'),
         (df[df['covid_res']==2]['age'], 'negatif covid', 'green')]),
]
for position, curves in hist_panels:
    plt.subplot(3, 2, position)
    for ages, label, color in curves:
        sns.distplot(ages, kde = True, label = label, color = color)
    plt.legend()

# Panels 3-6: KDE only, crossing sex and test result.
kde_panels = [
    (3, [(female_positif, 'female: positif covid', 'red'),
         (female_negatif, 'female: negatif covid', 'green')]),
    (4, [(male_positif, 'male: positif covid', 'red'),
         (male_negatif, 'male: negatif covid', 'green')]),
    (5, [(female_positif, 'positif covid: female', 'orange'),
         (male_positif, 'positif covid: male', 'blue')]),
    (6, [(female_negatif, 'negatif covid: female', 'orange'),
         (male_negatif, 'negatif covid: male', 'blue')]),
]
for position, curves in kde_panels:
    plt.subplot(3, 2, position)
    for subset, label, color in curves:
        sns.kdeplot(subset['age'], label = label, color = color)
    plt.legend()

plt.tight_layout()

* Almost the same distribution for both genders.
* The majority of patients are between 20 and 80 years old.
* A slight peak for low age values (children) with a positive covid test.

**Transform date columns to date format and numeric**

In [None]:
# Parse each date column, then turn it into a number so it can enter the correlation matrix.
date_columns=['entry_date','date_symptoms', 'date_died']
for col in date_columns:
    # errors='coerce': entries that cannot be parsed as a date become NaT.
    # NOTE(review): no explicit format/dayfirst is given; confirm how the day/month order of the raw strings is interpreted.
    df[col] = pd.to_datetime(df[col], errors='coerce')
    # NOTE(review): presumably yields the integer timestamp representation; check which number NaT is mapped to.
    df[col] = pd.to_numeric(df[col],errors='coerce')


**correlation matrix**

In [None]:
plt.figure(figsize=(14,10))
sns.set(style = "white", font_scale = 1)
# Restrict the correlation to numeric and boolean columns explicitly: pandas 2.0
# and later no longer drop object columns silently and raise on them instead.
numeric_df = df.select_dtypes(include = ['number', 'bool'])
sns.heatmap(data = numeric_df.corr().round(2), cmap = "coolwarm", annot=True, annot_kws = {"size":12})
plt.tight_layout()
plt.show()

**Note:**
* There seems to be no correlation between the features and our target 'covid_res'.
* The clinical signs are highly correlated with each other and with the tobacco variable.
* Some correlations are equal to 1!


# Definition of a class for preprocessing

In [None]:
class preprocessing:
    """Step-by-step cleaning of the covid patient dataframe.

    Every method works on ``self.dataframe``, stores the result back on the
    instance and also returns it, so the notebook can write ``df = pr.step()``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to preprocess. ``remove_serie`` drops columns from this very
        object, so pass a copy if the original must be kept.
    """

    # Placeholder written over the 97/98/99 codes by ``replace_NaN_values``;
    # ``imputting_strategy`` treats this exact value as "missing".
    MISSING_SENTINEL = 979899

    # Conditions inspected by ``sick`` (the code 2 means "no").
    DISEASE_COLUMNS = ['pneumonia', 'diabetes', 'copd', 'asthma', 'inmsupr',
                       'hypertension', 'cardiovascular', 'obesity', 'renal_chronic']

    def __init__(self, dataframe):
        self.dataframe = dataframe

    def remove_serie(self, serie):
        """Drop the column named ``serie`` (in place) and return the frame."""
        self.dataframe.drop([serie], axis = 1, inplace = True)
        return self.dataframe

    def dates_to_numeric(self, list_dates):
        """Convert every column of ``list_dates`` to datetime, then to a number.

        Values that cannot be parsed as a date are coerced (``errors='coerce'``).
        """
        for col in list_dates:
            self.dataframe[col] = pd.to_datetime(self.dataframe[col], errors='coerce')
            self.dataframe[col] = pd.to_numeric( self.dataframe[col], errors='coerce')
        return self.dataframe

    def encode_date_died(self):
        """Encode 'date_died' as 1 (a date is present) or 2 (the '9999-99-99' value).

        A warning is emitted when no '9999-99-99' is found, which typically
        means 'date_died' was already converted by ``dates_to_numeric``; every
        row is then encoded as 1.
        """
        import warnings

        alive = self.dataframe['date_died'].isin(['9999-99-99'])
        if not alive.any():
            # Warn instead of blocking on input(): a prompt stops a
            # non-interactive "Run All" execution.
            warnings.warn("9999-99-99 not found, you should remove date_died serie "
                          "from the list in dates_to_numeric")

        # Only the 'date_died' column is rewritten; other columns are left as they are.
        encoded = self.dataframe.copy()
        # True (placeholder found) -> 2, False (a real date) -> 1
        encoded['date_died'] = alive.astype('int64') + 1
        self.dataframe = encoded
        return self.dataframe

    def ignore_covid_waiting(self):
        """Remove the rows whose result is pending (covid_res = 3) and show the class counts."""
        # .copy() detaches the filtered rows so later column assignments do not
        # trigger SettingWithCopyWarning.
        self.dataframe = self.dataframe[self.dataframe['covid_res'] != 3].copy()
        display(self.dataframe['covid_res'].value_counts().to_frame('covid_res: number in each category'))
        return self.dataframe

    def sick(self):
        """Keep only the rows where at least one listed condition is not coded 2 ("no")."""
        no_disease = (self.dataframe[self.DISEASE_COLUMNS] == 2).all(axis = 1)
        self.dataframe = self.dataframe[~no_disease].copy()
        return self.dataframe

    def replace_NaN_values(self, list_values = None):
        """Overwrite the missing-value codes with ``MISSING_SENTINEL``.

        Parameters
        ----------
        list_values : list of int, optional
            Codes to overwrite; defaults to ``[97, 98, 99]``.

        'age' is skipped (97-99 are real ages there) and in 'pregnancy' only 98
        is overwritten, so the 97 code is kept in that column.
        """
        codes = [97, 98, 99] if list_values is None else list(list_values)
        cleaned = self.dataframe.copy()
        for col in cleaned.columns:
            if col == 'age':
                continue
            to_replace = [98] if col == 'pregnancy' else codes
            cleaned[col] = cleaned[col].replace(to_replace, self.MISSING_SENTINEL)
        self.dataframe = cleaned
        return self.dataframe

    def edit_97_pregnancy(self, value = 2):
        """Replace the 97 code of the 'pregnancy' column by ``value``."""
        edited = self.dataframe.copy()
        edited['pregnancy'] = edited['pregnancy'].replace(97, value)
        self.dataframe = edited
        return self.dataframe

    def imputting_strategy(self, strategy = None):
        """Impute every ``MISSING_SENTINEL`` entry with scikit-learn's SimpleImputer.

        Parameters
        ----------
        strategy : str, optional
            SimpleImputer strategy; defaults to 'most_frequent'.

        Returns
        -------
        pandas.DataFrame
            A new int64 frame with the same columns and a fresh default index.

        Raises
        ------
        ValueError
            If the imputer returns a different number of columns than it was given.
        """
        from sklearn.impute import SimpleImputer
        if strategy is None:
            strategy = 'most_frequent'
        print('Strategy pour imputtig =', strategy)

        imp = SimpleImputer(missing_values = self.MISSING_SENTINEL, strategy = strategy)
        imputed = imp.fit_transform(self.dataframe)
        if imputed.shape[1] != len(self.dataframe.columns):
            raise ValueError('The imputer returned %d columns for %d input columns; '
                             'check for columns that contain only missing values.'
                             % (imputed.shape[1], len(self.dataframe.columns)))
        self.dataframe = pd.DataFrame(imputed.astype('int64'), columns = self.dataframe.columns)
        return self.dataframe
    

## function to evaluate the model

In [None]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve

def evaluation(model, X_train, X_test, y_train, y_test):
    """Fit ``model``, report its test performance and plot its learning curve.

    Parameters
    ----------
    model : scikit-learn estimator
        Fitted in place on the training data.
    X_train, X_test : array-like
        Training and test features.
    y_train, y_test : array-like
        Training and test labels.

    Prints the confusion matrix, the classification report and the weighted F1
    score on the test set, then draws the mean train / validation score of a
    10-fold cross-validated learning curve computed on the training data.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print('Weighted F1 on the test set:', round(f1_score(y_test, y_pred, average = 'weighted'), 3))

    # Learning curve. 'f1_weighted' matches the weighted F1 printed above and,
    # unlike the binary-only 'f1' scorer, also works when the target keeps more
    # than two classes.
    N, train_score, val_score = learning_curve(model, X_train, y_train, cv = 10,
                                               train_sizes = np.linspace(0.1, 1, 10),
                                               scoring = 'f1_weighted'
                                              )
    plt.figure(figsize=(12,8))
    plt.plot(N, train_score.mean(axis = 1), label = 'train_score')
    plt.plot(N, val_score.mean(axis = 1), label = 'validation_score')
    plt.xlabel('Number of training samples')
    plt.ylabel('Weighted F1')
    plt.legend()

**Preprocessing**
* **comment and/or uncomment the different methods in order to get the different iterations**

In [None]:
# Restart from the untouched raw data
df = data.copy()
# Create the preprocessing helper around this copy
pr = preprocessing(df)

# Clean data: drop the id column, keep only the patients with at least one
# listed disease, and discard the rows whose covid result is pending (covid_res = 3)
df = pr.remove_serie('id')
df = pr.sick()
df = pr.ignore_covid_waiting()

# Convert the date columns to numbers
df = pr.dates_to_numeric(['entry_date', 'date_symptoms', 'date_died'])

# # To encode date_died instead, use the two lines below and comment out the previous one
# df = pr.dates_to_numeric(['entry_date', 'date_symptoms'])
# df = pr.encode_date_died()

# Replace the missing-value codes [97, 98, 99] by one common extreme value.
# 'age' is left untouched, and in 'pregnancy' only 98 is replaced.
df = pr.replace_NaN_values(list_values = [97, 98,99])
# Optionally also overwrite the 97 code of the pregnancy column
# df = pr.edit_97_pregnancy(979899)

# Impute the extreme value with the most frequent value of each column
df = pr.imputting_strategy('most_frequent')

**Split Train/Test set**


In [None]:
from sklearn.model_selection import train_test_split

# Separate the features from the target and hold out 20% of the rows for testing.
target = 'covid_res'
X = df.drop([target], axis = 1)
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

# With PCA
from sklearn.decomposition import PCA
# Exploratory PCA with every component kept. It is fitted on the training rows
# only, so the held-out test rows do not influence the decomposition.
# NOTE(review): the features are not scaled before PCA; consider standardizing them.
pca = PCA()
X_pca = pca.fit_transform(X_train)
print('Explained Variance:\n',pca.explained_variance_ratio_)

In [None]:
# Choose the number of components
pca_n = PCA(n_components=1)
# Fit the projection on the training rows only and apply it to the test rows,
# so no information from the test set leaks into the transformation.
X_train_pca = pca_n.fit_transform(X_train)
X_test_pca = pca_n.transform(X_test)
print('Explained Variance:\n',pca_n.explained_variance_ratio_)

# Same rows as the split above, hence the same labels.
y_train_pca, y_test_pca = y_train, y_test

## Models

In [None]:
# Build the two classifiers that are compared below.
from sklearn import neighbors
from sklearn.ensemble import RandomForestClassifier

# k-nearest neighbours with the library's default settings
knn = neighbors.KNeighborsClassifier()
# Random forest with a fixed seed
RF = RandomForestClassifier(random_state=1)

# Remarks and Results

* Strange but useful to note: the most important features according to RandomForest are **'entry_date', 'date_symptoms' and 'age'**. Perhaps this can be explained by the fact that the majority of answers are negative. For example, leaving covid-19 aside, 57.2% of patients are negative for all diseases listed in the dataset.
* We consider the **F1** score because we aim to reduce both **false positives** and **false negatives**.
* The subset 'sick' is used in iterations **9**, **10** and **11**.
* The models still overfit. However, **cross validation + PCA + Random forest** manages to avoid this problem, at the cost of a lower **F1** score for covid_res = 2.

* The score can be improved by:
    * A better preprocessing (I am still a beginner).
    * Adjusting the model hyperparameters.
    * Trying other models.
    * Getting more data if possible.
    * Getting other features instead of those we have.

# KNeighbors vs randomForest, with and without PCA

## Iteration 11

In [None]:
# Evaluate both classifiers on the original features.
separator = "-"*100
print(separator)
for model in (knn, RF):
    print("Model:", model)
    print("-"*50)
    evaluation(model, X_train, X_test, y_train, y_test)
    # Only the random forest exposes feature importances.
    if model is RF:
        print('Feature importances')
        importances = pd.DataFrame(RF.feature_importances_, index = X_train.columns)
        importances.plot.bar()

print(separator)

**With PCA**

In [None]:
# Evaluate both classifiers on the PCA-projected features.
separator = "-"*100
print(separator)
for model in (knn, RF):
    print("with PCA:", pca_n, "Model:", model)
    evaluation(model, X_train_pca, X_test_pca, y_train_pca, y_test_pca)
    # Only the random forest exposes feature importances (here: one per PCA component).
    if model is RF:
        print('Feature importances')
        importances = pd.DataFrame(RF.feature_importances_)
        importances.plot.bar()
print(separator)

 * **If you find anything helpful, an upvote will be appreciated.**
 * **Thanks in advance for constructive remarks. I am still a beginner.**
 * **Special thanks to Machine Learnia, who gives excellent tutorials. https://machinelearnia.com/**

# Different iterations considered

Iteration 1:
We do not touch the Dataset, we just transform the date columns into numeric by doing:
* df = pr.remove_serie('id')
* df = pr.dates_to_numeric(['entry_date', 'date_symptoms', 'date_died'])


Iteration 2:
Same as 1 but we ignore covid_res = 3:
*    df = pr.remove_serie('id')
*    df = pr.dates_to_numeric(['entry_date', 'date_symptoms', 'date_died'])
*    df = pr.ignore_covid_waiting()


Iteration 3:
same as 2 and we encode date_died

*    df = pr.remove_serie('id')
*    df = pr.dates_to_numeric(['entry_date', 'date_symptoms'])
*    df = pr.encode_date_died()
*    df = pr.ignore_covid_waiting()


Iteration 4: 
same as 3 and we replace the missing values by same extreme value

*    df = pr.remove_serie('id')
*    df = pr.dates_to_numeric(['entry_date', 'date_symptoms'])
*    df = pr.encode_date_died()
*    df = pr.ignore_covid_waiting()
*    df = pr.replace_NaN_values(list_values = [97,98,99])
*    df = pr.edit_97_pregnancy(979899)


Iteration 5: Imputing: 
here, as we know that for pregnancy serie, the value 97 is used for male, we do not have to replace it by median or other

*    df = pr.remove_serie('id')
*    df = pr.dates_to_numeric(['entry_date', 'date_symptoms', 'date_died'])
*    df = pr.ignore_covid_waiting()
*    df = pr.replace_NaN_values(list_values = [97, 98,99])
*    df = pr.imputting_strategy('most_frequent')


iteration 6 same as 5 but with median

Iteration 7 same as 5 plus encode date_died

*    df = pr.remove_serie('id')
*    df = pr.dates_to_numeric(['entry_date', 'date_symptoms'])
*    df = pr.encode_date_died()
*    df = pr.ignore_covid_waiting()
*    replace [97,98,99] by the extreme value, except 97 in pregnancy
*    df = pr.replace_NaN_values(list_values = [97, 98,99])
*    df = pr.imputting_strategy('most_frequent')


iteration 8: 
Here, we replace the 97 value by 2 in pregnancy. Note this will add a bias in the data.
    
*    df = pr.remove_serie('id')
*    df = pr.dates_to_numeric(['entry_date', 'date_symptoms'])
*    df = pr.encode_date_died()
*    df = pr.ignore_covid_waiting()
*    df = pr.replace_NaN_values(list_values = [97, 98,99])
*    df = pr.edit_97_pregnancy(2)
*    df = pr.imputting_strategy('most_frequent')
    


Iteration 9
Here we will consider the subset sick

*    df = pr.remove_serie('id')
*    df = pr.sick()
*    df = pr.dates_to_numeric(['entry_date', 'date_symptoms', 'date_died'])
*    df = pr.ignore_covid_waiting()

Iteration 10
Same as 9 plus encode date_died plus imputing

*    df = pr.remove_serie('id')
*    df = pr.sick()
*    df = pr.dates_to_numeric(['entry_date', 'date_symptoms'])
*    df = pr.encode_date_died()
*    df = pr.ignore_covid_waiting()
*    df = pr.replace_NaN_values(list_values = [97, 98,99])
*    df = pr.imputting_strategy('most_frequent')


 Iteration 11
Same as 9 plus imputing

*    df = pr.remove_serie('id')
*    df = pr.sick()
*    df = pr.dates_to_numeric(['entry_date', 'date_symptoms', 'date_died'])
*    df = pr.ignore_covid_waiting()
*    df = pr.replace_NaN_values(list_values = [97, 98,99])
*    df = pr.imputting_strategy('most_frequent')
