
#  <font color='blue'> Drug expenditure in France - Data Preparation </font>

# I hope you find this kernel useful
# Your <font color='red'> UPVOTES </font> would be highly appreciated

In [None]:
import numpy as np 
import pandas as pd 

#  <font color='green'> Import Datasets </font>

 <font color='black'>We will use 3 datasets : OPEN MEDIC 2017, OPEN MEDIC 2018, and OPEN MEDIC 2019.</font>

In [None]:
# Load the three yearly Open Medic extracts. All three are read with the same
# options (semicolon-separated, Latin-1 encoded); only the year in the path differs.
data_2017, data_2018, data_2019 = (
    pd.read_csv(
        f'../input/open-medic-{year}/OPEN_MEDIC_{year}.CSV',
        sep=';',
        encoding='latin-1',
    )
    for year in (2017, 2018, 2019)
)


<font color='black'>Let's take a first look at our Datasets</font>

In [None]:
data_2017.shape

In [None]:
data_2018.shape

In [None]:
data_2019.shape


 <font color='black'>-> So we have more than 5.5 million rows and 21 columns</font>


<font color='black'>Let's show a sample of our data.</font>

In [None]:
data_2017.head()

In [None]:
data_2018.head()

In [None]:
data_2019.head()


#  <font color='green'> Data Preparation </font>

<font color='black'>Now we will concatenate the 3 Datasets. </font>


<font color='black'>Add a new column "Year" to specify the year: 2017, 2018 or 2019.</font>


In [None]:
# Tag every row with its source year (stored as a string) so the origin of
# each row is still known after the three frames are concatenated.
for yearly_frame, year_label in (
    (data_2017, "2017"),
    (data_2018, "2018"),
    (data_2019, "2019"),
):
    yearly_frame["Year"] = year_label

<font color='red'> **Concatenate datasets** </font>


<font color='black'>First of all, let's check that the 3 datasets have the same column names.</font>


In [None]:
data_2017.columns

In [None]:
data_2018.columns

In [None]:
data_2019.columns


<font color='black'>-> As we can see, some column names differ between the datasets, for example age vs. AGE and L_CIP13 vs. l_cip_13.</font>





<font color='black'>To concatenate the 3 datasets they must share the same column names, so let's rename them in the next step.</font>



In [None]:
# Unified header applied to every yearly frame: the 21 original fields
# followed by the year tag added above.
columns = [
    # ATC classification levels: code, then label
    'ATC1', 'l_ATC1',
    'ATC2', 'L_ATC2',
    'ATC3', 'L_ATC3',
    'ATC4', 'L_ATC4',
    'ATC5', 'L_ATC5',
    # package code and label
    'CIP13', 'L_CIP13',
    'TOP_GEN', 'GEN_NUM',
    # beneficiary and prescriber attributes
    'AGE', 'SEXE', 'BEN_REG', 'PSP_SPE',
    # volumes and amounts
    'BOITES', 'REM', 'BSE',
    'YEAR',
]

In [None]:
# Overwrite each frame's header with the shared list. The assignment is
# positional, so it relies on the three files listing their fields in the
# same order.
for yearly_frame in (data_2017, data_2018, data_2019):
    yearly_frame.columns = columns

In [None]:
# Stack the three yearly frames into a single table.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every
# year keeps its own row labels and the combined frame carries duplicated
# index values, which breaks label-based selection and Series-aligned
# assignment later on.
data_concat = pd.concat([data_2017, data_2018, data_2019], ignore_index=True)

In [None]:
data_concat.shape

In [None]:
data_concat



<font color='black'>We have 5 million rows and 22 columns.

Now our data is ready for the next step. </font>






<font color='black'>What about checking missing Values ? </font>



In [None]:
# Share of missing values in each column, expressed as a percentage.
nan_share = {name: data_concat[name].isna().mean() * 100 for name in data_concat.columns}
NAN = pd.DataFrame(list(nan_share.items()), columns=["column_name", "percentage"])
NAN

<font color='red'> **Data selection** </font>

**In our case we will use 11 columns :**

1. l_ATC1: Anatomical Main Group Label
1. L_ATC2: The family of medicines
1. L_CIP13: Contains the drug name
1. AGE: Age at time of care
1. SEXE: Sex
1. BEN_REG: Beneficiary’s Region of Residence
1. PSP_SPE: Prescriber
1. REM: Amount Repaid
1. BSE: Repayment Basis
1. BOITES: Number of boxes delivered
1. YEAR: Year of the data (2017, 2018 or 2019)


<font color='black'>Let's drop the other columns </font>


In [None]:
# Remove the fields that are not part of the selection listed above.
unused_columns = [
    'ATC1', 'ATC2', 'ATC3', 'ATC4', 'ATC5', 'CIP13',
    'L_ATC3', 'L_ATC4', 'L_ATC5', 'TOP_GEN', 'GEN_NUM',
]
data_concat = data_concat.drop(columns=unused_columns)


In [None]:
data_concat.shape

<font color='red'> **Now we will transform some values** </font>

For example, in the SEXE column:
*  1 -> Masculin (Male)
*  2 -> Feminin (Female)
*  9 -> Inconnu (Unknown)

In [None]:
# Translate the numeric sex codes into readable labels.
# Series.map yields NaN for any value that is not a key of the dict, so
# running this cell a second time would blank the column.
sex_map = {
    1: 'Masculin',
    2: 'Feminin',
    9: 'Inconnu',
}
data_concat['SEXE'] = data_concat['SEXE'].map(sex_map)

In [None]:
data_concat['SEXE']

In [None]:
# Translate the region codes of the beneficiary's residence into region names.
# Codes 0, 9 and 99 all collapse to the same "unknown" label.
# NOTE(review): 'Outre-mer ' carries a trailing space — looks unintentional,
# but confirm nothing downstream filters on the exact string before changing it.
region_map = {
    5: 'Outre-mer ',
    11: 'Ile-de-France',
    24: 'Centre-Val de Loire',
    27: 'Bourgogne-Franche-Comté',
    28: 'Normandie',
    32: 'Nord-Pas-de-Calais-Picardie',
    44: 'Alsace-Champagne-Ardenne-Lorraine',
    52: 'Pays de la Loire',
    53: 'Bretagne',
    75: 'Aquitaine-Limousin-Poitou-Charentes',
    76: 'Languedoc-Roussillon-Midi-Pyrénées',
    84: 'Auvergne-Rhône-Alpes',
    93: "Provence-Alpes-Côte d'Azur et Corse",
    0: 'Inconnue',
    9: 'Inconnue',
    99: 'Inconnue',
}
data_concat['BEN_REG'] = data_concat['BEN_REG'].map(region_map)

In [None]:
data_concat['BEN_REG']

In [None]:
# Translate the prescriber speciality codes into readable labels.
# Values absent from the dict become NaN after Series.map.
Prescriber_map = {
    1: 'Médecine generale liberale',
    2: 'Anesthésiste-réanimateur libéral',
    3: 'Pathologie cardio-vasculaire liberale',
    4: 'Chirurgie liberale',
    5: 'Dermatologie et de vénéréologie liberale',
    6: 'Radiologie liberale',
    7: 'Gynecologie obstetrique liberale',
    8: 'Gastro-entérologue et hepatologie liberale',
    9: 'Médecine interne libéral',
    11: 'Oto rhino-laryngologie liberale',
    12: 'Pédiatrie liberale',
    13: 'Pneumologie liberale',
    14: 'Rhumatologie liberale',
    15: 'Ophtalmologie liberale',
    17: 'Psychiatrie liberale',
    18: 'Stomatologie liberale',
    31: 'Médecine physique et de réadaptation libérale',
    32: 'Neurologie libérale',
    35: 'Nephrologie libérale',
    37: 'Anatomie-cytologie-pathologique libérale',
    38: 'Directeur laboratoire médecin libéral',
    42: 'Endocrinologie et metabolismes libéral',
    90: 'Prescripteurs salaries',
    98: 'Dentistes, Auxiliaires médicaux, Laboratoires, Sages-femmes',
    99: 'Valeur inconnue',
}
data_concat['PSP_SPE'] = data_concat['PSP_SPE'].map(Prescriber_map)


In [None]:
data_concat['PSP_SPE'] 

In [None]:
# Translate the age-band codes into readable labels.
# Values absent from the dict become NaN after Series.map.
age_map = {
    0: '0 to 19 years',
    20: '20 to 59 years',
    60: 'Over 60 years',
    99: 'Age Inconnu',
}
data_concat['AGE'] = data_concat['AGE'].map(age_map)


In [None]:
data_concat['AGE']

In [None]:
# Shorten the verbose drug-family labels (mostly by removing the leading
# "MEDICAMENTS ..." wording). All substitutions are exact, whole-value matches
# and are applied in a single pass over the column instead of one full-column
# pass per label. No replacement value is itself a key, so the outcome is the
# same as applying the substitutions one after another.
atc2_short_labels = {
    "MEDICAMENTS POUR LES TROUBLES DE L'ACIDITE": "TROUBLES DE L'ACIDITE",
    'MEDICAMENTS POUR LES TROUBLES FONCTIONNELS GASTROINTESTINAUX': 'TROUBLES FONCTIONNELS GASTROINTESTINAUX',
    'MEDICAMENTS DE LA DIGESTION, ENZYMES INCLUSES': 'DIGESTION, ENZYMES INCLUSES',
    'MEDICAMENTS DU DIABETE': 'DIABETE',
    'AUTRES MEDICAMENTS DES VOIES DIGESTIVES ET DU METABOLISME': 'AUTRES DES VOIES DIGESTIVES ET DU METABOLISME',
    'MEDICAMENTS AGISSANT SUR LE SYSTEME RENINE-ANGIOTENSINE': 'SYSTEME RENINE-ANGIOTENSINE',
    'MEDICAMENTS CONTRE LE PSORIASIS': 'CONTRE LE PSORIASIS',
    'MEDICAMENTS UROLOGIQUES': 'UROLOGIQUES',
    'TOPIQUES POUR DOULEURS ARTICULAIRES OU MUSCULAIRES': 'DOULEURS ARTICULAIRES OU MUSCULAIRES',
    'MEDICAMENTS POUR LE TRAITEMENT DES DESORDRES OSSEUX': 'TRAITEMENT DES DESORDRES OSSEUX',
    'AUTRES MEDICAMENTS DES DESORDRES MUSCULOSQUELETTIQUES': 'AUTRES DES DESORDRES MUSCULOSQUELETTIQUES',
    'AUTRES MEDICAMENTS DU SYSTEME NERVEUX': 'AUTRES DU SYSTEME NERVEUX',
    'MEDICAMENTS POUR LES SYNDROMES OBSTRUCTIFS DES VOIES AERIENNES': 'SYNDROMES OBSTRUCTIFS DES VOIES AERIENNES',
    'MEDICAMENTS DU RHUME ET DE LA TOUX': 'RHUME ET TOUX',
    "AUTRES MEDICAMENTS DE L'APPAREIL RESPIRATOIRE": "AUTRES DE L'APPAREIL RESPIRATOIRE",
    'MEDICAMENTS OPHTALMOLOGIQUES': 'OPHTALMOLOGIQUES',
    'MEDICAMENTS OTOLOGIQUES': 'OTOLOGIQUES',
    'MEDICAMENTS POUR DIAGNOSTIC': 'DIAGNOSTIC',
}
data_concat["L_ATC2"] = data_concat["L_ATC2"].replace(atc2_short_labels)

In [None]:
data_concat["L_ATC2"]

Extract the Drug Name from 'L_CIP13' column.

In [None]:
def _drug_name_from_label(label):
    """Return the drug name that precedes the first digit of a package label.

    The label text up to (not including) its first digit is kept, trailing
    whitespace is stripped, and the result is capitalized (first letter upper
    case, the rest lower case).

    Slicing at the digit and stripping, rather than cutting one character
    before the digit, keeps the full name when the label contains no digit,
    yields an empty name when the label starts with a digit, and does not
    eat a letter when no space separates the name from what follows.

    Non-string values (e.g. NaN for a missing label) are returned unchanged
    instead of raising on len().
    """
    if not isinstance(label, str):
        return label
    first_digit = next(
        (pos for pos, char in enumerate(label) if char.isdigit()),
        len(label),  # no digit at all: keep the whole label
    )
    return label[:first_digit].rstrip().capitalize()


# One extracted name per row, in row order, kept as a plain list so the later
# column assignment is positional.
new_cip = [_drug_name_from_label(label) for label in data_concat["L_CIP13"]]



<font color='gray'>Create the drug name column and drop the old one </font>


In [None]:
# Attach the extracted names as a new column; the list is matched to rows by position.
data_concat = data_concat.assign(Drug_Name=new_cip)

In [None]:
# The raw package label is no longer needed once Drug_Name exists.
data_concat = data_concat.drop(columns=['L_CIP13'])

In [None]:
data_concat['Drug_Name']

<font color='red'> **Data after preparation** </font>

In [None]:
data_concat.head()

# Stay tuned for the next updates and do not forget the <font color='red'> UPVOTE </font> :)