In [98]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder
import pickle
import warnings
warnings.filterwarnings('ignore')

### Feature Encoding

1. The dataset contains both features with high cardinality and features with only a few categories.
2. Categorical features can be divided into two types: ordinal and nominal.
3. It is advised to encode nominal features with One-Hot or Binary Encoding, which makes sure that the model does not attach any unnecessary pattern (such as an artificial ordering of the categories) between the independent and the dependent feature.
4. Ordinal features, on the other hand, are advised to be encoded using ordinal encoding, which is nothing but label encoding in which we preserve the hierarchy between the classes; this helps the model find patterns between that hierarchy and the value of the dependent feature.
5. It is important to understand that the model will learn patterns only if they exist; if they don't, it won't matter whether we encode features using Binary Encoding or Ordinal Encoding.
6. Considering this, if we choose to encode all the features using Label Encoding, it might just work out.
7. The advantage of doing this is that our dataset has a lot of categorical features with high cardinality, and encoding them with a One-Hot or Binary encoder would lead to the curse of dimensionality.
    For example:
        If a feature has 5 classes, One-Hot will add 5 more columns to our dataset and a Binary Encoder (which is more compact than One-Hot) will add 3 columns. This is just for 5 classes; we have features with almost 5000 unique classes.
8. Another way of encoding features with high cardinality is a Target Encoder, but it has a problem: if, while testing, we encounter a class that was not part of the dataset used to fit the encoder, the encoder will throw an error.


In [99]:
# Load the post-EDA CSV that the rest of this notebook encodes and filters.
df = pd.read_csv(filepath_or_buffer='../../data/post_eda.csv')

# Preview the first five rows to confirm the file was read as expected.
df.head(n=5)

Unnamed: 0,Title,Sponsor,Sponsor_Class,Conditions,Study_Type,Sex,Age_Category,City,Country,Results,Interventions,min_age,start_date_day,start_date_month,start_date_year,first_post_date_day,first_post_date_month,first_post_year
0,A Pilot Study of the Effects of Diet and Behav...,"University of California, San Francisco",OTHER,Diabetes Type II,INTERVENTIONAL,ALL,ADULT,San Francisco,United States,Passed,Low Carbohydrate Diet,18,1,10,2012,25,10,2012
1,The Role of Traditional or Western Diet in the...,University of Copenhagen,OTHER,T2D,INTERVENTIONAL,ALL,ADULT,Copenhagen,Denmark,Passed,Cross-over study,18,15,4,2019,9,7,2019
2,Effect of Coherent Breathing on Elderly Qualit...,Cairo University,OTHER,Stable Diabetes Mellitus,INTERVENTIONAL,ALL,ADULT,Cairo,Egypt,Passed,Coherent Breathing Exercise,60,1,3,2023,14,3,2023
3,Peanut Consumption and Cardiovascular Disease ...,Harvard School of Public Health (HSPH),OTHER,T2D,INTERVENTIONAL,ALL,ADULT,Shanghai,China,Passed,Peanut,20,24,10,2017,21,6,2017
4,Dose Response of Eccentric Exercise on Glycemi...,Riphah International University,OTHER,Type II Diabetes Mellitus,INTERVENTIONAL,ALL,ADULT,Lahore,Pakistan,Passed,Eccentric Exercises Dose,45,12,7,2021,10,3,2021


In [100]:
# (rows, columns) of the freshly loaded frame.
df.shape

(5739, 18)

In [101]:
# `min_age` is the only feature kept as a plain numeric column.
num_cols = ['min_age']

# Every other column is treated as categorical, including the integer date
# parts. The object-dtype check only matters if a column listed in num_cols
# was itself read as object dtype, in which case it lands in both lists.
cat_cols = [
    name
    for name in df.columns
    if name not in num_cols or df[name].dtype == 'O'
]

cat_cols, num_cols

(['Title',
  'Sponsor',
  'Sponsor_Class',
  'Conditions',
  'Study_Type',
  'Sex',
  'Age_Category',
  'City',
  'Country',
  'Results',
  'Interventions',
  'start_date_day',
  'start_date_month',
  'start_date_year',
  'first_post_date_day',
  'first_post_date_month',
  'first_post_year'],
 ['min_age'])

In [102]:
# Look at the raw (still string-valued) rows once more before encoding them.
df.head()

Unnamed: 0,Title,Sponsor,Sponsor_Class,Conditions,Study_Type,Sex,Age_Category,City,Country,Results,Interventions,min_age,start_date_day,start_date_month,start_date_year,first_post_date_day,first_post_date_month,first_post_year
0,A Pilot Study of the Effects of Diet and Behav...,"University of California, San Francisco",OTHER,Diabetes Type II,INTERVENTIONAL,ALL,ADULT,San Francisco,United States,Passed,Low Carbohydrate Diet,18,1,10,2012,25,10,2012
1,The Role of Traditional or Western Diet in the...,University of Copenhagen,OTHER,T2D,INTERVENTIONAL,ALL,ADULT,Copenhagen,Denmark,Passed,Cross-over study,18,15,4,2019,9,7,2019
2,Effect of Coherent Breathing on Elderly Qualit...,Cairo University,OTHER,Stable Diabetes Mellitus,INTERVENTIONAL,ALL,ADULT,Cairo,Egypt,Passed,Coherent Breathing Exercise,60,1,3,2023,14,3,2023
3,Peanut Consumption and Cardiovascular Disease ...,Harvard School of Public Health (HSPH),OTHER,T2D,INTERVENTIONAL,ALL,ADULT,Shanghai,China,Passed,Peanut,20,24,10,2017,21,6,2017
4,Dose Response of Eccentric Exercise on Glycemi...,Riphah International University,OTHER,Type II Diabetes Mellitus,INTERVENTIONAL,ALL,ADULT,Lahore,Pakistan,Passed,Eccentric Exercises Dose,45,12,7,2021,10,3,2021


In [103]:
# Binary encoding of the target column: a passed study is the positive class.
res_map = dict(Passed=1, Failed=0)

In [104]:
# Fitted encoders keyed by column name. Keeping them allows the integer codes
# to be decoded later (`encoders[col].inverse_transform(...)`) or the same
# mapping to be applied to new data (`encoders[col].transform(...)`).
# NOTE: re-running this cell on the already-encoded frame re-fits the encoders
# on integer codes; reload `df` first if the original class labels are needed.
encoders = {}

for col in cat_cols:
    # The target is encoded separately below with an explicit, fixed mapping.
    if col == 'Results':
        continue

    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# Encode the target with res_map. Values that are not keys of res_map are
# passed through unchanged, so running this cell a second time (when the
# column already holds 0/1) leaves it intact instead of turning it into NaN.
df['Results'] = df['Results'].map(lambda label: res_map.get(label, label))

df.head()

Unnamed: 0,Title,Sponsor,Sponsor_Class,Conditions,Study_Type,Sex,Age_Category,City,Country,Results,Interventions,min_age,start_date_day,start_date_month,start_date_year,first_post_date_day,first_post_date_month,first_post_year
0,267,1341,5,150,0,0,0,1134,99,1,1877,18,0,9,20,24,9,13
1,5409,1352,5,510,0,0,0,376,25,1,675,18,14,3,27,8,6,20
2,2006,176,5,503,0,0,0,285,26,1,597,60,0,2,31,13,2,24
3,4078,473,5,510,0,0,0,1185,16,1,2352,20,23,9,25,20,5,18
4,1878,1020,5,557,0,0,0,777,69,1,920,45,11,6,29,9,2,22


### Feature Selection

There are multiple ways to perform feature selection, such as:
1. Filter Methods
    <p>a. Pearson's correlation : takes values between -1 and +1. A value close to 0 means the feature is not linearly correlated with the dependent feature, while values close to -1 or +1 indicate that the feature is highly correlated with the dependent feature. This can be used with linear models, as one of the assumptions of linear models is that the dependent and independent features have a linear relationship between them.</p>
    <p>b. Chi-square Test : This test is used to determine whether there is a significant association or relationship between two categorical features, which makes it very useful for categorical features.</p>
2. Wrapper Methods
    <p>a. Forward Selection : This is a simple algorithm in which we keep adding features according to their significance, measured with some metric, until we reach the desired number of features.</p>
    <p>b. Backward Selection : This is the opposite of forward selection: we run the same algorithm, the only difference being that we start with all the features and keep dropping the least significant ones until we reach the desired number of features.</p>
    <p>c. Exhaustive Search : Here we try all possible combinations of features and keep the best subset. This may seem the best solution on paper, but it is computationally expensive.</p>
3. Embedded methods : 
    <p>a. Lasso : This is one of the regularisation techniques that is also used for feature selection. It works by adding a penalty of 
    (lambda) * |slope| to the loss, where slope indicates the change in the dependent feature per unit change in an independent feature and lambda is a hyperparameter. As the value of lambda increases, the slopes of the less significant features shrink to zero and we end up with only the more significant features.</p>
    <p>b. Feature Importance : This works on the concept of building a decision tree. Depending on the algorithm we use, ID3 or CART, each split is scored by INFORMATION GAIN (computed from ENTROPY) or by the GINI INDEX respectively; both of these values signify the importance of a node/feature. scikit-learn exposes this through the built-in attribute feature_importances_, which provides a value for every feature signifying its importance. This method goes well with tree-based models.</p>

We will be using two methods **Chi-square test** and **Feature Importance**.

### Chi-Square

In [105]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

In [106]:
# Candidate predictors for the chi-square test: every categorical column
# except the target itself.
categories = [name for name in cat_cols if not name == 'Results']

categories

['Title',
 'Sponsor',
 'Sponsor_Class',
 'Conditions',
 'Study_Type',
 'Sex',
 'Age_Category',
 'City',
 'Country',
 'Interventions',
 'start_date_day',
 'start_date_month',
 'start_date_year',
 'first_post_date_day',
 'first_post_date_month',
 'first_post_year']

In [107]:
# Chi-square statistic (and p-value) of each encoded categorical feature
# against the binary target.
chi_scores, p_values = chi2(df[categories], df['Results'])

# One row per feature, ranked from the highest score to the lowest.
importance = (
    pd.DataFrame({'Feature': categories, 'Chi2 Score': chi_scores})
    .sort_values(by='Chi2 Score', ascending=False)
)

importance

Unnamed: 0,Feature,Chi2 Score
7,City,24684.766096
0,Title,24090.021829
1,Sponsor,14867.407553
8,Country,5722.923923
9,Interventions,1021.592428
3,Conditions,427.935319
2,Sponsor_Class,259.859975
15,first_post_year,161.61232
5,Sex,109.657188
4,Study_Type,71.571493


In [108]:
# Number of top-ranked chi-square features to keep.
N_CHI2_FEATURES = 13

# `importance` is sorted by score, so its integer index labels are shuffled.
# `.iloc` states explicitly that the first N rows *by position* are wanted,
# not the rows whose index labels happen to be 0..N-1.
chi2_feat = importance['Feature'].iloc[:N_CHI2_FEATURES].tolist()

# Keep the target column alongside the selected features.
chi2_feat.append('Results')

chi2_feat

['City',
 'Title',
 'Sponsor',
 'Country',
 'Interventions',
 'Conditions',
 'Sponsor_Class',
 'first_post_year',
 'Sex',
 'Study_Type',
 'start_date_year',
 'start_date_day',
 'start_date_month',
 'Results']

### Feature Importance

In [109]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [110]:
# Separate the target from the predictors.
y = df['Results']
X = df.drop(columns='Results')

# No hold-out split here: the forest below is fitted on the whole dataset.
X_train, y_train = X, y

In [111]:
# Fit a 100-tree random forest; the fixed seed keeps the importances
# reproducible from run to run.
rfc = RandomForestClassifier(random_state=15, n_estimators=100).fit(X_train, y_train)

# One importance value per column of X_train, in column order.
feat_imp = rfc.feature_importances_

In [112]:
# Raw importance array, aligned with the columns of X.
feat_imp

array([0.09192519, 0.11653065, 0.05104788, 0.05435798, 0.0093941 ,
       0.0089767 , 0.00379831, 0.08740249, 0.11898399, 0.08836443,
       0.03297579, 0.02932277, 0.04504569, 0.08291147, 0.06079882,
       0.04576811, 0.07239562])

In [113]:
# Pair every predictor name with its random-forest importance.
feat_importance = pd.DataFrame({'Feature': X.columns, 'Importance': feat_imp})

# Display only: ranked from most to least important (the frame itself is
# left in column order).
feat_importance.sort_values(by='Importance', ascending=False)

Unnamed: 0,Feature,Importance
8,Country,0.118984
1,Sponsor,0.116531
0,Title,0.091925
9,Interventions,0.088364
7,City,0.087402
13,start_date_year,0.082911
16,first_post_year,0.072396
14,first_post_date_day,0.060799
3,Conditions,0.054358
2,Sponsor_Class,0.051048


In [114]:
# Number of top-ranked random-forest features to keep.
N_IMPORTANCE_FEATURES = 14

# Sorting shuffles the integer index labels, so take the first N rows
# explicitly by position with `.iloc` rather than with a bare `[0:N]` slice.
ranked_features = feat_importance.sort_values('Importance', ascending=False)['Feature']
feat_imp_list = ranked_features.iloc[:N_IMPORTANCE_FEATURES].tolist()

# Keep the target column alongside the selected features.
feat_imp_list.append('Results')

feat_imp_list

['Country',
 'Sponsor',
 'Title',
 'Interventions',
 'City',
 'start_date_year',
 'first_post_year',
 'first_post_date_day',
 'Conditions',
 'Sponsor_Class',
 'first_post_date_month',
 'start_date_month',
 'min_age',
 'start_date_day',
 'Results']

##### Creating two datasets: one with outliers and one without outliers

In [115]:
# Container for the dataset variants that get pickled at the end.
dataframes = dict()

In [116]:
# (rows, columns) of the encoded frame before any outlier filtering.
df.shape

(5739, 18)

In [117]:
# Summary statistics of min_age; the quartiles shown here drive the IQR
# outlier bounds computed next.
df['min_age'].describe()

count    5739.000000
mean       22.619969
std         9.743666
min         0.000000
25%        18.000000
50%        18.000000
75%        21.000000
max        75.000000
Name: min_age, dtype: float64

In [118]:
# Ask for the quartiles directly instead of picking them out of `describe()`
# by integer position: positional `[]` on a label-indexed Series is deprecated
# in pandas, and it silently depends on the row layout of describe()'s output.
q1, q3 = df['min_age'].quantile([0.25, 0.75])

# Tukey's rule: anything more than 1.5 * IQR outside the quartiles is an outlier.
iqr = q3 - q1
lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)

lower_bound, upper_bound

(13.5, 25.5)

In [119]:
# Rows whose min_age lies inside the IQR bounds (both ends inclusive).
# NOTE: despite its name, `df_with_out` is the frame WITHOUT outliers; the
# unfiltered `df` is the one that still contains them.
in_bounds = df['min_age'].between(lower_bound, upper_bound)
df_with_out = df.loc[in_bounds].copy(deep=True)

df_with_out

Unnamed: 0,Title,Sponsor,Sponsor_Class,Conditions,Study_Type,Sex,Age_Category,City,Country,Results,Interventions,min_age,start_date_day,start_date_month,start_date_year,first_post_date_day,first_post_date_month,first_post_year
0,267,1341,5,150,0,0,0,1134,99,1,1877,18,0,9,20,24,9,13
1,5409,1352,5,510,0,0,0,376,25,1,675,18,14,3,27,8,6,20
3,4078,473,5,510,0,0,0,1185,16,1,2352,20,23,9,25,20,5,18
6,4549,431,2,557,0,0,0,1079,69,1,946,18,0,5,27,10,11,21
7,1373,754,2,557,0,0,0,903,79,0,2846,18,6,4,21,3,5,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5732,3269,1159,5,573,1,0,0,6,25,1,3053,18,15,4,26,17,5,19
5733,2767,81,2,260,0,0,0,209,99,1,108,21,25,0,29,19,9,21
5734,3962,1365,5,403,0,0,0,558,14,1,232,18,11,9,25,13,4,19
5737,658,274,2,92,0,0,0,1037,99,1,2565,18,14,10,26,4,9,19


In [120]:
# Four variants: {random-forest, chi-square} feature sets x {with, without}
# outliers. `df` still contains the outliers; `df_with_out` is the filtered frame.
dataframes.update({
    'feat_imp_with_out': df[feat_imp_list],
    'feat_imp_without_out': df_with_out[feat_imp_list],
    'chi2_with_out': df[chi2_feat],
    'chi2_without_out': df_with_out[chi2_feat],
})

In [121]:
# Persist all dataset variants in a single pickle for the modelling stage.
with open('../../data/dataframes.pkl', mode='wb') as pkl_file:
    pickle.dump(dataframes, pkl_file)