# Problem statement:
This dataset has 7 columns and 5961 rows. Our task is to analyze the dataset and predict doctors' consultation fees by developing a supervised machine learning model.

In [9]:
# Import all the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import re
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.feature_selection import RFE

In [53]:
# Read the labelled training workbook and the unlabelled test workbook into DataFrames.
train, test = [pd.read_excel(workbook) for workbook in ('Final_Train.xlsx', 'Final_Test.xlsx')]

# Analyzing the data:
Let’s start analyzing the data provided. We need to know the number of columns, rows, null objects, etc. So, let’s open the data and start counting the columns, get the row count, go through each row to find out any special characters or null values in it, find out the data type of each column provided.

In [54]:
train.head()

Unnamed: 0,Qualification,Experience,Rating,Place,Profile,Miscellaneous_Info,Fees
0,"BHMS, MD - Homeopathy",24 years experience,100%,"Kakkanad, Ernakulam",Homeopath,"100% 16 Feedback Kakkanad, Ernakulam",100
1,"BAMS, MD - Ayurveda Medicine",12 years experience,98%,"Whitefield, Bangalore",Ayurveda,"98% 76 Feedback Whitefield, Bangalore",350
2,"MBBS, MS - Otorhinolaryngology",9 years experience,,"Mathikere - BEL, Bangalore",ENT Specialist,,300
3,"BSc - Zoology, BAMS",12 years experience,,"Bannerghatta Road, Bangalore",Ayurveda,"Bannerghatta Road, Bangalore ₹250 Available on...",250
4,BAMS,20 years experience,100%,"Keelkattalai, Chennai",Ayurveda,"100% 4 Feedback Keelkattalai, Chennai",250


In [55]:
#check the shape of data
train.shape

(5961, 7)

In [56]:
#check the shape of data
test.shape

(1987, 6)

In [57]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5961 entries, 0 to 5960
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Qualification       5961 non-null   object
 1   Experience          5961 non-null   object
 2   Rating              2659 non-null   object
 3   Place               5936 non-null   object
 4   Profile             5961 non-null   object
 5   Miscellaneous_Info  3341 non-null   object
 6   Fees                5961 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 326.1+ KB


In [58]:
# Report how many distinct values each raw feature column holds.
# Labels are kept exactly as printed before (the last one has no colon).
for label, column in [
    ('Qualification:', 'Qualification'),
    ('Experience:', 'Experience'),
    ('Rating:', 'Rating'),
    ('Place:', 'Place'),
    ('Profile', 'Profile'),
]:
    print(label, train[column].nunique())

Qualification: 1420
Experience: 64
Rating: 51
Place: 877
Profile 6


# Feature Engineering:
Machine Learning model requires input data in numerical notations to extract patterns from it and make predictions. But, not all the data provided in our source dataset is numerical. the data provided in object column is Categorical data. we need to convert these into numerical notations. Here data is nothing but a feature that our model uses as an input. So, we perform Feature Engineering on our data to create meaningful numerical data out of the source dataset.

In [59]:
# Work on independent copies: the cells below add and overwrite columns on
# these frames, and doing that on a slice of `train` / `test` triggers
# SettingWithCopyWarning (silenced above) and is unreliable across pandas versions.
df_train = train[['Qualification', 'Profile', 'Experience', 'Place', 'Miscellaneous_Info', 'Rating','Fees']].copy()
df_test = test[['Qualification', 'Profile','Experience', 'Place', 'Miscellaneous_Info', 'Rating']].copy()

In [60]:
df_train.head()

Unnamed: 0,Qualification,Profile,Experience,Place,Miscellaneous_Info,Rating,Fees
0,"BHMS, MD - Homeopathy",Homeopath,24 years experience,"Kakkanad, Ernakulam","100% 16 Feedback Kakkanad, Ernakulam",100%,100
1,"BAMS, MD - Ayurveda Medicine",Ayurveda,12 years experience,"Whitefield, Bangalore","98% 76 Feedback Whitefield, Bangalore",98%,350
2,"MBBS, MS - Otorhinolaryngology",ENT Specialist,9 years experience,"Mathikere - BEL, Bangalore",,,300
3,"BSc - Zoology, BAMS",Ayurveda,12 years experience,"Bannerghatta Road, Bangalore","Bannerghatta Road, Bangalore ₹250 Available on...",,250
4,BAMS,Ayurveda,20 years experience,"Keelkattalai, Chennai","100% 4 Feedback Keelkattalai, Chennai",100%,250


In [61]:
# Strip the "years experience" suffix and keep the number of years as an integer.
for frame in (df_train, df_test):
    years = frame['Experience'].str.replace('years experience','')
    frame['Experience'] = years.astype(int)

In [63]:
#Check if any missing value present in train data
df_train.isnull().sum()

Qualification            0
Profile                  0
Experience               0
Place                   25
Miscellaneous_Info    2620
Rating                3302
Fees                     0
dtype: int64

In [64]:
#Check if any missing value present in test data
df_test.isnull().sum()

Qualification            0
Profile                  0
Experience               0
Place                    6
Miscellaneous_Info     834
Rating                1090
dtype: int64

In [65]:
def clean_text(text):
    """Lower-case ``text`` and keep only the letters a-z.

    The input is converted with ``str()`` first. Every other character
    (digits, punctuation, whitespace) becomes a space, and runs of
    whitespace are collapsed to one space. Leading/trailing spaces are kept.
    """
    letters_only = re.sub(r'[^a-z]', ' ', str(text).lower())
    return re.sub(r'\s+', ' ', letters_only)

# Normalise the qualification text in both frames with the same cleaner.
for frame in (df_train, df_test):
    frame['Qualification'] = frame['Qualification'].apply(clean_text)

def clean_place(text):
    """Lower-case ``text`` and keep only letters a-z and digits 0-9.

    The input is converted with ``str()`` first. Every other character is
    replaced by a space and whitespace runs are collapsed to a single space.
    Leading/trailing spaces are not stripped.
    """
    alnum_only = re.sub(r'[^a-z0-9]', ' ', str(text).lower())
    collapsed = re.sub(r'\s+', ' ', alnum_only)
    return collapsed

# Fill missing places with the literal token 'missing', then normalise the text.
# The filled Series is assigned back instead of using
# `df[col].fillna(..., inplace=True)`: that chained in-place form does not
# update the frame under pandas Copy-on-Write, in which case clean_place
# would turn the surviving NaN into the text 'nan'.
df_train['Place'] = df_train['Place'].fillna('missing').apply(clean_place)
df_test['Place'] = df_test['Place'].fillna('missing').apply(clean_place)

In [66]:
# Missing ratings are encoded as '0%', so after parsing they are
# indistinguishable from a genuine 0% rating.
# Assign the result back rather than calling fillna(inplace=True) on the
# selected column: the chained in-place form is a no-op under pandas
# Copy-on-Write and the later astype(int) would then fail on 'nan'.
df_train['Rating'] = df_train['Rating'].fillna('0%')
df_test['Rating'] = df_test['Rating'].fillna('0%')

def clean_rating(text):
    """Return ``str(text)`` with every '%' character removed."""
    return re.sub(r'%', '', str(text))

# Drop the percent sign and store the rating as an integer in both frames.
for frame in (df_train, df_test):
    stripped = frame['Rating'].apply(clean_rating)
    frame['Rating'] = stripped.astype(int)

In [67]:
# Replace missing free-text info with the literal token 'missing'.
# Assigned back (not chained inplace=True) so the fill also takes effect
# under pandas Copy-on-Write.
df_train['Miscellaneous_Info'] = df_train['Miscellaneous_Info'].fillna('missing')
df_test['Miscellaneous_Info'] = df_test['Miscellaneous_Info'].fillna('missing')

def get_feedback(feedback):
    """Return the first '<digits> Feedback' fragment found in ``feedback``.

    The argument is converted with ``str()`` before searching. When the text
    contains no such fragment, the placeholder '0 Feedback' is returned.
    """
    matches = re.findall(r'\d+ Feedback', str(feedback))
    return matches[0] if matches else '0 Feedback'

# Derive an integer Feedback count from the free-text column in both frames.
for frame in (df_train, df_test):
    fragments = frame['Miscellaneous_Info'].apply(get_feedback)
    frame['Feedback'] = fragments.str.replace(' Feedback','').astype(int)

In [68]:
def get_fee(text):
    """Extract the first rupee amount mentioned in ``text``.

    Commas are removed before searching, so '₹1,200' is read as '₹1200'.
    The input is converted with ``str()`` first, so None / NaN no longer
    raise ``TypeError`` (this matches how the other cleaners coerce input).

    Returns:
        The first match as a string including the currency sign
        (e.g. '₹250'), or the integer 0 when no amount is present.
        The mixed return type is kept on purpose: the calling cell relies
        on the non-string 0 becoming NaN under the ``.str`` accessor.
    """
    without_commas = re.sub(r',', "", str(text))
    amounts = re.findall(r'₹\d+', without_commas)
    return amounts[0] if amounts else 0

# Build an integer Misc_Fees feature from the fee quoted in the free text.
# get_fee returns '₹<digits>' or the integer 0; the .str accessor turns the
# non-string 0 into NaN, which is then imputed with 50.
# NOTE(review): 50 is an unexplained placeholder for "no fee quoted" - confirm it is intended.
# Changes from the previous version:
#   * the filled Series is assigned back instead of the chained
#     fillna(inplace=True), which does not update the frame under Copy-on-Write;
#   * train and test now share the same dtype (test used to be cast to float).
for frame in (df_train, df_test):
    quoted = frame['Miscellaneous_Info'].apply(get_fee)
    digits = quoted.str.replace('₹','')
    frame['Misc_Fees'] = digits.fillna(50).astype(int)

In [69]:
def clean_misc(text):
    """Normalise free text to lower-case letters, digits and single spaces.

    The input is converted with ``str()``; every character outside a-z / 0-9
    becomes a space and whitespace runs collapse to one space. The result is
    not stripped, so a leading or trailing space can remain.
    """
    lowered = str(text).lower()
    return re.sub(r'\s+', ' ', re.sub(r'[^a-z0-9]', ' ', lowered))

# Normalise the free-text column in both frames (after the numeric features were extracted from it).
for frame in (df_train, df_test):
    frame['Miscellaneous_Info'] = frame['Miscellaneous_Info'].apply(clean_misc)

In [70]:
#Check if any missing value present in train data
df_train.isnull().sum()

Qualification         0
Profile               0
Experience            0
Place                 0
Miscellaneous_Info    0
Rating                0
Fees                  0
Feedback              0
Misc_Fees             0
dtype: int64

In [71]:
#Check if any missing value present in test data
df_test.isnull().sum()

Qualification         0
Profile               0
Experience            0
Place                 0
Miscellaneous_Info    0
Rating                0
Feedback              0
Misc_Fees             0
dtype: int64

In [72]:
# One-hot encode Profile in both frames.
df_train = pd.get_dummies(df_train, columns=['Profile'])
df_test  = pd.get_dummies(df_test,  columns=['Profile'])

# The two frames are encoded independently, so a profile that occurs in only
# one of them would give train and test different columns. Align the test
# frame to the training feature columns (everything except the target);
# a dummy column missing from test is created and filled with 0.
df_test = df_test.reindex(columns=df_train.columns.drop('Fees'), fill_value=0)

In [74]:
df_train.shape

(5961, 14)

# Model Selection :
There are around 60+ predictive modeling algorithms available to choose from. We must have a good understanding of our data and of the solution we are looking for in order to narrow down our model selection. Here our goal is to predict the fees by developing a supervised machine learning model, and we want to identify the relation between our output (Fees) and the remaining independent features. This scenario is a classic example of regression (predicting a continuous value, here the consultation fee).

We are provided with 5961 rows of the labeled dataset tagged with the output column “Fees” to train our model. Whenever the dataset is labeled and output feature is known, we opt for the Supervised Learning Machine Learning technique. So, our use-case is a Supervised Learning plus Regression problem. Based on these criteria we can narrow down our choice of models to a few:

BaggingRegressor

GradientBoostingRegressor

RandomForestRegressor()

Identifying features:


# Here we split our data into the dependent feature (the target, Fees) as y and the independent features as X

In [26]:
# Target vector: the consultation fee. Feature frame: every other column.
y = df_train['Fees'].to_numpy()
X = df_train.drop(columns=['Fees'])

# Splitting Data:
We split our data into a training set and a validation set using train_test_split(), which lets us train the model on one part of the data and measure its performance on data it has not seen. test_size=0.25 holds out 25% of the data as the validation set, and the remaining 75% is used to train our model. The function returns the training pair (X_train, y_train) and the validation pair (X_cv, y_cv).

In [75]:
# Hold out 25% of the labelled rows as a validation set; random_state fixes the shuffle so the split is repeatable.
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.25, random_state=1)

In [27]:
X_train.shape, y_train.shape, X_cv.shape, y_cv.shape

((4470, 13), (4470,), (1491, 13), (1491,))

In [28]:
X_train.columns

Index(['Qualification', 'Experience', 'Place', 'Miscellaneous_Info', 'Rating',
       'Feedback', 'Misc_Fees', 'Profile_Ayurveda', 'Profile_Dentist',
       'Profile_Dermatologists', 'Profile_ENT Specialist',
       'Profile_General Medicine', 'Profile_Homeopath'],
      dtype='object')

#### build model

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Each text column gets its own vectorizer. It is fitted on the training
# split only and then applied unchanged to the validation split.

# Qualification: unigrams and bigrams of tokens with at least 3 word characters, binary term counts.
tf1 = TfidfVectorizer(ngram_range=(1,2), binary=True, token_pattern=r'\w{3,}')
X_train_qual = tf1.fit_transform(X_train['Qualification'])
X_cv_qual = tf1.transform(X_cv['Qualification'])

# Place: default TfidfVectorizer settings.
tf2 = TfidfVectorizer()
X_train_place = tf2.fit_transform(X_train['Place'])
X_cv_place = tf2.transform(X_cv['Place'])

# Miscellaneous_Info: unigrams and bigrams, single-character tokens allowed.
tf3 = TfidfVectorizer(token_pattern=r'\w{1,}', ngram_range=(1,2))
X_train_misc = tf3.fit_transform(X_train['Miscellaneous_Info'])
X_cv_misc = tf3.transform(X_cv['Miscellaneous_Info'])

In [30]:
# Numeric features as (n_samples, 1) column arrays so they can be stacked
# next to the sparse TF-IDF matrices.
X_train_exp = X_train['Experience'].to_numpy().reshape(-1,1)
X_cv_exp = X_cv['Experience'].to_numpy().reshape(-1,1)

X_train_feedback = X_train['Feedback'].to_numpy().reshape(-1,1)
X_cv_feedback = X_cv['Feedback'].to_numpy().reshape(-1,1)

X_train_rating = X_train['Rating'].to_numpy().reshape(-1,1)
X_cv_rating = X_cv['Rating'].to_numpy().reshape(-1,1)

X_train_miscfees = X_train['Misc_Fees'].to_numpy().reshape(-1,1)
X_cv_miscfees = X_cv['Misc_Fees'].to_numpy().reshape(-1,1)

# One-hot profile columns; 'Profile_Ayurveda' is not in the list.
cols = [
    'Profile_Dentist',
    'Profile_Dermatologists',
    'Profile_ENT Specialist',
    'Profile_General Medicine',
    'Profile_Homeopath',
]
X_train_prof = X_train[cols]
X_cv_prof = X_cv[cols]

In [31]:
from math import sqrt 
from sklearn.metrics import mean_squared_log_error, mean_squared_error

In [32]:
from scipy.sparse import hstack

# Feature matrix for the BaggingRegressor: includes Misc_Fees, leaves out Feedback.
merged_train = hstack([
    X_train_exp,
    X_train_qual,
    X_train_prof,
    X_train_place,
    X_train_rating,
    X_train_misc,
    X_train_miscfees,
])
merged_cv = hstack([
    X_cv_exp,
    X_cv_qual,
    X_cv_prof,
    X_cv_place,
    X_cv_rating,
    X_cv_misc,
    X_cv_miscfees,
])

In [33]:
merged_train.shape, merged_cv.shape

((4470, 8600), (1491, 8600))

In [34]:
from sklearn.ensemble import BaggingRegressor

# Bagged ensemble of the default base learner, scored on the validation split.
# The `base_estimator=None` argument was dropped: None is the default, and the
# keyword was renamed to `estimator` and later removed in newer scikit-learn
# releases, so passing it breaks the cell there.
br = BaggingRegressor(n_estimators=80,
                      max_samples=1.0,
                      max_features=1.0,
                      bootstrap=True,
                      bootstrap_features=True,
                      oob_score=True,
                      n_jobs=None,
                      random_state=13,
                      verbose=0)
br.fit(merged_train, y_train)
y_pred5 = br.predict(merged_cv)
print('RMSLE:', sqrt(mean_squared_log_error(y_cv, y_pred5)))
print('RMSE:', sqrt(mean_squared_error(y_cv, y_pred5)))
# RMSLE noted from an earlier run: 0.58019310689049

RMSLE: 0.5801075405788017
RMSE: 160.75846788596746


In [35]:
from scipy.sparse import hstack

# Feature matrix for the gradient-boosting and random-forest models:
# includes Feedback, leaves out Misc_Fees.
merged_train = hstack([
    X_train_exp,
    X_train_feedback,
    X_train_qual,
    X_train_prof,
    X_train_place,
    X_train_rating,
    X_train_misc,
])
merged_cv = hstack([
    X_cv_exp,
    X_cv_feedback,
    X_cv_qual,
    X_cv_prof,
    X_cv_place,
    X_cv_rating,
    X_cv_misc,
])

In [36]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting on the absolute-error loss, scored on the validation split.
# Updated parameter spellings (the old ones were removed from scikit-learn):
#   loss='lad'          -> loss='absolute_error'  (needs scikit-learn >= 1.0)
#   max_features='auto' -> max_features=None      (both mean "use all features")
gb = GradientBoostingRegressor(loss='absolute_error',
                               learning_rate=0.2,
                               random_state=10,
                               n_estimators=92,
                               max_depth=11,
                               subsample=1.0,
                               min_samples_split=40,
                               min_samples_leaf=1,
                               max_features=None)
gb.fit(merged_train, y_train)
y_pred3 = gb.predict(merged_cv)
print('RMSLE:', sqrt(mean_squared_log_error(y_cv, y_pred3)))

RMSLE: 0.5812561135063448


In [37]:
from sklearn.ensemble import RandomForestRegressor

# Random forest scored on the validation split.
# Updated parameter spellings (the old ones were removed from scikit-learn):
#   criterion='mse'     -> criterion='squared_error'  (needs scikit-learn >= 1.0)
#   max_features='auto' -> max_features=None          (both mean "use all features" for a regressor)
rf = RandomForestRegressor(n_estimators=29,
                           criterion='squared_error',
                           max_depth=58,
                           min_samples_split=5,
                           min_samples_leaf=2,
                           min_weight_fraction_leaf=0.0,
                           max_features=None,
                           max_leaf_nodes=None,
                           min_impurity_decrease=0.20,
                           bootstrap=True,
                           oob_score=True,
                           n_jobs=-1,
                           random_state=11)
rf.fit(merged_train, y_train)
y_pred4 = rf.predict(merged_cv)
print('RMSLE:', sqrt(mean_squared_log_error(y_cv, y_pred4)))

RMSLE: 0.5928716390933947


#### Now, predict on test set

In [38]:
# For the final predictions, refit on every labelled row and score the
# unlabelled test frame. This rebinds X_train / y_train from the split above.
y_train = df_train['Fees'].values
X_train = df_train.drop(columns='Fees')
X_test = df_test

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same vectorizer settings as in the validation run, now fitted on the full
# training frame and applied to the test frame.

# Qualification: unigrams and bigrams of tokens with at least 3 word characters, binary term counts.
tf1 = TfidfVectorizer(ngram_range=(1,2), binary=True, token_pattern=r'\w{3,}')
X_train_qual = tf1.fit_transform(X_train['Qualification'])
X_test_qual = tf1.transform(X_test['Qualification'])

# Place: default TfidfVectorizer settings.
tf3 = TfidfVectorizer()
X_train_place = tf3.fit_transform(X_train['Place'])
X_test_place = tf3.transform(X_test['Place'])

# Miscellaneous_Info: unigrams and bigrams, single-character tokens allowed.
tf4 = TfidfVectorizer(token_pattern=r'\w{1,}', ngram_range=(1,2))
X_train_misc = tf4.fit_transform(X_train['Miscellaneous_Info'])
X_test_misc = tf4.transform(X_test['Miscellaneous_Info'])

In [40]:
# Numeric features as (n_samples, 1) column arrays so they can be stacked
# next to the sparse TF-IDF matrices.
X_train_exp = X_train['Experience'].to_numpy().reshape(-1,1)
X_test_exp = X_test['Experience'].to_numpy().reshape(-1,1)

X_train_feedback = X_train['Feedback'].to_numpy().reshape(-1,1)
X_test_feedback = X_test['Feedback'].to_numpy().reshape(-1,1)

X_train_rating = X_train['Rating'].to_numpy().reshape(-1,1)
X_test_rating = X_test['Rating'].to_numpy().reshape(-1,1)

X_train_miscfees = X_train['Misc_Fees'].to_numpy().reshape(-1,1)
X_test_miscfees = X_test['Misc_Fees'].to_numpy().reshape(-1,1)

# One-hot profile columns; 'Profile_Ayurveda' is not in the list.
cols = [
    'Profile_Dentist',
    'Profile_Dermatologists',
    'Profile_ENT Specialist',
    'Profile_General Medicine',
    'Profile_Homeopath',
]
X_train_prof = X_train[cols]
X_test_prof = X_test[cols]

In [41]:
from scipy.sparse import hstack

# Feature matrix for the BaggingRegressor: includes Misc_Fees, leaves out Feedback.
merged_train = hstack([
    X_train_exp,
    X_train_qual,
    X_train_prof,
    X_train_place,
    X_train_rating,
    X_train_misc,
    X_train_miscfees,
])
merged_test = hstack([
    X_test_exp,
    X_test_qual,
    X_test_prof,
    X_test_place,
    X_test_rating,
    X_test_misc,
    X_test_miscfees,
])

In [44]:
merged_train.shape, merged_test.shape

((5961, 9982), (1987, 9982))

In [45]:
from sklearn.ensemble import BaggingRegressor

# Refit the bagged ensemble on all labelled rows and predict the test set.
# The `base_estimator=None` argument was dropped: None is the default, and the
# keyword was renamed to `estimator` and later removed in newer scikit-learn
# releases, so passing it breaks the cell there.
br = BaggingRegressor(n_estimators=80,
                      max_samples=1.0,
                      max_features=1.0,
                      bootstrap=True,
                      bootstrap_features=True,
                      oob_score=True,
                      n_jobs=None,
                      random_state=13,
                      verbose=0)
br.fit(merged_train, y_train)
y_pred5 = br.predict(merged_test)

In [46]:
from scipy.sparse import hstack

# Feature matrix for the gradient-boosting and random-forest models:
# includes Feedback, leaves out Misc_Fees.
merged_train = hstack([
    X_train_exp,
    X_train_feedback,
    X_train_qual,
    X_train_prof,
    X_train_place,
    X_train_rating,
    X_train_misc,
])
merged_test = hstack([
    X_test_exp,
    X_test_feedback,
    X_test_qual,
    X_test_prof,
    X_test_place,
    X_test_rating,
    X_test_misc,
])

In [47]:
merged_train.shape, merged_test.shape

((5961, 9982), (1987, 9982))

In [48]:
from sklearn.ensemble import GradientBoostingRegressor

# Refit gradient boosting on all labelled rows and predict the test set.
# Updated parameter spellings (the old ones were removed from scikit-learn):
#   loss='lad'          -> loss='absolute_error'  (needs scikit-learn >= 1.0)
#   max_features='auto' -> max_features=None      (both mean "use all features")
gb = GradientBoostingRegressor(loss='absolute_error',
                               learning_rate=0.2,
                               random_state=10,
                               n_estimators=92,
                               max_depth=11,
                               subsample=1.0,
                               min_samples_split=40,
                               min_samples_leaf=1,
                               max_features=None)
gb.fit(merged_train, y_train)
y_pred3 = gb.predict(merged_test)

In [49]:
from sklearn.ensemble import RandomForestRegressor

# Refit the random forest on all labelled rows and predict the test set.
# Updated parameter spellings (the old ones were removed from scikit-learn):
#   criterion='mse'     -> criterion='squared_error'  (needs scikit-learn >= 1.0)
#   max_features='auto' -> max_features=None          (both mean "use all features" for a regressor)
rf = RandomForestRegressor(n_estimators=29,
                           criterion='squared_error',
                           max_depth=58,
                           min_samples_split=5,
                           min_samples_leaf=2,
                           min_weight_fraction_leaf=0.0,
                           max_features=None,
                           max_leaf_nodes=None,
                           min_impurity_decrease=0.20,
                           bootstrap=True,
                           oob_score=True,
                           n_jobs=-1,
                           random_state=11)
rf.fit(merged_train, y_train)
# astype(int) truncates the predicted fee toward zero rather than rounding it.
y_pred1 = rf.predict(merged_test).astype(int)

#### write predictions to output file

In [50]:
# Write the random-forest predictions (y_pred1) to the submission workbook.
df_sub = pd.DataFrame(data=y_pred1, columns=['Fees'])
# The context manager saves and closes the workbook on exit;
# ExcelWriter.save() no longer exists in pandas 2.x.
with pd.ExcelWriter('Final_Sub.xlsx', engine='xlsxwriter') as writer:
    df_sub.to_excel(writer, sheet_name='Sheet1', index=False)

# Save the model using Pickle

In [76]:
# Save the fitted random forest using pickle
import pickle

filename = "pickleRFRfile.pkl"
# Use a context manager so the file handle is flushed and closed
# (the previous open(...) call was never closed).
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)
# NOTE(review): only `rf` is saved. The TF-IDF vectorizers (tf1, tf3, tf4) that
# built its input matrix are not, so the pickle alone cannot score raw data.


# Conclusion :
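On the 25% hold-out split the models reached a validation RMSLE of about 0.580 (BaggingRegressor), 0.581 (GradientBoostingRegressor) and 0.593 (RandomForestRegressor). RMSLE is an error metric, so lower is better, and these values are not accuracy percentages: the Random Forest Regressor, whose predictions are written to the submission file and whose fitted model is pickled, had the highest error of the three on this split, while the BaggingRegressor had the lowest. Finally, we successfully created a regression machine learning model using Python and its powerful libraries which predicts the fee a doctor charges for a consultation.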

