In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('clean_practo.csv')
df

Unnamed: 0,Speciality,Year_of_experience,Location,City,dp_score,consultation_fee,npv(votes),Degree
0,Dietitian/Nutritionist,4,Koramangala 7 Block,Bangalore,100,600,40,BSc
1,Dietitian/Nutritionist,14,Bandra West,Mumbai,96,2500,222,BSc
2,Dietitian/Nutritionist,16,Kandivali West,Mumbai,99,0,96,BSc
3,Dietitian/Nutritionist,15,Koramangala 3 Block,Bangalore,97,1770,270,BSc
4,Dietitian/Nutritionist,1,Koramangala 5 Block,Bangalore,99,600,121,BSc
...,...,...,...,...,...,...,...,...
5981,Dietitian/Nutritionist,17,Greater Kailash Part 1,Delhi,97,300,34,PG
5982,Dietitian/Nutritionist,26,Kandivali West,Mumbai,96,500,168,PG
5983,Dietitian/Nutritionist,24,Kandivali West,Mumbai,93,0,15,PG
5984,Dietitian/Nutritionist,13,Shalimar Bagh,Delhi,92,450,37,PG


#### Outliers Analysis:

In [3]:
for i in df.select_dtypes(include='number'):
    if i!='Consultation fee' :
        q1=df[i].quantile(0.25)
        q3=df[i].quantile(0.75)
        iqr=q3-q1
        df=df[(df[i]>=q1-1.5*iqr) & (df[i]<=q3+1.5*iqr)]

In [4]:
df.shape

(4578, 8)

#### Encoding:

In [5]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [6]:
# LabelEncoding
col=['Degree','Location','Speciality','City']
for j in col:
    df[j]=le.fit_transform(df[j])

In [7]:
df.head()

Unnamed: 0,Speciality,Year_of_experience,Location,City,dp_score,consultation_fee,npv(votes),Degree
0,5,4,214,0,100,600,40,5
2,5,16,192,2,99,0,96,5
4,5,1,212,0,99,600,121,5
5,5,14,32,0,97,850,117,5
7,5,23,251,0,100,1000,106,0


In [8]:
#OneHotEncoding
#df = pd.get_dummies(df, columns = ['Speciality','City'])

In [9]:
df


Unnamed: 0,Speciality,Year_of_experience,Location,City,dp_score,consultation_fee,npv(votes),Degree
0,5,4,214,0,100,600,40,5
2,5,16,192,2,99,0,96,5
4,5,1,212,0,99,600,121,5
5,5,14,32,0,97,850,117,5
7,5,23,251,0,100,1000,106,0
...,...,...,...,...,...,...,...,...
5981,5,17,115,1,97,300,34,27
5982,5,26,192,2,96,500,168,27
5983,5,24,192,2,93,0,15,27
5984,5,13,371,1,92,450,37,27


#### Scaling:

In [10]:
x =df.drop(['consultation_fee'], axis=1)
y =df['consultation_fee']

In [11]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [12]:
x = sc.fit_transform(x)

## Linear Regression

In [13]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=42)

In [14]:
from sklearn.linear_model import LinearRegression
lr=LinearRegression()

In [15]:
lr.fit(x_train, y_train)

In [16]:
y_pred_lr=lr.predict(x_test)

In [17]:
from sklearn import metrics

In [18]:
# Mean Square Error(MSE)
MSE=metrics.mean_squared_error(y_test, y_pred_lr)
print('MSE =',MSE)
# Root mean square error(RSME)
RMSE=np.sqrt(MSE)
print('RMSE =',RMSE)
# Coefficient of determination or R-squared
R2=metrics.r2_score(y_test,y_pred_lr)
print('R-squared =',R2)

MSE = 83034.26667191245
RMSE = 288.1566703581794
R-squared = 0.2925402029065314


In [19]:
print('Coefficients:',lr.coef_)
print('Intercept:',lr.intercept_)

Coefficients: [ 96.81949896  73.18423086  28.19121755 100.67194652 -39.64462451
 -13.05059932  72.90803021]
Intercept: 744.7062651427473


In [20]:
import pickle

In [21]:
# pickle.dump(lr,open('LR_fee_predict.pkl','wb'))

## Random Forest Regressor

In [22]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators=100)

In [23]:
rfr.fit(x_train, y_train)

In [24]:
y_pred_rfr = rfr.predict(x_test)

In [25]:
# Mean Square Error(MSE)
MSE=metrics.mean_squared_error(y_test, y_pred_rfr)
print('MSE =',MSE)
# Root mean square error(RSME)
RMSE=np.sqrt(MSE)
print('RMSE =',RMSE)
# Coefficient of determination or R-squared
R2=metrics.r2_score(y_test,y_pred_rfr)
print('R-squared =',R2)

MSE = 54214.27496505741
RMSE = 232.83959063066877
R-squared = 0.5380892551517773


In [26]:
# pickle.dump(rfr,open('RFR_fee_predict.pkl','wb'))

## Gradient Boosting Regressor

In [27]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(n_estimators=100)

In [28]:
gbr.fit(x_train, y_train)

In [29]:
y_pred_gbr = gbr.predict(x_test)

In [30]:
# Mean Square Error(MSE)
MSE=metrics.mean_squared_error(y_test, y_pred_gbr)
print('MSE =',MSE)
# Root mean square error(RSME)
RMSE=np.sqrt(MSE)
print('RMSE =',RMSE)
# Coefficient of determination or R-squared
R2=metrics.r2_score(y_test,y_pred_gbr)
print('R-squared =',R2)

MSE = 54015.22677248151
RMSE = 232.41176126108917
R-squared = 0.5397851645585283


In [31]:
# pickle.dump(gbr,open('GBR_fee_predict.pkl','wb'))

## Decision Tree Regresser:

In [32]:
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor(max_depth=3)

In [33]:
dtr.fit(x_train, y_train)

In [34]:
y_pred_dtr = dtr.predict(x_test)

In [35]:
# Mean Square Error(MSE)
MSE=metrics.mean_squared_error(y_test, y_pred_dtr)
print('MSE =',MSE)
# Root mean square error(RSME)
RMSE=np.sqrt(MSE)
print('RMSE =',RMSE)
# Coefficient of determination or R-squared
R2=metrics.r2_score(y_test,y_pred_dtr)
print('R-squared =',R2)

MSE = 67793.32715594812
RMSE = 260.37151755894524
R-squared = 0.42239444754123767


In [36]:
# pickle.dump(dtr,open('DTR_fee_predict.pkl','wb'))

## AdaBoost Regressor

In [37]:
from sklearn.ensemble import AdaBoostRegressor
tree_regressor = DecisionTreeRegressor(max_depth=3)
ada_r = AdaBoostRegressor(base_estimator=tree_regressor, n_estimators=100, learning_rate=0.1)

In [38]:
ada_r.fit(x_train, y_train)



In [39]:
y_pred_ada = ada_r.predict(x_test)

In [40]:
# Mean Square Error(MSE)
MSE=metrics.mean_squared_error(y_test, y_pred_ada)
print('MSE =',MSE)
# Root mean square error(RSME)
RMSE=np.sqrt(MSE)
print('RMSE =',RMSE)
# Coefficient of determination or R-squared
R2=metrics.r2_score(y_test,y_pred_ada)
print('R-squared =',R2)

MSE = 66776.45224915892
RMSE = 258.41140115938947
R-squared = 0.43105831782107085


In [41]:
# pickle.dump(ada,open('ADA_fee_predict.pkl','wb'))

## EXREMELY RANDOMIZED TREE

In [42]:
import time
from sklearn.ensemble import ExtraTreesRegressor
start_time = time.time()
end_time = time.time()
ert = ExtraTreesRegressor(max_depth=25,
                             n_estimators=400,
                             bootstrap=True,
                             max_samples=0.7)

In [43]:
ert.fit(x_train,y_train)
training_time = end_time - start_time
print("Training time:", training_time, "seconds")

Training time: 7.867813110351562e-06 seconds


In [44]:
y_pred_ert = ert.predict(x_test)

In [45]:
# Mean Square Error(MSE)
MSE=metrics.mean_squared_error(y_test, y_pred_ert)
print('MSE =',MSE)
# Root mean square error(RSME)
RMSE=np.sqrt(MSE)
print('RMSE =',RMSE)
# Coefficient of determination or R-squared
R2=metrics.r2_score(y_test,y_pred_ert)
print('R-squared =',R2)

MSE = 55821.47189960171
RMSE = 236.26568074860495
R-squared = 0.5243957854220498


In [46]:
# pickle.dump(ert,open('ERT_fee_predict.pkl','wb'))

In [47]:
# model=pickle.load(open('ERT_fee_predict.pkl','rb'))