In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score,GridSearchCV,RandomizedSearchCV,train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,accuracy_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRFRegressor,XGBRegressor

from mypipes import *

import warnings
warnings.filterwarnings('ignore') # ignore the warnings.

import seaborn as sb
import matplotlib.pyplot as plt

In [2]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.6f} (std: {1:.6f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [3]:
datafile_train=r'D:/Pryanka/Edvancer/data/counterfeit_train.csv'
datafile_test=r'D:/Pryanka/Edvancer/data/counterfeit_test.csv'
bd_train=pd.read_csv(datafile_train)
bd_test=pd.read_csv(datafile_test)

In [4]:
bd_train.isnull().sum()

Medicine_ID               0
Counterfeit_Weight     1166
DistArea_ID               0
Active_Since              0
Medicine_MRP              0
Medicine_Type             0
SidEffect_Level           0
Availability_rating       0
Area_Type                 0
Area_City_Type            0
Area_dist_level           0
Counterfeit_Sales         0
dtype: int64

In [5]:
bd_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6818 entries, 0 to 6817
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Medicine_ID          6818 non-null   object 
 1   Counterfeit_Weight   5652 non-null   float64
 2   DistArea_ID          6818 non-null   object 
 3   Active_Since         6818 non-null   int64  
 4   Medicine_MRP         6818 non-null   float64
 5   Medicine_Type        6818 non-null   object 
 6   SidEffect_Level      6818 non-null   object 
 7   Availability_rating  6818 non-null   float64
 8   Area_Type            6818 non-null   object 
 9   Area_City_Type       6818 non-null   object 
 10  Area_dist_level      6818 non-null   object 
 11  Counterfeit_Sales    6818 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 639.3+ KB


In [6]:
bd_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1705 entries, 0 to 1704
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Medicine_ID          1705 non-null   object 
 1   Counterfeit_Weight   1408 non-null   float64
 2   DistArea_ID          1705 non-null   object 
 3   Active_Since         1705 non-null   int64  
 4   Medicine_MRP         1705 non-null   float64
 5   Medicine_Type        1705 non-null   object 
 6   SidEffect_Level      1705 non-null   object 
 7   Availability_rating  1705 non-null   float64
 8   Area_Type            1705 non-null   object 
 9   Area_City_Type       1705 non-null   object 
 10  Area_dist_level      1705 non-null   object 
dtypes: float64(3), int64(1), object(7)
memory usage: 146.7+ KB


In [7]:
bd_train['Active_Since'] = 2023 - bd_train.Active_Since.astype('int')
bd_test['Active_Since'] = 2023 - bd_test.Active_Since.astype('int')

In [8]:
bd_train.dtypes

Medicine_ID             object
Counterfeit_Weight     float64
DistArea_ID             object
Active_Since             int32
Medicine_MRP           float64
Medicine_Type           object
SidEffect_Level         object
Availability_rating    float64
Area_Type               object
Area_City_Type          object
Area_dist_level         object
Counterfeit_Sales      float64
dtype: object

In [9]:
bd_train[['Counterfeit_Weight','Medicine_MRP','Availability_rating']].dtypes

Counterfeit_Weight     float64
Medicine_MRP           float64
Availability_rating    float64
dtype: object

In [10]:
bd_train.head()

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales
0,RRA15,13.1,Area046,28,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026
1,YVV26,,Area027,40,110.4384,Mstablizers,mild,0.013,CityLimits,Tier 3,Medium,3069.152
2,LJC15,9.025,Area046,28,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.092
3,GWC40,11.8,Area046,28,99.983,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.713
4,QMN13,,Area019,40,56.4402,Hreplacements,critical,0.248859,MidTownResidential,Tier 1,Small,158.9402


In [11]:
pipe1 = pdPipeline([
    ('columns_selection',VarSelector(['DistArea_ID','Medicine_Type','SidEffect_Level','Area_Type','Area_City_Type','Area_dist_level'])),
    ('data_impute',DataFrameImputer()),
    ('get_dummy',get_dummies_Pipe())  
])
pipe2 = pdPipeline([
    ('data_impute',DataFrameImputer()),
    ('',VarSelector(['Counterfeit_Weight','Active_Since','Medicine_MRP','Availability_rating']))
])

In [12]:
union_pipe = FeatureUnion([
    ('create dummy',pipe1),
    ('select columns',pipe2)    
])


In [13]:
union_pipe.fit(bd_train)

In [14]:
x_bd_train = pd.DataFrame(data = union_pipe.fit_transform(bd_train) , columns = union_pipe.get_feature_names_out())
x_bd_train

Unnamed: 0,create dummy__DistArea_ID_Area017,create dummy__DistArea_ID_Area046,create dummy__DistArea_ID_Area013,create dummy__DistArea_ID_Area035,create dummy__DistArea_ID_Area049,create dummy__DistArea_ID_Area045,create dummy__DistArea_ID_Area027,create dummy__DistArea_ID_Area018,create dummy__DistArea_ID_Area010,create dummy__Medicine_Type_Hreplacements,...,create dummy__Area_Type_CityLimits,create dummy__Area_City_Type_Tier 3,create dummy__Area_City_Type_Tier 2,create dummy__Area_dist_level_Medium,create dummy__Area_dist_level_Unknown,create dummy__Area_dist_level_Small,select columns__Counterfeit_Weight,select columns__Active_Since,select columns__Medicine_MRP,select columns__Availability_rating
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,13.100,28.0,160.2366,0.070422
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,13.800,40.0,110.4384,0.013000
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,9.025,28.0,259.4092,0.060783
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,11.800,28.0,99.9830,0.065555
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,13.800,40.0,56.4402,0.248859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6813,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,8.535,28.0,204.1452,0.112963
6814,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,20.650,28.0,235.1088,0.131103
6815,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,20.000,18.0,193.6292,0.105096
6816,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,10.180,23.0,162.8682,0.099957


In [31]:
x_bd_test = pd.DataFrame(data = union_pipe.transform(bd_test) , columns = union_pipe.get_feature_names_out())
x_bd_test

Unnamed: 0,create dummy__DistArea_ID_Area017,create dummy__DistArea_ID_Area046,create dummy__DistArea_ID_Area013,create dummy__DistArea_ID_Area035,create dummy__DistArea_ID_Area049,create dummy__DistArea_ID_Area045,create dummy__DistArea_ID_Area027,create dummy__DistArea_ID_Area018,create dummy__DistArea_ID_Area010,create dummy__Medicine_Type_Hreplacements,...,create dummy__Area_Type_CityLimits,create dummy__Area_City_Type_Tier 3,create dummy__Area_City_Type_Tier 2,create dummy__Area_dist_level_Medium,create dummy__Area_dist_level_Unknown,create dummy__Area_dist_level_Small,select columns__Counterfeit_Weight,select columns__Active_Since,select columns__Medicine_MRP,select columns__Availability_rating
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,13.80,40.0,85.5328,0.112747
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,13.45,23.0,257.1460,0.144446
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,7.10,23.0,98.1172,0.144221
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,18.30,27.0,135.3730,0.100388
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,13.80,40.0,112.8016,0.022585
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1700,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,13.80,40.0,136.5704,0.050505
1701,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,21.30,21.0,57.0744,0.041118
1702,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,20.40,18.0,182.7422,0.191273
1703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,20.00,16.0,266.9672,0.013000


In [32]:
y_bd_train = bd_train['Counterfeit_Sales']

In [33]:
X_train,X_test,Y_train,Y_test = train_test_split(x_bd_train, y_bd_train, test_size=0.2,random_state= 32)

In [34]:
xgb_reg_model = XGBRegressor()

param_grid = {
    'max_depth': [3, 4, 5, 10, 15],
    'n_estimators': [100, 150, 200, 400, 500],
    'learning_rate': [0.01, 0.1, 0.2, .001],
    'reg_alpha': [0, 0.5, 1, 0.25],
    'reg_lambda': [1, 1.5, 2, 1.25],
    'gamma': [0.1, 0.2, .05]
}
## You can use GridSearchCV for better search (however it would be slow)
xgb_search = RandomizedSearchCV(xgb_reg_model,param_distributions=param_grid,
                                    cv = 5,
                                    scoring='neg_mean_squared_error',
                                    verbose=1)



In [35]:
xgb_search.fit(X_train,Y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [36]:
Y_predict_train = xgb_search.predict(X_train)

In [37]:
Y_predict = xgb_search.predict(X_test)

In [38]:
def score(data,model, output):
    predict = xgb_search.predict(data)
    mse = mean_absolute_error(output,predict)
    r2 = r2_score(output, predict)
    return {'score': 1-(mse/1660), 
            'mse': mse, 
            'predict': predict, 
            'Rsquare' : r2}

In [39]:
score(X_train, xgb_search, Y_train)

{'score': 0.5569770590489854,
 'mse': 735.4180819786842,
 'predict': array([2109.3638, 3254.503 , 2634.812 , ..., 2260.2874, 3252.9492,
        3023.7856], dtype=float32),
 'Rsquare': 0.619720577360845}

In [40]:
score(X_test, xgb_search, Y_test)

{'score': 0.562641691991266,
 'mse': 726.0147912944984,
 'predict': array([1993.5685, 2069.4172, 2023.8237, ..., 3116.4539, 1694.4031,
        2010.6492], dtype=float32),
 'Rsquare': 0.618745079069808}

In [41]:
rf_reg_model = RandomForestRegressor()

rf_reg_params = {
    'n_estimators':[100,200,300,500], #number of individual decision trees to be created
    'max_features': ['auto','sqrt'], #how many features would be available at a split
    'bootstrap': [True], #should different data subsets go in or not
    'max_depth':[2, 5,10,15,20],
    'min_samples_leaf':[5,10,15,20],
    'min_samples_split':[5,10,15,20]
}

rf_reg_random_search = RandomizedSearchCV(rf_reg_model,param_distributions=rf_reg_params,
                                             cv = 5,
                                    n_iter=50,
                                    scoring='neg_mean_squared_error',
                                    n_jobs=-1,verbose=1)

rf_reg_random_search.fit(X_train,Y_train)
report(rf_reg_random_search.cv_results_,3)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Model with rank: 1
Mean validation score: -1223316.968687 (std: 53396.172427)
Parameters: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 15, 'bootstrap': True}

Model with rank: 2
Mean validation score: -1228506.063574 (std: 57660.261667)
Parameters: {'n_estimators': 500, 'min_samples_split': 15, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 15, 'bootstrap': True}

Model with rank: 3
Mean validation score: -1237997.414159 (std: 57845.261394)
Parameters: {'n_estimators': 300, 'min_samples_split': 20, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 15, 'bootstrap': True}



In [42]:
rf_reg = rf_reg_random_search.best_estimator_

In [43]:
rf_reg.fit(X_train,Y_train)

In [44]:
score(X_test, rf_reg, Y_test)

{'score': 0.562641691991266,
 'mse': 726.0147912944984,
 'predict': array([1993.5685, 2069.4172, 2023.8237, ..., 3116.4539, 1694.4031,
        2010.6492], dtype=float32),
 'Rsquare': 0.618745079069808}

In [50]:
temp = bd_test['Medicine_ID']

In [48]:
transformed_data_sub = pd.DataFrame(data = transformed_data_sub, columns= ['Counterfeit_Sales'])

NameError: name 'transformed_data_sub' is not defined

In [46]:
submissions = pd.concat([temp,transformed_data_sub],axis = 1)

NameError: name 'transformed_data_sub' is not defined

In [47]:
submissions

NameError: name 'submissions' is not defined

In [None]:
submissions.to_csv('D:\Pryanka\Edvancer\xg_submission.csv',index=False)

In [None]:
submissions.info()

In [None]:
Lm_1 = LinearRegression()
Lm_1.fit(X_train,Y_train) 

In [None]:
def model_func(x, a, b, c):
    return a*np.power(c, x) + b

In [None]:
popt, pcov = curve_fit(model_func,np.array(Y_predict)[:,0],np.array(Y_test)[:,0])

In [None]:
a_fit, b_fit, c_fit = popt

In [None]:
print("Optimized parameters (a, b, c):", a_fit, b_fit, c_fit)

In [None]:
# Transform the data
transformed_data = np.array(a_fit* (c_fit)**Y_predict + b_fit )
# # _
#transformed_data = np.log(yy - 5.822805670508793e-06) / np.log(1.000323950745208) - np.log(862.40217299746 ) / np.log(1.000323950745208)

print("Original data:", Y_test)
print("Transformed data:", transformed_data)


In [None]:
transformed_data =  pd.DataFrame(data = transformed_data , columns = ['Predicted Counterfeit_Sales'])

In [None]:
sb.jointplot(data = pd.concat([Y_test,transformed_data],axis  = 1 ), y = 'Actual Counterfeit_Sales' , x ='Predicted Counterfeit_Sales')

In [None]:
r2_score(Y_test,transformed_data)

In [None]:
score = 1-(mean_absolute_error(Y_test,transformed_data)/1660)
score

In [None]:
ir = IsotonicRegression(out_of_bounds="clip")
ir.fit(np.array(transformed_data)[:,0],np.array(Y_test)[:,0])

In [None]:
Y_ = ir.predict(transformed_data)
Y_predict =  pd.DataFrame(data = Y_ , columns = ['Predicted Counterfeit_Sales'])

In [None]:
sb.jointplot(data = pd.concat([Y_test,Y_predict],axis  = 1 ), x = 'Actual Counterfeit_Sales' , y ='Predicted Counterfeit_Sales')

In [None]:
r2_score(Y_test,Y_predict)

In [None]:
score = 1-(mean_absolute_error(Y_test,Y_predict)/1660)
score

In [None]:
##my code

In [None]:
#bd_train['Medicine_ID'].nunique()

In [None]:
#bd_train.shape

In [None]:
#for col in [ 'Medicine_ID', 'Counterfeit_Weight']:
  #  bd_train.drop(col,axis=1,inplace=True)
   # bd_test.drop(col,axis=1,inplace=True)

In [None]:
#for col in ['Medicine_Type','SidEffect_Level','Area_Type','Area_City_Type','Area_dist_level',"DistArea_ID"]:  
 #   temp=pd.get_dummies(bd_train[col],prefix=col,drop_first=True)
  #  bd_train=pd.concat([temp,bd_train],axis=1)
   # bd_train.drop([col],axis=1,inplace=True)
       
    #temp=pd.get_dummies(bd_test[col],prefix=col,drop_first=True)
    #bd_test=pd.concat([temp,bd_test],axis=1)
    #bd_test.drop([col],axis=1,inplace=True)

In [None]:
#bd_train.shape

In [None]:
#target='Counterfeit_Sales'

In [None]:
#x_train=bd_train.drop(target,axis=1)
#y_train=bd_train[target]

In [None]:
#y_bd_train = pd.DataFrame(data = np.array(bd_train.Counterfeit_Sales) , columns = ['Actual Counterfeit_Sales'])

In [None]:
#X_train,X_test,Y_train,Y_test = train_test_split(x_bd_train, y_bd_train, test_size=0.2,random_state= 32)

In [None]:
#Y_test =  pd.DataFrame(data = np.array(Y_test) , columns = ['Actual Counterfeit_Sales'])
#Y_test

In [None]:
#model.score(X_train,y_train)

In [None]:
#params={'alpha':np.linspace(0.1,100,50)}

In [None]:
#model=Lasso(fit_intercept=True)

In [None]:
#grid_search=GridSearchCV(model,cv=10,param_grid=params,n_jobs=-1,verbose=10,
                         scoring='neg_mean_absolute_error')

In [None]:
#grid_search.fit(x_train,y_train)

In [None]:
#submissions=pd.DataFrame({target:grid_search.predict(bd_test)})

In [None]:
#submissions.to_csv('D:/Pryanka/Edvancer/Pharma_submission.csv',index=False)