### Imports

In [1]:
# Importing essential libraries

import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Display all the columns of the dataframe (no "..." truncation of wide frames).
# Use the documented top-level `pd.set_option`; `pd.pandas` is only an
# accidental alias of the package itself and is not part of the public API.
pd.set_option('display.max_columns', None)

### Load in the Data and Examine

In [2]:
# %%HTML
# <style type="text/css">
# table.dataframe td, table.dataframe th {
#     border: 1px  black solid !important;
#   color: black !important;
# }
# </style>

In [3]:
# Locations of the raw train / test CSV files (local Windows paths).
file_train = r'D:\Machine Learning\Python Projects\Project-3\counterfeit_train.csv'
file_test = r'D:\Machine Learning\Python Projects\Project-3\counterfeit_test.csv'

# Read both splits into DataFrames.
cf_train, cf_test = pd.read_csv(file_train), pd.read_csv(file_test)

# Report the (rows, columns) dimensions of each split.
print(f'shape of train data: {cf_train.shape}')
print(f'shape of test data: {cf_test.shape}')

shape of train data: (6818, 12)
shape of test data: (1705, 11)


In [4]:
# The test split ships without the outcome variable. Add it as an all-NaN
# placeholder column so both splits expose the same columns and can be
# combined into a single frame.
cf_test = cf_test.assign(Counterfeit_Sales=np.nan)

Combine the two datasets so that they are pre-processed together: the data on which the model is trained must undergo exactly the same preprocessing as the data on which predictions are made.

In [5]:
# Tag every row with the split it came from so the combined frame can be
# separated again once preprocessing is done.
cf_train['data'] = 'train'
cf_test['data'] = 'test'

# pd.concat aligns columns by name, so this reordering is not strictly needed
# for the concatenation itself; it is kept because it fails fast (KeyError)
# if the test frame is missing any column that the train frame has.
cf_test = cf_test[cf_train.columns]

# ignore_index=True gives the combined frame a unique 0..n-1 index. Without
# it each split keeps its own 0-based labels, so every test label would
# duplicate a train label and label-based lookups would be ambiguous.
cf_all = pd.concat([cf_train, cf_test], axis=0, ignore_index=True)
cf_all.head()

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales,data
0,RRA15,13.1,Area046,1995,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026,train
1,YVV26,,Area027,1983,110.4384,Mstablizers,mild,0.013,CityLimits,Tier 3,Medium,3069.152,train
2,LJC15,9.025,Area046,1995,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.092,train
3,GWC40,11.8,Area046,1995,99.983,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.713,train
4,QMN13,,Area019,1983,56.4402,Hreplacements,critical,0.248859,MidTownResidential,Tier 1,Small,158.9402,train


In [6]:
# Work on a copy so the combined frame `cf_all` itself is left untouched.
cf = cf_all.copy()

# Number of rows and columns of the overall data (train and test together).
print(f'shape of overall data: {cf_all.shape}')

shape of overall data: (8523, 13)


### After analyzing each feature

In [7]:
# 'Medicine_ID'---> Drop this column
# 'Counterfeit_Weight'---> Replace missing values with median
# 'DistArea_ID' ---> create dummies
# 'Medicine_Type'---> create dummies
# 'SidEffect_Level' --->create dummies
# 'Area_Type'--->create dummies
# 'Area_City_Type'---> create dummies
# 'Area_dist_level'---> create dummies

In [8]:
### Categorical Features
# Names of all object-dtype columns, in their original column order.
column_dtypes = cf.dtypes
categorical_features = column_dtypes[column_dtypes == 'O'].index.tolist()
categorical_features


['Medicine_ID',
 'DistArea_ID',
 'Medicine_Type',
 'SidEffect_Level',
 'Area_Type',
 'Area_City_Type',
 'Area_dist_level',
 'data']

In [9]:
# Shape of the combined frame before the categorical columns are encoded.
print(f'Before creating dummies: {cf.shape}')

Before creating dummies: (8523, 13)


In [10]:
# Manually one-hot encode the categorical columns of `cf` (in place).
#
# For each encoded column:
#   * only categories occurring more than MIN_CATEGORY_COUNT times get a dummy;
#   * of those, the least frequent one is dropped ([:-1] on the descending
#     value_counts order) so it acts as the reference level;
#   * the original text column is removed afterwards.
MIN_CATEGORY_COUNT = 100

# Columns that are categorical by dtype but must not be encoded: the row
# identifier and the train/test marker added while combining the splits.
NON_FEATURE_COLUMNS = ('Medicine_ID', 'data')

for col in categorical_features:
    # Exact-name membership test. The previous `col not in 'Medicine_ID'` was
    # a *substring* test and would also have skipped a column named e.g. 'ID';
    # the previous `[:-1]` slice only worked while 'data' was the last column.
    if col in NON_FEATURE_COLUMNS:
        continue

    freqs = cf[col].value_counts()
    frequent_categories = freqs.index[freqs > MIN_CATEGORY_COUNT][:-1]
    for cat in frequent_categories:
        cf[col + '_' + cat] = (cf[col] == cat).astype(int)

    del cf[col]
    print(col)

DistArea_ID
Medicine_Type
SidEffect_Level
Area_Type
Area_City_Type
Area_dist_level


In [11]:
# Shape after encoding; the message previously read "Ater" (typo).
print('After creating dummies: {}'.format(cf.shape))

Ater creating dummies: (8523, 39)


In [12]:
# Inspect the column dtypes after encoding; the encoded categorical columns
# have been replaced by integer dummy columns.
cf.dtypes

Medicine_ID                          object
Counterfeit_Weight                  float64
Active_Since                          int64
Medicine_MRP                        float64
Availability_rating                 float64
Counterfeit_Sales                   float64
data                                 object
DistArea_ID_Area027                   int32
DistArea_ID_Area013                   int32
DistArea_ID_Area046                   int32
DistArea_ID_Area049                   int32
DistArea_ID_Area035                   int32
DistArea_ID_Area045                   int32
DistArea_ID_Area018                   int32
DistArea_ID_Area017                   int32
DistArea_ID_Area010                   int32
Medicine_Type_Antibiotics             int32
Medicine_Type_Hreplacements           int32
Medicine_Type_Antiseptics             int32
Medicine_Type_OralContraceptives      int32
Medicine_Type_Antipyretics            int32
Medicine_Type_Cardiac                 int32
Medicine_Type_Mstablizers       

### Missing values

In [13]:
# Every column that contains at least one missing value.
# (The previous `> 1` threshold silently skipped columns with exactly one NaN.)
feature_with_nan = [feature for feature in cf.columns if cf[feature].isnull().any()]
print('feature: {}'.format(feature_with_nan))

# Per column: absolute number and percentage of missing values.
# The share is scaled to percent *before* rounding so that float noise such as
# 7.000000000000001 cannot appear in the printed value.
for feature in feature_with_nan:
    missing = cf[feature].isnull()
    print('Before treating {}: count {} values,{}%  Missing values'.format(
        feature, missing.sum(), np.round(missing.mean() * 100, 1)))

feature: ['Counterfeit_Weight', 'Counterfeit_Sales']
Before treating Counterfeit_Weight: count 1463 values,17.2%  Missing values
Before treating Counterfeit_Sales: count 1705 values,20.0%  Missing values


In [14]:
# 'Counterfeit_Sales' is missing only because it was added to the test rows as
# a NaN placeholder when the two splits were combined, so it is left alone.
# 'Counterfeit_Weight' is the only column with genuinely missing values:
# replace them with the median computed on the training rows only.
train_weight_median = cf.loc[cf['data'] == 'train', 'Counterfeit_Weight'].median()
cf['Counterfeit_Weight'] = cf['Counterfeit_Weight'].fillna(train_weight_median)

In [15]:
# Re-run the missing-value report for the same columns after imputation.
# The share is scaled to percent *before* rounding so that float noise such as
# 7.000000000000001 cannot appear in the printed value.
for feature in feature_with_nan:
    missing = cf[feature].isnull()
    print('After treating {}: count {} values,{}% Missing values'.format(
        feature, missing.sum(), np.round(missing.mean() * 100, 1)))

After treating Counterfeit_Weight: count 0 values,0.0% Missing values
After treating Counterfeit_Sales: count 1705 values,20.0% Missing values


In [16]:
# Data preprocessing is complete - the data is in the expected format

In [17]:
# Snapshot of the preprocessed frame; the train/test split below is taken from this copy.
cf_copy=cf.copy()

In [18]:
# Missing values per column; only the placeholder target of the test rows
# ('Counterfeit_Sales') is expected to be non-zero at this point.
cf_copy.isna().sum()

Medicine_ID                            0
Counterfeit_Weight                     0
Active_Since                           0
Medicine_MRP                           0
Availability_rating                    0
Counterfeit_Sales                   1705
data                                   0
DistArea_ID_Area027                    0
DistArea_ID_Area013                    0
DistArea_ID_Area046                    0
DistArea_ID_Area049                    0
DistArea_ID_Area035                    0
DistArea_ID_Area045                    0
DistArea_ID_Area018                    0
DistArea_ID_Area017                    0
DistArea_ID_Area010                    0
Medicine_Type_Antibiotics              0
Medicine_Type_Hreplacements            0
Medicine_Type_Antiseptics              0
Medicine_Type_OralContraceptives       0
Medicine_Type_Antipyretics             0
Medicine_Type_Cardiac                  0
Medicine_Type_Mstablizers              0
Medicine_Type_Tranquilizers            0
Medicine_Type_An

Let's separate our two data sets and remove the unnecessary columns that we added while combining them.

In [19]:
# Separate the train and test data.
# Each split is built as a new frame via `.drop(columns=...)`. The previous
# version deleted columns in place from a filtered slice of `cf_copy`
# (`del` / `drop(inplace=True)`), which is chained assignment and only stayed
# silent because warnings are globally suppressed.

# Training rows keep the target; only the helper 'data' marker is removed.
cf_train = cf_copy.loc[cf_copy['data'] == 'train'].drop(columns=['data'])

# Test rows lose both the marker and the NaN placeholder target.
cf_test = cf_copy.loc[cf_copy['data'] == 'test'].drop(columns=['data', 'Counterfeit_Sales'])

In [20]:
# Dimensions of each split after encoding and imputation.
print(f'shape of train data after feature engineering: {cf_train.shape}')
print(f'shape of test data after feature engineering: {cf_test.shape}')

shape of train data after feature engineering: (6818, 38)
shape of test data after feature engineering: (1705, 37)


### Feature Scaling

In [21]:
from sklearn.preprocessing import MinMaxScaler
# One scaler instance with default settings; it is fitted on the training
# features and then reused to transform both the train and the test features.
scaler=MinMaxScaler()

In [22]:
# Columns to scale in the training data: everything except the row identifier
# and the target.
excluded_from_scaling = {'Medicine_ID', 'Counterfeit_Sales'}
feature_scale_train = [column for column in cf_train.columns if column not in excluded_from_scaling]

In [23]:
# Display the list of columns that will be passed to the scaler.
feature_scale_train

['Counterfeit_Weight',
 'Active_Since',
 'Medicine_MRP',
 'Availability_rating',
 'DistArea_ID_Area027',
 'DistArea_ID_Area013',
 'DistArea_ID_Area046',
 'DistArea_ID_Area049',
 'DistArea_ID_Area035',
 'DistArea_ID_Area045',
 'DistArea_ID_Area018',
 'DistArea_ID_Area017',
 'DistArea_ID_Area010',
 'Medicine_Type_Antibiotics',
 'Medicine_Type_Hreplacements',
 'Medicine_Type_Antiseptics',
 'Medicine_Type_OralContraceptives',
 'Medicine_Type_Antipyretics',
 'Medicine_Type_Cardiac',
 'Medicine_Type_Mstablizers',
 'Medicine_Type_Tranquilizers',
 'Medicine_Type_Analgesics',
 'Medicine_Type_Antimalarial',
 'Medicine_Type_Antacids',
 'Medicine_Type_Statins',
 'Medicine_Type_MuscleRelaxants',
 'Medicine_Type_Antifungal',
 'SidEffect_Level_mild',
 'Area_Type_DownTown',
 'Area_Type_MidTownResidential',
 'Area_Type_CityLimits',
 'Area_City_Type_Tier 3',
 'Area_City_Type_Tier 2',
 'Area_dist_level_Medium',
 'Area_dist_level_Unknown',
 'Area_dist_level_Small']

In [24]:
# Fit the scaler on the training rows only; the test rows are later
# transformed with this same fitted scaler.
scaler.fit(cf_train[feature_scale_train])

MinMaxScaler()

In [25]:
# Scale the training features, then re-attach the 'Medicine_ID' and
# 'Counterfeit_Sales' columns, which are passed through unscaled.
scaled_train = pd.DataFrame(
    scaler.transform(cf_train[feature_scale_train]),
    columns=feature_scale_train,
)

# `scaled_train` carries a fresh 0..n-1 index, so the untouched columns get
# their index reset too; otherwise the column-wise concat could misalign rows.
train_id_and_target = cf_train[['Medicine_ID', 'Counterfeit_Sales']].reset_index(drop=True)
data_train = pd.concat([train_id_and_target, scaled_train], axis=1)

In [26]:
# Persist the processed training data (identifier, target, scaled features) without the index column.
data_train.to_csv('train.csv',index=False)

#### test data

In [27]:
# Columns to scale in the test data: everything except the row identifier.
# NOTE(review): these must match the columns the scaler was fitted on, in the
# same order - confirm if the preprocessing above is changed.
feature_scale_test = [column for column in cf_test.columns if column != 'Medicine_ID']

In [28]:
# Scale the test features with the already-fitted scaler, then re-attach the
# 'Medicine_ID' column, which is passed through unscaled.
scaled_test = pd.DataFrame(
    scaler.transform(cf_test[feature_scale_test]),
    columns=feature_scale_test,
)

# Reset the identifier's index so it lines up with the fresh 0..n-1 index of
# `scaled_test` in the column-wise concat.
test_ids = cf_test[['Medicine_ID']].reset_index(drop=True)
data_test = pd.concat([test_ids, scaled_test], axis=1)

In [31]:
# Persist the processed test data (identifier and scaled features) without the index column.
data_test.to_csv('test.csv',index=False)

In [32]:
# Final sanity check of the processed training frame: column list, non-null counts and dtypes.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6818 entries, 0 to 6817
Data columns (total 38 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Medicine_ID                       6818 non-null   object 
 1   Counterfeit_Sales                 6818 non-null   float64
 2   Counterfeit_Weight                6818 non-null   float64
 3   Active_Since                      6818 non-null   float64
 4   Medicine_MRP                      6818 non-null   float64
 5   Availability_rating               6818 non-null   float64
 6   DistArea_ID_Area027               6818 non-null   float64
 7   DistArea_ID_Area013               6818 non-null   float64
 8   DistArea_ID_Area046               6818 non-null   float64
 9   DistArea_ID_Area049               6818 non-null   float64
 10  DistArea_ID_Area035               6818 non-null   float64
 11  DistArea_ID_Area045               6818 non-null   float64
 12  DistAr