# Imports
- Python version: '3.8.5'
- numpy version: '1.19.2'
- pandas version: '1.2.3'
- tensorflow version: '2.3.0'

In [1]:
import numpy as np
import pandas as pd

from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras.callbacks import Callback

# Useful Functions

- `auto_emb` auto-encodes the dummy columns to reduce the number of dimensions
- TensorFlow (Keras) is used to implement this deep-learning technique
- The callback automatically stops the training once accuracy = 1 is reached

In [2]:
class My_callback(Callback):
    """Keras callback that stops training once the epoch's "accuracy" metric equals 1.0."""

    def on_epoch_end(self, epoch, logs=None):
        """Request early stopping when the reported accuracy is exactly 1.0.

        Parameters
        ----------
        epoch : int
            Index of the epoch that just finished (unused).
        logs : dict or None
            Metric values reported for the epoch; only the "accuracy" key is read.
        """
        # `logs` defaults to None instead of a shared mutable dict; a missing
        # dict or a missing "accuracy" key simply means "do not stop".
        if (logs or {}).get("accuracy") == 1.0:
            print("\nReached Accuracy = 1")
            print("Hence training stopped!\n")
            self.model.stop_training = True


# Module-level instance that auto_emb passes to model.fit.
my_callback = My_callback()

In [3]:
def auto_emb(temp, emb_dim, colname, epochs = 100):
    """Compress dummy columns into ``emb_dim`` dense features with an auto-encoder.

    The network is input -> Dense(relu) -> Dense(emb_dim, linear) -> Dense(softmax)
    and is trained to reproduce its own input. The activations of the
    ``emb_dim``-unit layer are returned as the embedding.

    Parameters
    ----------
    temp : pandas.DataFrame or 2-D array
        Dummy matrix to compress. The softmax / categorical cross-entropy
        reconstruction presumably expects each row to be one-hot -- verify
        against the caller if categories were dropped.
    emb_dim : int
        Number of embedding features to produce.
    colname : str
        Prefix for the output columns, named ``<colname>_emb1`` ... ``<colname>_emb<emb_dim>``.
    epochs : int, default 100
        Maximum number of training epochs; the module-level ``my_callback``
        stops training earlier once accuracy reaches 1.

    Returns
    -------
    pandas.DataFrame
        One row per input row, indexed like ``temp`` when ``temp`` has an index.
    """
    import warnings  # local import so the cell does not depend on the import cell changing

    n_features = temp.shape[1]
    # Hidden width sits between input and embedding size; clamp to 1 so narrow
    # inputs (n_features - emb_dim < 2) do not request a zero/negative-width layer.
    dense_dim = max(1, (n_features - emb_dim) // 2)

    inputs = Input(shape=(n_features,))
    hidden = Dense(dense_dim, activation='relu')(inputs)
    embedded_output = Dense(emb_dim)(hidden)
    outputs = Dense(n_features, activation='softmax')(embedded_output)

    # `model` is the full auto-encoder that gets trained; `embedder` shares its
    # layers and exposes only the bottleneck activations.
    model = Model(inputs=inputs, outputs=outputs)
    embedder = Model(inputs=inputs, outputs=embedded_output)

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    history = model.fit(temp, temp, epochs=epochs, batch_size=100, callbacks=[my_callback])

    # The embedding is only a faithful re-encoding if reconstruction became
    # perfect; make a lossy result visible instead of returning it silently.
    accuracy_trace = history.history.get("accuracy") or [None]
    if accuracy_trace[-1] is None or accuracy_trace[-1] < 1.0:
        warnings.warn(
            "auto_emb: training for '%s' ended with accuracy %s (< 1); "
            "the embedding may not separate all categories."
            % (colname, accuracy_trace[-1])
        )

    cols = [colname + "_emb" + str(i + 1) for i in range(emb_dim)]

    # Keep the input's index so that pd.concat(axis=1) in the caller aligns rows
    # correctly even when the frame does not use a default RangeIndex.
    return pd.DataFrame(embedder.predict(temp), columns=cols, index=getattr(temp, "index", None))

- `getting_dummies` builds the dummy (one-hot) columns for a categorical variable
- It is used instead of `pandas.get_dummies` so that we can choose which categories to drop

In [4]:
def getting_dummies(dat,var,freq = 50, drp_flg = True, drp = None):
    """Build 0/1 dummy columns for the categories of ``dat[var]``.

    Parameters
    ----------
    dat : pandas.DataFrame
        Source frame; it is not modified.
    var : str
        Name of the categorical column to encode.
    freq : int, default 50
        Only categories occurring at least ``freq`` times get a dummy column.
    drp_flg : bool, default True
        When True, the least frequent of the retained categories is dropped.
    drp : list or None, default None
        Extra categories to exclude. Entries that are not among the retained
        categories are ignored.

    Returns
    -------
    pandas.DataFrame
        One integer column per retained category, named ``<var>_<category>``
        with spaces in the category replaced by underscores, indexed like ``dat``.
    """
    column = dat[var]

    # value_counts sorts by descending frequency, so the last retained entry
    # is the rarest category that still meets the threshold.
    counts = column.value_counts(ascending = False)
    tab = counts.index[counts >= freq].tolist()
    if drp_flg:
        tab = tab[:-1]

    # Filter instead of list.remove so an entry that was already excluded
    # (or never present) does not raise ValueError.
    if drp:
        tab = [category for category in tab if category not in drp]

    print("\n\n--------------------\n--------------------\nDummies for",var)
    print("no of new columns from",var,"is",len(tab))
    print(tab)

    # Start from the source index so the column assignments below align row
    # for row, whatever index `dat` carries.
    to_return_df = pd.DataFrame(index=dat.index)
    for category in tab:
        namee = var + "_" + str(category).replace(" ", "_")
        to_return_df[namee] = (column == category).astype(int)
    return to_return_df

# loading the data

In [5]:
# Load data/counterfeit_train.csv into the working frame `cct`; the path is
# relative to the current working directory.
cct = pd.read_csv(r"data/counterfeit_train.csv")

In [6]:
# Display the loaded frame (the notebook renders a truncated view of the first and last rows).
cct

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales
0,RRA15,13.100,Area046,1995,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026
1,YVV26,,Area027,1983,110.4384,Mstablizers,mild,0.013000,CityLimits,Tier 3,Medium,3069.1520
2,LJC15,9.025,Area046,1995,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.0920
3,GWC40,11.800,Area046,1995,99.9830,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.7130
4,QMN13,,Area019,1983,56.4402,Hreplacements,critical,0.248859,MidTownResidential,Tier 1,Small,158.9402
...,...,...,...,...,...,...,...,...,...,...,...,...
6813,OYN80,8.535,Area046,1995,204.1452,Hreplacements,mild,0.112963,DownTown,Tier 1,Small,2070.4520
6814,ACW12,20.650,Area046,1995,235.1088,Hreplacements,mild,0.131103,DownTown,Tier 1,Small,2126.3792
6815,OPM10,20.000,Area017,2005,193.6292,Antimalarial,critical,0.105096,DownTown,Tier 2,Unknown,2119.7212
6816,SLY12,10.180,Area045,2000,162.8682,Statins,mild,0.099957,DownTown,Tier 2,Unknown,1485.2138


# Handling Categorical Features

- For 'SidEffect_Level', 'mild' is the majority class
- Hence 'mild' is assigned 1 and 'critical' is assigned 0

In [7]:
# Binary-encode SidEffect_Level: rows equal to "mild" become 1, every other
# value (including missing) becomes 0.
# Not idempotent: re-running this cell after the column is already 0/1 sets every row to 0.
cct["SidEffect_Level"] = (cct["SidEffect_Level"] == "mild").astype(int)

- Make dummies for "Area_Type" and drop the category with the lowest frequency

In [8]:
# freq=0 retains every category; the default drp_flg=True then drops the least frequent one.
temp1 = getting_dummies(cct, "Area_Type", freq = 0)
# Append the dummy columns side by side; re-running this cell would append them a second time.
cct = pd.concat([cct, temp1], axis = 1)



--------------------
--------------------
Dummies for Area_Type
no of new columns from Area_Type is 3
['DownTown', 'MidTownResidential', 'CityLimits']


- Make dummies for "Area_City_Type" and drop the category with the lowest frequency

In [9]:
# freq=0 retains every category; the default drp_flg=True then drops the least frequent one.
temp1 = getting_dummies(cct, "Area_City_Type", freq = 0)
# Append the dummy columns side by side; re-running this cell would append them a second time.
cct = pd.concat([cct, temp1], axis = 1)



--------------------
--------------------
Dummies for Area_City_Type
no of new columns from Area_City_Type is 2
['Tier 3', 'Tier 2']


- Make dummies for "Area_dist_level" and drop the category 'Unknown'

In [10]:
# drp_flg=False keeps the least frequent category; drp=["Unknown"] removes the
# 'Unknown' category explicitly instead.
temp1 = getting_dummies(cct, "Area_dist_level", freq = 0, drp_flg = False, drp = ["Unknown"])
# Append the dummy columns side by side; re-running this cell would append them a second time.
cct = pd.concat([cct, temp1], axis = 1)



--------------------
--------------------
Dummies for Area_dist_level
no of new columns from Area_dist_level is 3
['Medium', 'Small', 'High']


- Make dummies for "Medicine_Type" without dropping anything
- Auto-encode the dummies to reduce their dimensions from 16 to 3 without any loss of information
- This is achieved by training the auto-encoder model to get accuracy = 1

In [11]:
# One dummy column per Medicine_Type category (freq=0, drp_flg=False: nothing is dropped).
temp1 = getting_dummies(cct, "Medicine_Type", freq = 0, drp_flg = False)
# Compress the dummies into 3 auto-encoder features named Medicine_Type_emb1..Medicine_Type_emb3.
# NOTE(review): no random seed is set in the visible cells, so the embedding
# values presumably differ between runs.
temp = auto_emb(temp1, emb_dim=3, colname="Medicine_Type")
# Only the embedding columns are appended; the raw dummies in temp1 are not kept in cct.
cct = pd.concat([cct, temp], axis = 1)



--------------------
--------------------
Dummies for Medicine_Type
no of new columns from Medicine_Type is 16
['Hreplacements', 'Antibiotics', 'Antiseptics', 'OralContraceptives', 'Antipyretics', 'Cardiac', 'Mstablizers', 'Tranquilizers', 'Analgesics', 'Antimalarial', 'Antacids', 'Statins', 'MuscleRelaxants', 'Antifungal', 'Stimulants', 'Antiviral']
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
 1/69 [..............................] - ETA: 0s - loss: 0.0227 - accuracy: 1.0000
Reached Accuracy = 1
Hence training stopped!



- Make dummies for "Active_Since" without dropping anything
- Auto-encode the dummies to reduce their dimensions from 9 to 3 without any loss of information
- This is achieved by training the auto-encoder model to get accuracy = 1

- Though dummies were made for "Active_Since" for exploratory purposes, it is best to use it as a numerical variable

In [12]:
# One dummy column per Active_Since value (freq=0, drp_flg=False: nothing is dropped).
temp1 = getting_dummies(cct, "Active_Since", freq = 0, drp_flg = False)
# Compress the dummies into 3 auto-encoder features named Active_Since_emb1..Active_Since_emb3.
# NOTE(review): no random seed is set in the visible cells, so the embedding
# values presumably differ between runs.
temp = auto_emb(temp1, emb_dim=3, colname="Active_Since")
# The original numeric Active_Since column stays in cct alongside the embedding columns.
cct = pd.concat([cct, temp], axis = 1)



--------------------
--------------------
Dummies for Active_Since
no of new columns from Active_Since is 9
[1983, 2005, 1985, 1995, 2002, 1997, 2000, 2007, 1996]
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
 1/69 [..............................] - ETA: 0s - loss: 0.5015 - accuracy: 1.0000
Reached Accuracy = 1
Hence training stopped!



- Make dummies for "DistArea_ID" without dropping anything
- Auto-encode the dummies to reduce their dimensions from 10 to 3 without any loss of information
- This is achieved by training the auto-encoder model to get accuracy = 1

* 'DistArea_ID' is highly correlated with 'Area_Type', 'Area_City_Type', 'Area_dist_level'
* The dummies from these columns are kept for modelling process to see which gives the best results
* Theoretically, 'DistArea_ID' should be enough to represent the other three columns

In [13]:
# One dummy column per DistArea_ID value (freq=0, drp_flg=False: nothing is dropped).
temp1 = getting_dummies(cct, "DistArea_ID", freq = 0, drp_flg = False)
# Compress the dummies into 3 auto-encoder features named DistArea_ID_emb1..DistArea_ID_emb3.
# NOTE(review): no random seed is set in the visible cells, so the embedding
# values presumably differ between runs.
temp = auto_emb(temp1, emb_dim=3, colname="DistArea_ID")
# Only the embedding columns are appended; the raw dummies in temp1 are not kept in cct.
cct = pd.concat([cct, temp], axis = 1)



--------------------
--------------------
Dummies for DistArea_ID
no of new columns from DistArea_ID is 10
['Area017', 'Area013', 'Area046', 'Area035', 'Area049', 'Area045', 'Area027', 'Area018', 'Area010', 'Area019']
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Reached Accuracy = 1
Hence training stopped!



# Imputing Missing Values

- The missing values of Counterfeit_Weight can be classified as MAR (Missing At Random)
- Medicine_ID is correlated with Counterfeit_Weight
- Hence the mode of Counterfeit_Weight for each Medicine_ID is used to fill the missing values of Counterfeit_Weight
- This leaves us with only 11 missing values in Counterfeit_Weight, which are later imputed with the median

In [14]:
# Most frequent observed Counterfeit_Weight per Medicine_ID.
# Missing weights are dropped first, so every group is non-empty and
# value_counts().index[0] is always defined. Medicines with no observed weight
# are absent from this mapping.
counter_groupby = (
    cct.dropna(subset=["Counterfeit_Weight"])
       .groupby("Medicine_ID")["Counterfeit_Weight"]
       .agg(lambda weights: weights.value_counts().index[0])
)

# Fill only the missing weights with the mode of the same Medicine_ID. Rows
# whose Medicine_ID has no observed weight map to NaN and stay missing.
# (The previous version grouped by both columns and iterated over the result's
# column labels, so no value was ever filled.)
cct["Counterfeit_Weight"] = cct["Counterfeit_Weight"].fillna(
    cct["Medicine_ID"].map(counter_groupby)
)

# Drop
- Drop the identifier column and the raw categorical columns that have been encoded above, since they carry no additional information

In [15]:
# Remove Medicine_ID and the five raw categorical columns; the dummy/embedding
# columns appended by the earlier cells remain in the frame.
cct = cct.drop(
    columns=[
        "Medicine_ID",
        "Medicine_Type",
        "Area_Type",
        "Area_City_Type",
        "Area_dist_level",
        "DistArea_ID",
    ]
)

# Rewrite Clean Data
- write this clean data out again to use it for different modelling purposes

In [16]:
# Write the processed frame to data/cct_edvance_cleanp3.csv (relative path);
# index=False keeps the row index out of the file.
cct.to_csv(r"data/cct_edvance_cleanp3.csv", index = False)

# Note
- This script deliberately omits all the data-testing code and graphs
- Including the code for the data exploration would make the script lengthy and messy
- Any more exploratory findings are welcome!