In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [2]:
train_data=r"G:\edvancer\ML python\project\p3\counterfeit_train.csv"
test_data=r"G:\edvancer\ML python\project\p3\counterfeit_test.csv"

In [3]:
#reading the train and test data CSV file
train=pd.read_csv(train_data)
test=pd.read_csv(test_data)

In [4]:
#creating dummy column in both train and test data
train['data']='train'
test['data']='test'

In [5]:
#clubbing both train and test data
all_data=pd.concat([train,test],axis=0)

In [6]:
list(zip(all_data.columns,all_data.isnull().sum(),all_data.nunique(),all_data.dtypes) )

[('Active_Since', 0, 9, dtype('int64')),
 ('Area_City_Type', 0, 3, dtype('O')),
 ('Area_Type', 0, 4, dtype('O')),
 ('Area_dist_level', 0, 4, dtype('O')),
 ('Availability_rating', 0, 7884, dtype('float64')),
 ('Counterfeit_Sales', 1705, 3142, dtype('float64')),
 ('Counterfeit_Weight', 1463, 415, dtype('float64')),
 ('DistArea_ID', 0, 10, dtype('O')),
 ('Medicine_ID', 0, 1557, dtype('O')),
 ('Medicine_MRP', 0, 5970, dtype('float64')),
 ('Medicine_Type', 0, 16, dtype('O')),
 ('SidEffect_Level', 0, 2, dtype('O')),
 ('data', 0, 2, dtype('O'))]

In [7]:
char_cols=all_data.select_dtypes(['object']).columns

In [8]:
char_cols=char_cols[:-1]

In [9]:
char_cols

Index(['Area_City_Type', 'Area_Type', 'Area_dist_level', 'DistArea_ID',
       'Medicine_ID', 'Medicine_Type', 'SidEffect_Level'],
      dtype='object')

In [10]:
#ignored 'Medicine_ID' as it has large number unique values
char_cols=['Area_City_Type', 'Area_Type', 'Area_dist_level', 'DistArea_ID', 'Medicine_Type', 'SidEffect_Level']

In [11]:
#creating dummies for the categorical variables
for col in char_cols:
    k=all_data[col].value_counts()
    cats=k.index[k>100]
    cats=cats[:-1]
    
    for cat in cats:
        name=col+'_'+cat
        all_data[name]=(all_data[col]==cat).astype(int)
    del all_data[col]

In [12]:
all_data.head()

Unnamed: 0,Active_Since,Availability_rating,Counterfeit_Sales,Counterfeit_Weight,Medicine_ID,Medicine_MRP,data,Area_City_Type_Tier 3,Area_City_Type_Tier 2,Area_Type_DownTown,...,Medicine_Type_Cardiac,Medicine_Type_Mstablizers,Medicine_Type_Tranquilizers,Medicine_Type_Analgesics,Medicine_Type_Antimalarial,Medicine_Type_Antacids,Medicine_Type_Statins,Medicine_Type_MuscleRelaxants,Medicine_Type_Antifungal,SidEffect_Level_mild
0,1995,0.070422,1775.5026,13.1,RRA15,160.2366,train,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,1983,0.013,3069.152,,YVV26,110.4384,train,1,0,0,...,0,1,0,0,0,0,0,0,0,1
2,1995,0.060783,2603.092,9.025,LJC15,259.4092,train,0,0,1,...,1,0,0,0,0,0,0,0,0,1
3,1995,0.065555,1101.713,11.8,GWC40,99.983,train,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,1983,0.248859,158.9402,,QMN13,56.4402,train,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
k=train.groupby(['Medicine_ID'])['Counterfeit_Weight'].mean()
k=pd.DataFrame(k)
k.reset_index(inplace=True)
k.rename({'Counterfeit_Weight':'cw_temp'},inplace=True,axis=1)
all_data=pd.merge(all_data,k,on='Medicine_ID',how='left')

In [14]:
del all_data['Counterfeit_Weight']

In [15]:
all_data['cw_temp'].isnull().sum()

30

In [16]:
del all_data['Medicine_ID']

In [17]:
all_data.loc[all_data['cw_temp'].isnull(),'cw_temp']=all_data.loc[all_data['data']=='train','cw_temp'].mean()

In [18]:
#splitting into train and test data 
train=all_data[all_data['data']=='train']
del train['data']
test=all_data[all_data['data']=='test']
test.drop(['data'],axis=1,inplace=True)

In [19]:
test.head()

Unnamed: 0,Active_Since,Availability_rating,Counterfeit_Sales,Medicine_MRP,Area_City_Type_Tier 3,Area_City_Type_Tier 2,Area_Type_DownTown,Area_Type_MidTownResidential,Area_Type_CityLimits,Area_dist_level_Medium,...,Medicine_Type_Mstablizers,Medicine_Type_Tranquilizers,Medicine_Type_Analgesics,Medicine_Type_Antimalarial,Medicine_Type_Antacids,Medicine_Type_Statins,Medicine_Type_MuscleRelaxants,Medicine_Type_Antifungal,SidEffect_Level_mild,cw_temp
6818,1983,0.112747,,85.5328,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,1,17.4
6819,2000,0.144446,,257.146,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,13.45
6820,2000,0.144221,,98.1172,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,7.1
6821,1996,0.100388,,135.373,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,18.3
6822,1983,0.022585,,112.8016,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,10.19


In [20]:
train.head()

Unnamed: 0,Active_Since,Availability_rating,Counterfeit_Sales,Medicine_MRP,Area_City_Type_Tier 3,Area_City_Type_Tier 2,Area_Type_DownTown,Area_Type_MidTownResidential,Area_Type_CityLimits,Area_dist_level_Medium,...,Medicine_Type_Mstablizers,Medicine_Type_Tranquilizers,Medicine_Type_Analgesics,Medicine_Type_Antimalarial,Medicine_Type_Antacids,Medicine_Type_Statins,Medicine_Type_MuscleRelaxants,Medicine_Type_Antifungal,SidEffect_Level_mild,cw_temp
0,1995,0.070422,1775.5026,160.2366,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,13.1
1,1983,0.013,3069.152,110.4384,1,0,0,0,1,1,...,1,0,0,0,0,0,0,0,1,7.45
2,1995,0.060783,2603.092,259.4092,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,9.025
3,1995,0.065555,1101.713,99.983,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,11.8
4,1983,0.248859,158.9402,56.4402,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,8.93


In [21]:
del test['Counterfeit_Sales']

In [22]:
train.shape

(6818, 37)

In [24]:
ci_train,ci_test=train_test_split(train,test_size=0.2,random_state=2)

In [25]:
ci_train.reset_index(drop=True,inplace=True)
ci_test.reset_index(drop=True,inplace=True)

In [26]:
x_train=ci_train.drop('Counterfeit_Sales',axis=1)
x_test=ci_test.drop('Counterfeit_Sales',axis=1)

y_train=ci_train['Counterfeit_Sales']
y_test=ci_test['Counterfeit_Sales']

In [27]:
clf = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [28]:
clf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [29]:
from sklearn.metrics import r2_score
y_pred = clf.predict(x_test)
r2_score(y_test, y_pred)

0.5182248515097658

In [30]:
test_pred=clf.predict(test)

In [32]:
pd.DataFrame(test_pred).to_csv("mysubmission.csv",index=True)