In [1]:
import os
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [2]:
train = pd.read_csv("C:/Users/RAKESH/Documents/Python/counterfeit_train.csv")
test = pd.read_csv("C:/Users/RAKESH/Documents/Python/counterfeit_test.csv")

In [3]:
train.shape, test.shape

((6818, 12), (1705, 11))

In [4]:
train.dtypes

Medicine_ID             object
Counterfeit_Weight     float64
DistArea_ID             object
Active_Since             int64
Medicine_MRP           float64
Medicine_Type           object
SidEffect_Level         object
Availability_rating    float64
Area_Type               object
Area_City_Type          object
Area_dist_level         object
Counterfeit_Sales      float64
dtype: object

In [5]:
train.head()

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales
0,RRA15,13.1,Area046,1995,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026
1,YVV26,,Area027,1983,110.4384,Mstablizers,mild,0.013,CityLimits,Tier 3,Medium,3069.152
2,LJC15,9.025,Area046,1995,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.092
3,GWC40,11.8,Area046,1995,99.983,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.713
4,QMN13,,Area019,1983,56.4402,Hreplacements,critical,0.248859,MidTownResidential,Tier 1,Small,158.9402


In [6]:
train['SidEffect_Level'].value_counts()

mild        4434
critical    2384
Name: SidEffect_Level, dtype: int64

In [7]:
train['sideffect']=np.where(train['SidEffect_Level']=='mild',1,0)
train=train.drop('SidEffect_Level',axis=1)
test['sideffect']=np.where(test['SidEffect_Level']=='mild',1,0)
test=test.drop('SidEffect_Level',axis=1)

In [8]:
train["Area_Type"].value_counts()

DownTown              4481
MidTownResidential     873
CityLimits             735
Industrial             729
Name: Area_Type, dtype: int64

In [9]:
train['DT']=np.where(train['Area_Type']=='DownTown',1,0)
train['MTR']=np.where(train['Area_Type']=='MidTownResidential',1,0)
train['AreaName']=np.where(train['Area_Type']=='CityLimits',1,0)
train=train.drop('Area_Type',axis=1)

test['DT']=np.where(test['Area_Type']=='DownTown',1,0)
test['MTR']=np.where(test['Area_Type']=='MidTownResidential',1,0)
test['AreaName']=np.where(test['Area_Type']=='CityLimits',1,0)
test=test.drop('Area_Type',axis=1)

In [10]:
train.shape, test.shape

((6818, 14), (1705, 13))

In [11]:
train.dtypes

Medicine_ID             object
Counterfeit_Weight     float64
DistArea_ID             object
Active_Since             int64
Medicine_MRP           float64
Medicine_Type           object
Availability_rating    float64
Area_City_Type          object
Area_dist_level         object
Counterfeit_Sales      float64
sideffect                int32
DT                       int32
MTR                      int32
AreaName                 int32
dtype: object

In [12]:
train['Area_City_Type'].value_counts()

Tier 3    2655
Tier 2    2244
Tier 1    1919
Name: Area_City_Type, dtype: int64

In [13]:
train['T3']=np.where(train['Area_City_Type']=='Tier 3',1,0)
train['T2']=np.where(train['Area_City_Type']=='Tier 2',1,0)
train=train.drop(['Area_City_Type'],axis=1)

test['T3']=np.where(test['Area_City_Type']=='Tier 3',1,0)
test['T2']=np.where(test['Area_City_Type']=='Tier 2',1,0)
test=test.drop(['Area_City_Type'],axis=1)

In [14]:
train['medium']=np.where(train['Area_dist_level']=='Medium',1,0)
train['unknown']=np.where(train['Area_dist_level']=='Unknown',1,0)
train['small']=np.where(train['Area_dist_level']=='Small',1,0)
train=train.drop('Area_dist_level',axis=1)

test['medium']=np.where(test['Area_dist_level']=='Medium',1,0)
test['unknown']=np.where(test['Area_dist_level']=='Unknown',1,0)
test['small']=np.where(test['Area_dist_level']=='Small',1,0)
test=test.drop('Area_dist_level',axis=1)

In [15]:
cat_col=train.select_dtypes(['object']).columns
cat_col

Index(['Medicine_ID', 'DistArea_ID', 'Medicine_Type'], dtype='object')

In [16]:
train['Medicine_Type'].value_counts()

Hreplacements         976
Antibiotics           970
Antiseptics           724
OralContraceptives    694
Antipyretics          536
Cardiac               522
Mstablizers           514
Tranquilizers         420
Analgesics            356
Antimalarial          339
Antacids              206
Statins               167
MuscleRelaxants       139
Antifungal            111
Stimulants             95
Antiviral              49
Name: Medicine_Type, dtype: int64

In [17]:
Medicine_dummies=pd.get_dummies(train['Medicine_Type'],prefix='Medicine')
Medicine_dummies1=pd.get_dummies(test['Medicine_Type'],prefix='Medicine')

In [18]:
Medicine_dummies.shape,Medicine_dummies1.shape

((6818, 16), (1705, 16))

In [19]:
train=pd.concat([train,Medicine_dummies],axis=1)
train=train.drop(['Medicine_Type','Medicine_Antiviral'],axis=1)

test=pd.concat([test,Medicine_dummies1],axis=1)
test=test.drop(['Medicine_Type','Medicine_Antiviral'],axis=1)

In [20]:
train.shape,test.shape

((6818, 31), (1705, 30))

In [21]:
train=train.drop(['Medicine_ID','DistArea_ID'],axis=1)
test=test.drop(['Medicine_ID','DistArea_ID'],axis=1)

In [22]:
train.isnull().sum()

Counterfeit_Weight             1166
Active_Since                      0
Medicine_MRP                      0
Availability_rating               0
Counterfeit_Sales                 0
sideffect                         0
DT                                0
MTR                               0
AreaName                          0
T3                                0
T2                                0
medium                            0
unknown                           0
small                             0
Medicine_Analgesics               0
Medicine_Antacids                 0
Medicine_Antibiotics              0
Medicine_Antifungal               0
Medicine_Antimalarial             0
Medicine_Antipyretics             0
Medicine_Antiseptics              0
Medicine_Cardiac                  0
Medicine_Hreplacements            0
Medicine_Mstablizers              0
Medicine_MuscleRelaxants          0
Medicine_OralContraceptives       0
Medicine_Statins                  0
Medicine_Stimulants         

In [23]:
train.loc[train["Counterfeit_Weight"].isnull(),"Counterfeit_Weight"]=train.loc[train["Counterfeit_Weight"].notnull(),
                                                                               "Counterfeit_Weight"].mean()


test.loc[test["Counterfeit_Weight"].isnull(),"Counterfeit_Weight"]=train.loc[train["Counterfeit_Weight"].notnull(),
                                                                             "Counterfeit_Weight"].mean()

In [24]:
from sklearn.decomposition import PCA,FactorAnalysis
from sklearn.preprocessing import scale

In [25]:
pca = PCA(n_components=28)

In [26]:
pca.fit(train)

PCA(copy=True, iterated_power='auto', n_components=28, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [27]:
train.head()

Unnamed: 0,Counterfeit_Weight,Active_Since,Medicine_MRP,Availability_rating,Counterfeit_Sales,sideffect,DT,MTR,AreaName,T3,...,Medicine_Antipyretics,Medicine_Antiseptics,Medicine_Cardiac,Medicine_Hreplacements,Medicine_Mstablizers,Medicine_MuscleRelaxants,Medicine_OralContraceptives,Medicine_Statins,Medicine_Stimulants,Medicine_Tranquilizers
0,13.1,1995,160.2366,0.070422,1775.5026,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,14.115057,1983,110.4384,0.013,3069.152,1,0,0,1,1,...,0,0,0,0,1,0,0,0,0,0
2,9.025,1995,259.4092,0.060783,2603.092,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,11.8,1995,99.983,0.065555,1101.713,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,14.115057,1983,56.4402,0.248859,158.9402,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [28]:
y=train["Counterfeit_Sales"]

In [29]:
x=train.drop("Counterfeit_Sales",1)

In [30]:
from sklearn.cross_validation import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.2,random_state=2)



In [31]:
pca = PCA(n_components=6)


In [32]:
train.shape

(6818, 29)

In [33]:
pca.fit(x_train)




PCA(copy=True, iterated_power='auto', n_components=6, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [34]:
x_train=pca.transform(x_train)
#x_test=pca.transform(x_test)
x_test=pca.transform(x_test)

In [35]:
x_test.shape,x_train.shape,y_train.shape

((1364, 6), (5454, 6), (5454,))

In [36]:
from sklearn.ensemble import RandomForestRegressor
rf1=RandomForestRegressor(n_estimators=100,criterion='mse',max_depth=6,bootstrap=True,min_samples_split=2)

  from numpy.core.umath_tests import inner1d


In [37]:
rf1.fit(x_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [38]:
pred=rf1.predict(x_test)

In [39]:
a=mean_absolute_error(y_test,pred)

1-(a/1660)

0.5375813911437665

In [41]:
test=pca.transform(test)

In [42]:
prediction=rf1.predict(test)

In [43]:
pred=pd.DataFrame(prediction,columns=["Sales"])

In [44]:
pred.head()

Unnamed: 0,Sales
0,2451.678535
1,4006.152319
2,1512.269125
3,398.285317
4,691.368203


In [46]:
pred.to_csv("shikha_proj3.csv")