### Importing Modules

In [None]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import datasets, linear_model, metrics
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


### Loading the dataset

In [None]:
# Read the competition's train and test CSV files from the Kaggle input directory.
train, test = (
    pd.read_csv('../input/hackerearth-carnival-wars-challenge-problem-1/train.csv'),
    pd.read_csv('../input/hackerearth-carnival-wars-challenge-problem-1/test.csv'),
)

In [None]:
# Preview the first rows of the freshly loaded training frame.
train.head()

In [None]:
# Show the dtype pandas inferred for each training column.
train.dtypes

#### New Features using datetime from 'instock_date'

In [None]:
# For train data: parse 'instock_date' and expose its calendar parts as columns.
train['instock_date'] = pd.to_datetime(train['instock_date'])
train_stamp = train['instock_date'].dt
train_date_parts = [
    ('year', 'year'),
    ('month', 'month'),
    ('quarter', 'quarter'),
    ('day of the week', 'dayofweek'),  # Monday = 0 & Sunday = 6
    ('Dayofyear', 'dayofyear'),
    ('hour', 'hour'),
]
# Columns are added in list order, so the resulting column layout is unchanged.
for feature_name, accessor_attr in train_date_parts:
    train[feature_name] = getattr(train_stamp, accessor_attr)


In [None]:
# For test data: parse 'instock_date' and expose its calendar parts as columns.
test['instock_date'] = pd.to_datetime(test['instock_date'])
test_stamp = test['instock_date'].dt
test_date_parts = [
    ('year', 'year'),
    ('month', 'month'),
    ('quarter', 'quarter'),
    ('day of the week', 'dayofweek'),  # Monday = 0 & Sunday = 6
    ('Dayofyear', 'dayofyear'),
    ('hour', 'hour'),
]
# Columns are added in list order, so the resulting column layout is unchanged.
for feature_name, accessor_attr in test_date_parts:
    test[feature_name] = getattr(test_stamp, accessor_attr)

In [None]:
# Preview the training frame again, now including the date-derived columns.
train.head()

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Per-column dtype and non-null count summary.
train.info()

In [None]:
# Cardinality check: print every column name next to its number of distinct values.
for column_name, distinct_count in train.nunique().items():
    print(column_name, distinct_count)

In [None]:
# Count the missing (null) values in each training column.
train.isnull().sum()


In [None]:
# Keep only the rows whose target 'Selling_Price' is present.
# The explicit .copy() makes the filtered result an independent frame, so the
# column assignments in the following cells do not operate on a slice of the
# original frame (which triggers pandas' SettingWithCopyWarning).
train = train[train['Selling_Price'].notna()].copy()


In [None]:
# Missing customer names are replaced by the most frequent name (the mode).
most_common_customer = train['Customer_name'].mode()[0]
train['Customer_name'] = train['Customer_name'].fillna(most_common_customer)



In [None]:
# Label-encode the categorical training columns into integer codes.
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

# The single encoder is re-fit for each column in turn, so after the loop it
# only remembers the mapping of the last column ('Product_Category').
for categorical_column in ('Customer_name', 'Loyalty_customer', 'Product_Category'):
    train[categorical_column] = label_encoder.fit_transform(train[categorical_column])





In [None]:
# Preview the training frame after label encoding.
train.head()

In [None]:
# Summary statistics of Stall_no.
train.Stall_no.describe()

In [None]:
# Negative selling prices are replaced by their magnitude (absolute value).
train['Selling_Price'] = abs(train['Selling_Price'])


In [None]:
# Box plot of the target to eyeball its spread and outliers.
sns.boxplot( y="Selling_Price",color='m', data=train)

In [None]:
# Summary statistics of the target after taking absolute values.
train.Selling_Price.describe()

In [None]:
# Distinct integer codes present in the encoded Loyalty_customer column.
train['Loyalty_customer'].unique()

In [None]:
# Distinct integer codes present in the encoded Product_Category column.
train['Product_Category'].unique()

In [None]:
# Distinct values of Discount_avail (checked before imputing it below).
train.Discount_avail.unique()

In [None]:
# Impute the remaining missing numeric values:
# column mean for the columns listed below, column median for Stall_no.
mean_imputed_columns = [
    'Discount_avail',
    'charges_1',
    'charges_2 (%)',
    'Minimum_price',
    'Maximum_price',
]
for numeric_column in mean_imputed_columns:
    column_mean = train[numeric_column].mean()
    train[numeric_column] = train[numeric_column].fillna(column_mean)

stall_median = train['Stall_no'].median()
train['Stall_no'] = train['Stall_no'].fillna(stall_median)


In [None]:
# Re-count the nulls per column to confirm the imputation above left none behind.
train.isnull().sum()


In [None]:
# Distribution plot of charges_1 with its quartiles marked as vertical lines.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; it is kept
# here because the seaborn version this notebook targets is not known.
sns.distplot(train.charges_1, color='g')
# Pass a real boolean: the string 'True' only enabled the grid because any
# non-empty string is truthy (the string 'False' would have enabled it too).
plt.grid(True)
for quantile_level in [0.25, 0.5, 0.75]:
    plt.axvline(train.charges_1.quantile(quantile_level), c='r', lw=1.5)
plt.show()

In [None]:
# Distribution plot of charges_2 (%) with its quartiles marked as vertical lines.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; it is kept
# here because the seaborn version this notebook targets is not known.
sns.distplot(train['charges_2 (%)'], color='g')
# Pass a real boolean: the string 'True' only enabled the grid because any
# non-empty string is truthy (the string 'False' would have enabled it too).
plt.grid(True)
for quantile_level in [0.25, 0.50, 0.75]:
    plt.axvline(train['charges_2 (%)'].quantile(quantile_level), color='m', lw=1.5)
plt.show()

In [None]:
# Summary statistics of Minimum_price.
train.Minimum_price.describe()


In [None]:
# Distribution plot of Minimum_price with its quartiles marked as vertical lines.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; it is kept
# here because the seaborn version this notebook targets is not known.
sns.distplot(train.Minimum_price, color='r')
# Pass a real boolean: the string 'True' only enabled the grid because any
# non-empty string is truthy (the string 'False' would have enabled it too).
plt.grid(True)
for quantile_level in [0.25, 0.50, 0.75]:
    plt.axvline(train.Minimum_price.quantile(quantile_level), c='g', lw=1.5)
plt.show()

### Feature Engineering

In [None]:
# charges_1 (%): charges_1 scaled down by a factor of 100.
train['charges_1 (%)'] = train['charges_1'].div(100)
# charges_2: charges_2 (%) scaled up by a factor of 100.
train['charges_2'] = train['charges_2 (%)'].mul(100)
# total_charges: element-wise sum of charges_1 and the rescaled charges_2.
train['total_charges'] = train['charges_1'].add(train['charges_2'])
# range: width of the price band (maximum price minus minimum price).
train['range'] = train['Maximum_price'].sub(train['Minimum_price'])

In [None]:
# Preview the training frame with the engineered columns appended.
train.head()

###  Split the dataset into training and testing

In [None]:
# Drop the raw timestamp and the product identifier before modelling.
train = train.drop(columns=["instock_date", "Product_id"])


In [None]:
# Separate the target column (y) from the feature matrix (x).
y = train['Selling_Price']
x = train.drop(columns=["Selling_Price"])

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 40)

In [None]:
# Sanity-check the sizes of the four split parts, in the order
# x_train, y_train, x_test, y_test.
for split_part in (x_train, y_train, x_test, y_test):
    print(split_part.shape)

### Model Building

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error

In [None]:
# Tree-based regressors bound to short names.
# n_jobs=-1 asks the estimator to use all available CPU cores.
# rf and et are re-created in later cells right before they are fitted.
gbr = GradientBoostingRegressor(random_state=0)
xgb = xgboost.XGBRegressor(n_jobs=-1)
et = ExtraTreesRegressor(n_jobs=-1)
rf = RandomForestRegressor(n_jobs=-1)
ds = DecisionTreeRegressor()

In [None]:
# Candidate regressors keyed by display name; the comparison loop in the next
# cell fits and scores each one in this insertion order.
reg = {
"LinearRegression": LinearRegression(),
"KNeighborsRegressor":KNeighborsRegressor(n_neighbors=2),
"AdaBoostRegressor":AdaBoostRegressor(random_state=0, n_estimators=100),
"LGBMRegressor":LGBMRegressor(),
"Ridge": Ridge(alpha=1.0),
"ElasticNet":ElasticNet(random_state=0),
"GradientBoostingRegressor":GradientBoostingRegressor(random_state=0),
"DecisionTreeRegressor": DecisionTreeRegressor(),
"ExtraTreesRegressor": ExtraTreesRegressor(n_jobs=-1),
"RandomForestRegressor": RandomForestRegressor(n_jobs=-1),
"XGBRegressor":xgboost.XGBRegressor(n_jobs=-1)
}

In [None]:
%%time
# Fit every candidate regressor on the training split and score it on the
# held-out split (x_test / y_test).
# NOTE: the "R2_Train" / "RMSE_Train" keys are kept unchanged because a later
# cell sorts the summary frame by "RMSE_Train"; the values stored under them
# are hold-out scores, not training scores.
dic = {"Model": [], "R2_Train": [], "RMSE_Train": []}
for name, model in reg.items():
    model.fit(x_train, y_train)
    y_test_pred = model.predict(x_test)
    # Compute each metric once and reuse it for both printing and the summary.
    r2 = r2_score(y_test, y_test_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    print("--------------------------------------------------------------")
    print("Model:", name)
    # The heading now states what is actually measured: the hold-out split.
    print("-----Test Data Evaluation-----")
    print("R2 Value: ", r2)
    print("RMSE: ", rmse)
    dic["Model"].append(name)
    dic["R2_Train"].append(r2)
    dic["RMSE_Train"].append(rmse)






In [None]:
# Turn the collected per-model scores into a frame (one row per model) and display it.
final_data = pd.DataFrame(dic)
final_data

In [None]:
# Rank the models by RMSE, lowest first. The sorted frame is only displayed, not stored.
final_data.sort_values("RMSE_Train", axis = 0, ascending = True)

From the above we find that the best predictions are given by ExtraTreesRegressor and RandomForestRegressor.

#### Random forest regression

In [None]:
from sklearn.ensemble import RandomForestRegressor 
# Random forest with default hyperparameters; n_jobs=-1 uses all CPU cores
rf = RandomForestRegressor(n_jobs=-1)
# Fit on the training split only; x_test / y_test are used for scoring in the next cells
rf.fit(x_train, y_train)

In [None]:
# Hold-out predictions of the random forest, wrapped in a single-column DataFrame.
pred_rf = pd.DataFrame(rf.predict(x_test))

In [None]:
from sklearn.metrics import mean_squared_error 
# RMSE on the hold-out split: square root of the mean squared error
print(mean_squared_error(y_test, pred_rf)**0.5)

In [None]:
# Impurity-based feature importances of the fitted random forest.
importance = rf.feature_importances_
# Print one line per feature: positional index and importance score.
for feature_idx, score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (feature_idx, score))
# Bar chart of the scores, one bar per feature position.
bar_positions = list(range(len(importance)))
plt.bar(bar_positions, importance)
plt.show()

###### Feature selection

In [None]:
# Recursive Feature Elimination driven by a random forest: keep the 5 features
# the selector ranks highest. Fitted on the training split only.
from sklearn.feature_selection import RFE
sel2 = RFE(RandomForestRegressor(), n_features_to_select =5)
sel2.fit(x_train,y_train)

In [None]:
# Boolean mask over the input columns: True where RFE kept the feature.
sel2.get_support()

In [None]:
# Names of the columns RFE kept (mask applied to the training columns).
features2 = x_train.columns[sel2.get_support()]
features2

In [None]:
from sklearn.metrics import mean_squared_error

# Reduce both splits to the RFE-selected columns, re-train a random forest on
# the reduced training split and report the hold-out RMSE.
x_train_rfe, x_test_rfe = sel2.transform(x_train), sel2.transform(x_test)
clf = RandomForestRegressor()
clf.fit(x_train_rfe, y_train)
y_pred_rfe = clf.predict(x_test_rfe)
rfe_mse = mean_squared_error(y_test, y_pred_rfe)
print(rfe_mse**0.5)

When I used RandomForestRegressor along with recursive feature elimination to predict the given test data, my public score on the leaderboard was 89.83489.   

#### ExtraTreesRegressor


In [None]:
from sklearn.metrics import mean_squared_error

# Extra-trees regressor with default hyperparameters on all CPU cores:
# fit on the training split, then report the hold-out RMSE.
et = ExtraTreesRegressor(n_jobs=-1)
et.fit(x_train, y_train)
y_pred_et = et.predict(x_test)
et_mse = mean_squared_error(y_test, y_pred_et)
print(et_mse**0.5)

In [None]:
# Impurity-based feature importances of the fitted extra-trees model.
importance = et.feature_importances_
# Print one line per feature: positional index and importance score.
for feature_idx, score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (feature_idx, score))
# Bar chart of the scores, one bar per feature position.
bar_positions = list(range(len(importance)))
plt.bar(bar_positions, importance)
plt.show()

In [None]:
# Recursive Feature Elimination driven by an extra-trees regressor: keep 5 features.
sel3 = RFE(ExtraTreesRegressor(n_jobs=-1), n_features_to_select=5)
sel3.fit(x_train, y_train)
# Boolean mask of the kept columns, used to look up their names.
support_mask3 = sel3.get_support()
features3 = x.columns[support_mask3]
features3


When I used ExtraTreesRegressor along with recursive feature elimination to predict the given test data, my public score on the leaderboard was 89.82776.

#### VotingRegressor

In [None]:
# VotingRegressor over the two strongest single models from the earlier cells
# (ExtraTreesRegressor and RandomForestRegressor), scored on the hold-out split.
from sklearn.ensemble import VotingRegressor

base_estimators = [('et', et), ('rf', rf)]
model = VotingRegressor(base_estimators, n_jobs=-1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
print("R2 Value: ", r2_test)
print("RMSE: ", rmse_test)

Finally, I used the VotingRegressor as my final model and trained it on the entire given training data. 

In [None]:


# Final model: the same two-member voting ensemble, now fitted on ALL labelled rows (x, y).
model = VotingRegressor([('et', et),('rf',rf)],n_jobs=-1)

model.fit(x,y)

# NOTE: the predictions below are made on the very rows the model was fitted on,
# so despite the *_test names these are in-sample scores, not hold-out scores.
y_pred = model.predict(x)
r2_test = r2_score(y, y_pred)
rmse_test  = np.sqrt(mean_squared_error(y, y_pred))
print("R2 Value: ", r2_test)
print("RMSE: ",rmse_test)

### Test data

In [None]:
# Preview the first rows of the test frame (date-derived columns were already added earlier).
test.head()

In [None]:
# Count the missing (null) values in each test column.
test.isnull().sum()


In [None]:
# Missing customer names in the test frame are replaced by the test frame's most frequent name.
most_common_test_customer = test['Customer_name'].mode()[0]
test['Customer_name'] = test['Customer_name'].fillna(most_common_test_customer)


In [None]:
# Preview the test frame after filling the missing customer names.
test.head()

In [None]:
# Label-encode the categorical test columns into integer codes.
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

# NOTE(review): a fresh encoder is fitted on the TEST values here, so a given
# integer code is not guaranteed to mean the same category as in the training
# frame — confirm whether the train-time mapping should be reused instead.
for categorical_column in ('Customer_name', 'Loyalty_customer', 'Product_Category'):
    test[categorical_column] = label_encoder.fit_transform(test[categorical_column])


In [None]:
# Impute missing numeric values in the test frame.
# The column list mirrors the training-side imputation, which also covers
# 'Discount_avail' and 'Maximum_price'; fillna is a no-op for a column that has
# no missing values, so adding them is harmless when the test data is complete.
for numeric_column in [
    'Discount_avail',
    'charges_1',
    'charges_2 (%)',
    'Minimum_price',
    'Maximum_price',
]:
    test[numeric_column] = test[numeric_column].fillna(test[numeric_column].mean())
# Stall_no uses the median, as on the training side.
test['Stall_no'] = test['Stall_no'].fillna(test['Stall_no'].median())


In [None]:
# Same engineered charge columns as on the training side.
# charges_1 (%): charges_1 scaled down by a factor of 100.
test['charges_1 (%)'] = test['charges_1'] / 100
# charges_2: charges_2 (%) scaled up by a factor of 100.
test['charges_2'] = test['charges_2 (%)'] * 100
# total_charges must be built from the TEST frame's own charges_2. Adding
# train['charges_2'] instead aligns the two Series on their row index, which
# pairs each test row with an unrelated training row and yields NaN wherever
# that index label does not exist in the training frame.
test["total_charges"] = test['charges_1'] + test['charges_2']

In [None]:
# range: width of the price band (maximum price minus minimum price).
test["range"] = test['Maximum_price']-test['Minimum_price']

In [None]:
# Replace any NaN in total_charges with the column mean.
test["total_charges"] = test["total_charges"].fillna(test["total_charges"].mean())


In [None]:
# Keep the product ids aside for the final output, then drop the identifier
# and the raw timestamp from the test frame.
p_id = test["Product_id"]
test = test.drop(columns=["Product_id", "instock_date"])

In [None]:
# Predict with the final voting ensemble (the one fitted on all labelled rows).
prediction = model.predict(test)

In [None]:
# Wrap the predictions in a single-column DataFrame and display it.
prediction = pd.DataFrame(prediction)
prediction

In [None]:
# Place the product ids and the predictions side by side (concat with axis=1 aligns on the row index).
Prediction = pd.concat([p_id,prediction],axis =1)

In [None]:
# Display the combined id / prediction table.
Prediction

The final model led to a public score of 90.03197 and leaderboard rank 75.