In [1]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

In [2]:
# Read the training and test tables from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('X_train.csv', "X_test.csv"))
# Show the first rows of the training table as the cell output.
train.head()

Unnamed: 0,Item_Outlet_Sales,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Years_operation,Item_Fat_Content_1,Item_Fat_Content_2,Outlet_Location_Type_1,Outlet_Location_Type_2,...,Item_Type_Combined_2,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,3735.138,0.282525,0.048866,0.927507,0.583333,0.416667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,443.4228,0.081274,0.058705,0.072068,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2097.27,0.770765,0.051037,0.468288,0.583333,0.416667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,732.38,0.871986,0.0,0.640093,0.541667,0.458333,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,994.7052,0.260494,0.0,0.095805,0.083333,0.916667,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Separate the target column from the remaining feature columns.
Y = train['Item_Outlet_Sales']
X = train.drop(columns=[Y.name])

In [4]:
# Hold out 20% of the rows as a validation set; the fixed seed makes the
# split repeatable between runs.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=22,
)

#### Model Building

In [18]:
# Candidate regressors compared on the validation split. The tree-based
# estimators get a fixed seed so the RMSE table is reproducible across runs
# (an unseeded DecisionTreeRegressor can give different results each time).
algos = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=0),
    RandomForestRegressor(random_state=0, n_estimators=150),
]
# Display names, listed in the same order as `algos`.
names = [
    'Linear Regression',
    'Ridge Regression',
    'Lasso Regression',
    'K Neighbors Regressor',
    'Decision Tree Regressor',
    'Random Forest Regressor',
]
# The two lists are matched by position, so they must stay the same length.
assert len(algos) == len(names), "algos and names must line up one-to-one"
# Receives one validation RMSE per entry of `algos`.
rmse_list = []

In [19]:
# Fit every candidate on the training split and record its validation RMSE.
# The list is rebuilt from scratch here so that re-running this cell does not
# append duplicate scores and leave `rmse_list` longer than `names`.
rmse_list = []
for model in algos:
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_valid)
    # Take the square root so the error is in the same units as the target.
    rmse = np.sqrt(metrics.mean_squared_error(Y_valid, y_pred))
    rmse_list.append(rmse)

In [20]:
# Tabulate each model's display name next to its validation RMSE.
evaluation = pd.DataFrame(data=dict(Model=names, RMSE=rmse_list))
# Show the comparison table as the cell output.
evaluation

Unnamed: 0,Model,RMSE
0,Linear Regression,1187.393939
1,Ridge Regression,1187.392639
2,Lasso Regression,1187.259835
3,K Neighbors Regressor,1267.028945
4,Decision Tree Regressor,1577.710651
5,Random Forest Regressor,1186.212469


#### As we can see, Random Forest performs slightly better than Ridge, Lasso and Linear Regression (RMSE ≈ 1186.2 vs. ≈ 1187.3), while the KNN Regressor and Decision Tree Regressor score noticeably worse, so we select Random Forest for making our final predictions.

In [21]:
# Load the sample submission file; its 'Item_Outlet_Sales' column is
# overwritten with the model's predictions below.
submission = pd.read_csv('sample_submission_8RXa3c6.csv')

# Refit the selected model on all labelled rows (X, Y are the full training
# table, not just the 80% training split).
model = RandomForestRegressor(random_state=0, n_estimators=150)
model.fit(X, Y)

# Pass the test columns in exactly the order the model was fitted on; this
# raises a KeyError if a fitted feature is missing from the test table.
final_predictions = model.predict(test[X.columns])
submission['Item_Outlet_Sales'] = final_predictions

# Only non-negative predictions for the target variable: floor at zero with
# a vectorised clip instead of a per-row Python lambda.
submission['Item_Outlet_Sales'] = submission['Item_Outlet_Sales'].clip(lower=0)
submission.to_csv('rohan_submission.csv', index=False)