In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

In [2]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('X_train.csv', 'X_test.csv'))

# Preview the first rows of the training table.
train.head()

Unnamed: 0,Item_Outlet_Sales,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,3735.138,0.100128,0.282525,0.0,0.048866,0.266667,0.927507,1.0,0.583333,0.5,0.0,0.333333
1,443.4228,0.005135,0.081274,1.0,0.058705,0.933333,0.072068,0.333333,1.0,0.5,1.0,0.666667
2,2097.27,0.424904,0.770765,0.0,0.051037,0.666667,0.468288,1.0,0.583333,0.5,0.0,0.333333
3,732.38,0.719512,0.871986,1.0,0.0,0.4,0.640093,0.0,0.541667,0.5,1.0,0.0
4,994.7052,0.832478,0.260494,0.0,0.0,0.6,0.095805,0.111111,0.083333,0.0,1.0,0.333333


In [3]:
# Separate the target column (Y) from the remaining feature columns (X).
target_col = 'Item_Outlet_Sales'
Y = train[target_col]
X = train.drop(columns=[target_col])

In [4]:
# Hold out 20% of the rows as a validation set; the fixed seed keeps the split repeatable.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=22,
)

#### Model Building

In [9]:
# Candidate regressors, each paired with its display label so the two lists
# derived below can never get out of order.
candidates = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('K Neighbors Regressor', KNeighborsRegressor()),
    # scikit-learn permutes the features at every split, so the fitted tree can
    # differ between runs unless random_state is fixed.
    ('Decision Tree Regressor', DecisionTreeRegressor(random_state=22)),
]
algos = [estimator for _, estimator in candidates]
names = [label for label, _ in candidates]

# Validation RMSE per model, filled in the same order as `algos` / `names`.
rmse_list = []

In [10]:
# Start from an empty list so that re-running this cell cannot append a second
# set of scores and leave rmse_list longer than names.
rmse_list = []

# Fit each candidate on the training split and score it on the validation split.
for model in algos:
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_valid)
    MSE = metrics.mean_squared_error(Y_valid, y_pred)
    rmse = np.sqrt(MSE)  # square root of the MSE, i.e. error in the target's own units
    rmse_list.append(rmse)

In [12]:
# Tabulate each model's validation RMSE next to its display name.
evaluation = pd.DataFrame(dict(Model=names, RMSE=rmse_list))
evaluation

Unnamed: 0,Model,RMSE
0,Linear Regression,1246.20791
1,Ridge Regression,1246.109964
2,Lasso Regression,1245.629281
3,K Neighbors Regressor,1271.340568
4,Decision Tree Regressor,1570.928075


#### As the table shows, Lasso performs slightly better than Ridge and Linear Regression on the validation set, while the KNN Regressor and Decision Tree Regressor score worse (higher RMSE). We therefore select Lasso for making our final predictions.

In [14]:
# Template file whose 'Item_Outlet_Sales' column is overwritten with our predictions.
submission = pd.read_csv('sample_submission_8RXa3c6.csv')

# Refit the chosen model on ALL labelled rows (training + validation splits)
# before predicting on the test table.
model = Lasso()
model.fit(X, Y)
final_predictions = model.predict(test)

# Keep only non-negative predictions for the target variable: negative values
# are floored at 0 with a vectorised clip instead of a per-row apply.
submission['Item_Outlet_Sales'] = final_predictions
submission['Item_Outlet_Sales'] = submission['Item_Outlet_Sales'].clip(lower=0)

submission.to_csv('rohan_submission.csv', index=False)