In [None]:
!pip install comet-ml

In [None]:
# import comet_ml at the top of your file
from comet_ml import Experiment

from kaggle_secrets import UserSecretsClient
# Fetch the Comet API key through the kaggle_secrets client so the key itself
# is never written into the notebook.
user_secrets = UserSecretsClient()
comet_api_key = user_secrets.get_secret("comet_api_key")

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# EDA

In [None]:
# Load the training CSV into the working DataFrame used by the rest of the notebook.
data = pd.read_csv('/kaggle/input/big-mart-salescsv/Train_UWu5bXk.csv')

In [None]:
# Preview the first rows of the training data.
data.head()

In [None]:
# (rows, columns) of the training data.
data.shape

In [None]:
# Number of missing values in each column.
data.isna().sum()

Two columns (`Item_Weight` and `Outlet_Size`) contain missing values and need cleaning.

In [None]:
# Column dtypes and non-null counts.
data.info()

## Visibility vs Sales

In [None]:
# Scatter plot of Item_Visibility against Item_Outlet_Sales.
data.plot(
    kind='scatter',
    x='Item_Visibility',
    y='Item_Outlet_Sales',
    title='Visibility vs Sales',
)

### Insight:
Items with a visibility below 0.2 seem to have higher sales.

## Outlet vs Sales

In [None]:
# Total sales per outlet, sorted ascending and drawn as horizontal bars.
(
    data.groupby('Outlet_Identifier')['Item_Outlet_Sales']
    .sum()
    .sort_values()
    .plot.barh(title='Outlet vs Sales')
)

### Insight: 
- Outlet 27 has high sales
- Outlet 10 and 19 are the lowest

## Item type vs Sales

In [None]:
# Total sales per item type, sorted ascending and drawn as horizontal bars.
(
    data.groupby('Item_Type')['Item_Outlet_Sales']
    .sum()
    .sort_values()
    .plot.barh(title='Type vs Sales')
)

### Insight:
- Fruits and Vegetables and Snack Foods are among the top-selling item types
- Breakfast and Seafood items do not sell much

## Feature Engineering Ideas:
- Create a levelled Visibility feature - Low, High
- Reduce Item_Type categories to Food, Drink, Non-consumable
- Operational Years = 2013 - Establishment Year

# Data Pre-processing

## Item_Fat_Content

In [None]:
# Frequency of each Item_Fat_Content label, with missing values counted too.
data['Item_Fat_Content'].value_counts(dropna=False)

In [None]:
# Collapse the inconsistent spellings into the two canonical labels.
# The result is assigned back to the column: calling
# `data.Item_Fat_Content.replace(..., inplace=True)` mutates an intermediate
# Series (chained assignment), which leaves `data` untouched when pandas
# Copy-on-Write is active.
fat_content_aliases = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
data['Item_Fat_Content'] = data['Item_Fat_Content'].replace(fat_content_aliases)
data['Item_Fat_Content'].value_counts()

Item_Fat_Content is now consistently labelled (`Low Fat` / `Regular`).

In [None]:
# Re-check the missing-value counts per column.
data.isna().sum()

## Item_Weight

In [None]:
# How many rows exist for each Item_Identifier.
data['Item_Identifier'].value_counts()

In [None]:
# Fill missing weights with the mean weight of rows sharing the same Item_Identifier.
weight_by_item = data.groupby('Item_Identifier')['Item_Weight'].transform('mean')
data['Item_Weight'] = data['Item_Weight'].fillna(weight_by_item)

data.isna().sum()

In [None]:
# Rows whose Item_Weight is still missing after the per-item imputation.
data.loc[data['Item_Weight'].isna()]

In [None]:
# Fall back to the mean weight of the row's Item_Type for the remaining gaps.
weight_by_type = data.groupby('Item_Type')['Item_Weight'].transform('mean')
data['Item_Weight'] = data['Item_Weight'].fillna(weight_by_type)
data.isna().sum()

Item_Weight is imputed with group means, first by Item_Identifier and then by Item_Type for the rows that are still missing.

## Outlet_Size

In [None]:
# Frequency of each Outlet_Size value, with missing values counted too.
data['Outlet_Size'].value_counts(dropna=False)

In [None]:
# Cross-tabulate Outlet_Size (rows) against Outlet_Type (columns).
pd.crosstab(index=data['Outlet_Size'], columns=data['Outlet_Type'])

All grocery stores with a recorded size are small, so missing sizes for grocery stores are filled with `Small`.

In [None]:
# Missing sizes of 'Grocery Store' outlets become 'Small'; every other outlet
# type maps to NaN, so those rows stay missing for now.
size_from_type = data['Outlet_Type'].map({'Grocery Store': 'Small'})
data['Outlet_Size'] = data['Outlet_Size'].fillna(size_from_type)
data['Outlet_Size'].value_counts(dropna=False)

In [None]:
# Cross-tabulate Outlet_Size (rows) against Outlet_Location_Type (columns).
pd.crosstab(index=data['Outlet_Size'], columns=data['Outlet_Location_Type'])

All Tier 2 stores with a recorded size are small, so missing sizes for Tier 2 stores are filled with `Small`.

In [None]:
# Missing sizes of 'Tier 2' outlets become 'Small'; other tiers map to NaN
# and are left as they are.
size_from_location = data['Outlet_Location_Type'].map({'Tier 2': 'Small'})
data['Outlet_Size'] = data['Outlet_Size'].fillna(size_from_location)
data['Outlet_Size'].value_counts(dropna=False)

Outlet_Size is imputed using Outlet_Type and Outlet_Location_Type

In [None]:
# Final check of missing values per column after imputation.
data.isna().sum()

## Convert to numerical

In [None]:
# Names of the columns stored with object dtype (candidates for encoding).
data.select_dtypes(include=[object]).columns.to_list()

In [None]:
# Columns removed from the feature matrix: both identifiers and the target.
drop_cols = [
    'Item_Identifier',
    'Outlet_Identifier',
    'Item_Outlet_Sales',
]

# Categorical columns that get converted to integer codes.
cat_cols = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]

In [None]:
from sklearn.preprocessing import LabelEncoder
# Single encoder instance; it is refitted for every column in the encoding loop.
le = LabelEncoder()


In [None]:
# Convert each categorical column to integer codes.
# One fitted LabelEncoder is kept per column in `encoders`: refitting a single
# shared encoder on every iteration discards all mappings but the last, so the
# same category -> code mapping could not be re-applied to other data later.
encoders = {}
for col in cat_cols:
    encoders[col] = LabelEncoder()
    data[col] = encoders[col].fit_transform(data[col])

data.head()

# Model Building / Training

Training data: `/kaggle/input/big-mart-salescsv/Train_UWu5bXk.csv`

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection  import train_test_split
from sklearn.metrics import  make_scorer, mean_squared_error


In [None]:
## split data
# The target is Item_Outlet_Sales; the features are every column not listed in
# drop_cols. 20% of the rows are held out, with a fixed seed for repeatability.
y = data['Item_Outlet_Sales']
X = data.drop(columns=drop_cols)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Baseline: plain linear regression on the unscaled features, then predictions
# for the held-out rows.
linear_model1 = LinearRegression()
linear_model1 = linear_model1.fit(X_train, y_train)  # fit() returns the estimator itself
y_pred = linear_model1.predict(X_test)

In [None]:
# NOTE: for a regressor, score() returns the coefficient of determination (R^2),
# not a classification accuracy. The name `accuracy` is kept because the Comet
# logging cell reads it.
accuracy = linear_model1.score(X_test, y_test)
accuracy

In [None]:
# Root mean squared error of the baseline predictions on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmse

In [None]:
# Settings and results of the baseline run; they are logged to the
# "codealong-big-mart-sales" project on Comet.ml.
params = {
    "random_state": 0,
    "model_type": "linear_reg",
    "scaler": "N",
    "param_grid": "N",
}

metrics = {
    "accuracy": accuracy,  # value of linear_model1.score(X_test, y_test)
    "rmse": rmse,
}

# Create an experiment with your api key:
exp = Experiment(
    api_key=comet_api_key,
    project_name="codealong-big-mart-sales",
    workspace="maksteel",
)
try:
    exp.log_dataset_hash(X_train)
    exp.log_parameters(params)
    exp.log_metrics(metrics)
finally:
    # Close the experiment even when one of the logging calls raises, so a
    # failed cell does not leave an open experiment behind.
    exp.end()

## Grid Search, model selection

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    NOTE(review): this definition rebinds the name `rmse`, which an earlier
    cell uses for the baseline's RMSE value; re-running that logging cell
    after this one would pick up the function instead of the number.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

# greater_is_better=False makes the scorer return the negated RMSE, so the
# search (which maximises the score) prefers the lowest error.
rmse_score = make_scorer(rmse, greater_is_better=False)

# Both steps start as 'passthrough' placeholders; the grid swaps in the
# actual scaler and model for each candidate.
pipeline = Pipeline(steps=[('scaling', 'passthrough'), ('model', 'passthrough')])

param_grid = dict(
    scaling=[StandardScaler(), MinMaxScaler()],
    model=[LinearRegression(), Ridge(), Lasso(), ElasticNet()],
)
grid = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    scoring=rmse_score,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the 5-fold search over every scaler/model combination on the training split.
grid.fit(X_train, y_train)

In [None]:
# Pipeline (scaler + model) selected by the search.
grid.best_estimator_

In [None]:
# Scored with rmse_score, which was built with greater_is_better=False, so the
# value shown is the negated RMSE on the held-out rows.
grid.score(X_test, y_test)

In [None]:
# track experiments on comet_ml
# One Comet experiment is created per grid-search candidate.
cv_results = grid.cv_results_

for i, candidate_params in enumerate(cv_results['params']):
    exp = Experiment(workspace="maksteel",
        project_name="codealong-big-mart-sales",
        api_key=comet_api_key)
    try:
        exp.log_parameters(candidate_params)
        for k, v in cv_results.items():
            # 'params' is logged above; the 'param_*' columns repeat the same
            # settings as estimator objects and are not numeric metrics.
            if k == "params" or k.startswith("param_"):
                continue
            exp.log_metric(k, v[i])
    finally:
        # Close the experiment even if logging fails part-way through.
        exp.end()



# Prediction on Test set

Test data: `/kaggle/input/big-mart-salescsv/Test_u94Q5KV.csv`