In [1]:
pip install gradio




In [2]:
import gradio as gr
import numpy  as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.linear_model import LinearRegression
from joblib import dump

In [3]:
train=pd.read_csv("train_data.csv")
test=pd.read_csv("test.csv")
final=pd.read_csv("test.csv")

In [4]:
df=pd.concat([train,test])

In [5]:
df['date']=pd.to_datetime(df['date'],format = "%d-%m-%Y")
df['date']

0       2013-01-01
1       2013-01-02
2       2013-01-03
3       2013-01-04
4       2013-01-05
           ...    
44995   2018-03-27
44996   2018-03-28
44997   2018-03-29
44998   2018-03-30
44999   2018-03-31
Name: date, Length: 958000, dtype: datetime64[ns]

In [6]:
def cols_new(data_df):
    data_df['year'] = data_df['date'].dt.year
    data_df['quarter'] = data_df['date'].dt.quarter
    data_df['month'] = data_df['date'].dt.month
    data_df['weekofyear'] = data_df['date'].dt.weekofyear
    data_df['dayofweek'] = data_df['date'].dt.dayofweek
    return data_df

In [7]:
cols_new(df)

  data_df['weekofyear'] = data_df['date'].dt.weekofyear


Unnamed: 0,date,store,item,sales,id,year,quarter,month,weekofyear,dayofweek
0,2013-01-01,1,1,13.0,,2013,1,1,1,1
1,2013-01-02,1,1,11.0,,2013,1,1,1,2
2,2013-01-03,1,1,14.0,,2013,1,1,1,3
3,2013-01-04,1,1,13.0,,2013,1,1,1,4
4,2013-01-05,1,1,10.0,,2013,1,1,1,5
...,...,...,...,...,...,...,...,...,...,...
44995,2018-03-27,10,50,,44995.0,2018,1,3,13,1
44996,2018-03-28,10,50,,44996.0,2018,1,3,13,2
44997,2018-03-29,10,50,,44997.0,2018,1,3,13,3
44998,2018-03-30,10,50,,44998.0,2018,1,3,13,4


In [8]:
%%time
def mean_cols(data,cols):
    for i in cols:
        cols=[e for e in cols if e not in (i)]
        for j in cols :
            if i!=j :
                data['mean_'+i+'_'+j]=data.groupby([i,j])['sales'].transform('mean')
    return data

Wall time: 0 ns


In [9]:
df.columns

Index(['date', 'store', 'item', 'sales', 'id', 'year', 'quarter', 'month',
       'weekofyear', 'dayofweek'],
      dtype='object')

In [10]:
%%time
mean_cols(df,['item','store','dayofweek','weekofyear','month','quarter'])
print(df.columns)

Index(['date', 'store', 'item', 'sales', 'id', 'year', 'quarter', 'month',
       'weekofyear', 'dayofweek', 'mean_item_store', 'mean_item_dayofweek',
       'mean_item_weekofyear', 'mean_item_month', 'mean_item_quarter',
       'mean_store_dayofweek', 'mean_store_weekofyear', 'mean_store_month',
       'mean_store_quarter', 'mean_dayofweek_weekofyear',
       'mean_dayofweek_month', 'mean_dayofweek_quarter',
       'mean_weekofyear_month', 'mean_weekofyear_quarter',
       'mean_month_quarter'],
      dtype='object')
Wall time: 1.62 s


In [11]:
def median_cols(data,cols):
    for i in cols:
        cols=[e for e in cols if e not in (i)]
        for j in cols :
            if i!=j :
                data['median_'+i+'_'+j]=data.groupby([i,j])['sales'].transform('median')
    return data

In [12]:
%%time
median_cols(df,['item','store','dayofweek','weekofyear','month','quarter'])
print(df.columns)

Index(['date', 'store', 'item', 'sales', 'id', 'year', 'quarter', 'month',
       'weekofyear', 'dayofweek', 'mean_item_store', 'mean_item_dayofweek',
       'mean_item_weekofyear', 'mean_item_month', 'mean_item_quarter',
       'mean_store_dayofweek', 'mean_store_weekofyear', 'mean_store_month',
       'mean_store_quarter', 'mean_dayofweek_weekofyear',
       'mean_dayofweek_month', 'mean_dayofweek_quarter',
       'mean_weekofyear_month', 'mean_weekofyear_quarter',
       'mean_month_quarter', 'median_item_store', 'median_item_dayofweek',
       'median_item_weekofyear', 'median_item_month', 'median_item_quarter',
       'median_store_dayofweek', 'median_store_weekofyear',
       'median_store_month', 'median_store_quarter',
       'median_dayofweek_weekofyear', 'median_dayofweek_month',
       'median_dayofweek_quarter', 'median_weekofyear_month',
       'median_weekofyear_quarter', 'median_month_quarter'],
      dtype='object')
Wall time: 1.84 s


In [13]:
train = df.loc[~df.sales.isna()]
test = df.loc[df.sales.isna()]

In [14]:
X_train = train.drop(['date','sales','id'], axis=1)
y_train = train['sales'].values
X_test = test.drop(['id','date','sales'], axis=1)

In [15]:
x_train, x_validate, y_train, y_validate = train_test_split(X_train, y_train, random_state=100, test_size=0.25)

In [16]:
%%time
params = {
    'colsample_bytree': 0.8,
    'eta': 0.1,
    'eval_metric': 'mae',
    'lambda': 1,
    'max_depth': 6,
    'objective': 'reg:linear',
    'seed': 0,
    'silent': 1,
    'subsample': 0.8,
    'verbosity' : 0,
}
xgbtrain = xgb.DMatrix(x_train, label=y_train)
xgbvalidate = xgb.DMatrix(x_validate, label=y_validate)
xgbmodel = xgb.train(list(params.items()), xgbtrain, early_stopping_rounds=50,
                     evals=[(xgbtrain, 'train'), (xgbvalidate, 'validate')], 
                     num_boost_round=200, verbose_eval=50)

[0]	train-mae:46.58235	validate-mae:46.55130
[50]	train-mae:5.57603	validate-mae:5.58829
[100]	train-mae:5.55684	validate-mae:5.58238
[150]	train-mae:5.54535	validate-mae:5.58198
[199]	train-mae:5.53488	validate-mae:5.58254
Wall time: 4min 22s


In [17]:
print(test.columns)
print(final.columns)

Index(['date', 'store', 'item', 'sales', 'id', 'year', 'quarter', 'month',
       'weekofyear', 'dayofweek', 'mean_item_store', 'mean_item_dayofweek',
       'mean_item_weekofyear', 'mean_item_month', 'mean_item_quarter',
       'mean_store_dayofweek', 'mean_store_weekofyear', 'mean_store_month',
       'mean_store_quarter', 'mean_dayofweek_weekofyear',
       'mean_dayofweek_month', 'mean_dayofweek_quarter',
       'mean_weekofyear_month', 'mean_weekofyear_quarter',
       'mean_month_quarter', 'median_item_store', 'median_item_dayofweek',
       'median_item_weekofyear', 'median_item_month', 'median_item_quarter',
       'median_store_dayofweek', 'median_store_weekofyear',
       'median_store_month', 'median_store_quarter',
       'median_dayofweek_weekofyear', 'median_dayofweek_month',
       'median_dayofweek_quarter', 'median_weekofyear_month',
       'median_weekofyear_quarter', 'median_month_quarter'],
      dtype='object')
Index(['id', 'date', 'store', 'item'], dtype='object')

In [18]:
model = xgbmodel
predict=model.predict(xgb.DMatrix(X_test),ntree_limit=model.best_ntree_limit)
dump(xgbmodel, 'xgbmodel_model.joblib')
final['sales'] = np.round(predict)



In [20]:
def predict_sales(date,store,item):
    # sample= np.array([id,sales]).reshape(1,-1)
    ans = final.groupby(['item','store','date'])['sales'].mean()
    return (ans.loc[(item, store, date)])

In [21]:
input_components = [
    gr.inputs.Textbox(label="Date(DD-MM-YYYY)"),
    gr.inputs.Number(label="Store"),
    gr.inputs.Number(label="Item")
]



In [22]:
interface = gr.Interface(
    fn=predict_sales,
    inputs=input_components,
    outputs=gr.outputs.Textbox(label="Sales prediction"),
    title="Predicting sales",
    description="Predicting 3 months of sales given 5 year data",
    layout="vertical",
    theme="compact"
)



In [23]:
interface.launch(debug = False,share=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://72018e24caa1d7562b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


