In [None]:
# Walk the Kaggle input directory and print the full path of every file found.
import os

for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

In [None]:
#import required libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Location of each CSV file, keyed by dataset name.
datasets = dict(
    train="/kaggle/input/rossmann-store-sales/train.csv",
    test="/kaggle/input/rossmann-store-sales/test.csv",
    store="/kaggle/input/rossmann-store-sales/store.csv",
)

In [None]:
# Read the store, train and test CSV files (in that order) into DataFrames.
store_data, train_data, test_data = (
    pd.read_csv(datasets[key]) for key in ("store", "train", "test")
)

# Data Info

In [None]:
train_data.info()

In [None]:
train_data.describe()

In [None]:
test_data.info()

In [None]:
test_data.describe()

In [None]:
store_data.info()

In [None]:
store_data.describe()

In [None]:
# Join the store attributes onto the training rows using the shared Store key.
df = pd.merge(store_data, train_data, on="Store", how="inner")
df.head()

In [None]:
# Correlation heatmap of the numeric columns of the merged training frame.
# Non-numeric columns are excluded explicitly: at this point the frame still holds
# string columns, and DataFrame.corr() no longer drops them silently on pandas >= 2.0.
plt.figure(figsize=(20,12))
sns.heatmap(df.select_dtypes(include=[np.number]).corr(), linewidths=0.1, vmax=1.0, 
            square=True, cmap=plt.cm.RdBu, linecolor='white', annot=True)
plt.show()

In [None]:
df.hist(figsize=(20,12), color="#107009AA", legend=True)
plt.show()

In [None]:
# Join the store attributes onto the test rows using the shared Store key.
test_df = pd.merge(store_data, test_data, on="Store", how="inner")

In [None]:
test_df.info()

# Handling Missing Values

In [None]:
# Percentage of missing values in each column (mean of the NaN mask, times 100).
df.isna().mean() * 100

В колонках Promo2SinceWeek, Promo2SinceYear и PromoInterval пропущено более 30% данных, поэтому работать с ними будет очень тяжело, а восстановление пропусков может сильно ухудшить работу модели.

In [None]:
# Remove the promo columns that were judged too sparse to impute.
drop_missing_values_columns = ["Promo2SinceWeek", "Promo2SinceYear", "PromoInterval"]
df = df.drop(columns=drop_missing_values_columns)

In [None]:
df[["CompetitionDistance", "CompetitionOpenSinceMonth", "CompetitionOpenSinceYear"]].hist(figsize=(20,12), color="#107009AA", legend=True)
plt.show()

In [None]:
# Impute the competition columns of the training frame.
# The filled result is assigned back to the column: calling fillna(inplace=True) on
# `df.<column>` is a chained in-place call that is not guaranteed to update `df`
# (pandas warns about it and it is a no-op under copy-on-write).
df["CompetitionDistance"] = df["CompetitionDistance"].fillna(df["CompetitionDistance"].mode()[0])
df["CompetitionOpenSinceYear"] = df["CompetitionOpenSinceYear"].fillna(df["CompetitionOpenSinceYear"].mode()[0])
# The month is taken from the neighbouring rows: forward fill first, then backward
# fill for any gaps left at the start of the frame.
df["CompetitionOpenSinceMonth"] = df["CompetitionOpenSinceMonth"].ffill().bfill()

In [None]:
df[["CompetitionDistance", "CompetitionOpenSinceMonth", "CompetitionOpenSinceYear"]].hist(figsize=(20,12), color="#107009AA", legend=True)
plt.show()

In [None]:
# Percentage of missing values in each test column.
# The denominator is the number of test rows (the training row count was used before,
# which understated every percentage).
test_df.isna().sum() / test_df.shape[0] * 100

In [None]:
# Drop the same promo columns that were removed from the training frame.
drop_missing_values_columns = ["Promo2SinceWeek", "Promo2SinceYear", "PromoInterval"]
test_df = test_df.drop(columns=drop_missing_values_columns)
# Missing Open flags take the most frequent Open value of the test set. They were
# previously filled with the training CompetitionDistance mode, which is a distance,
# not a flag value.
test_df["Open"] = test_df["Open"].fillna(test_df["Open"].mode()[0])
# Results are assigned back to the columns instead of relying on chained
# fillna(inplace=True), which is not guaranteed to modify test_df.
test_df["CompetitionDistance"] = test_df["CompetitionDistance"].fillna(test_df["CompetitionDistance"].mode()[0])
test_df["CompetitionOpenSinceYear"] = test_df["CompetitionOpenSinceYear"].fillna(test_df["CompetitionOpenSinceYear"].mode()[0])
test_df["CompetitionOpenSinceMonth"] = test_df["CompetitionOpenSinceMonth"].ffill().bfill()

In [None]:
test_df.info()

# Handling Outliers

In [None]:
def outliers_plot(dataframe, labels, typeOfPlot=0):
    """Plot labels['y'] against labels['x'] to inspect outliers visually.

    Args:
        dataframe: frame holding the two columns to plot.
        labels: dict with keys 'x' and 'y' naming the columns.
        typeOfPlot: 0 draws a scatter plot, 1 a boxen plot, 2 draws both;
            any other value draws nothing.
    """
    wants_scatter = typeOfPlot in (0, 2)
    wants_boxen = typeOfPlot in (1, 2)

    if wants_scatter:
        plt.figure(figsize=(7,7))
        sns.scatterplot(data=dataframe, x=labels['x'], y=labels['y'])
        plt.show()

    if wants_boxen:
        plt.figure(figsize=(7,7))
        # Switches the global seaborn theme; it stays active for later plots too.
        sns.set(style="whitegrid")
        sns.boxenplot(data=dataframe, scale="linear", x=labels['x'], y=labels['y'], color="orange")
        plt.show()

In [None]:
outliers_plot(df, {'x': "DayOfWeek", 'y': "Sales"}, 1)

In [None]:
# Cap Sales at 30000, then redraw the Sales-by-DayOfWeek plots.
df["Sales"] = df["Sales"].clip(upper=30000)
outliers_plot(df, {'x': "DayOfWeek", 'y': "Sales"}, 2)

In [None]:
outliers_plot(df, {'x': "Customers", 'y': "Sales"}, 0)

In [None]:
# Cap Customers at 5000, then redraw the Customers-vs-Sales scatter plot.
df["Customers"] = df["Customers"].clip(upper=5000)
outliers_plot(df, {'x': "Customers", 'y': "Sales"}, 0)

# Handling duplicates

In [None]:
df.duplicated(subset=['Date', 'Store']).unique()

# Time columns

In [None]:
# data extraction
df['Date'] = pd.to_datetime(df["Date"])
df['Year'] = df["Date"].dt.year
df['Month'] = df["Date"].dt.month
df['Day'] = df["Date"].dt.day
df['WeekOfMonth'] = df["Date"].dt.isocalendar().week%4
df["Season"] = np.where(df["Month"].isin([3,4]),"Spring",np.where(df["Month"].isin([5,6,7,8]), "Summer",np.where(df["Month"].isin([9,10,11]),"Fall",np.where(df["Month"].isin([12,1,2]),"Winter","None"))))

In [None]:
# Helper frame with one row per distinct Date, used to aggregate per (Year, Month, WeekOfMonth).
# NOTE(review): drop_duplicates keeps only the first row found for each date, so the
# values below come from a single row per day - confirm that is the intended aggregation.
week_data = pd.DataFrame({"Holiday per week": df["SchoolHoliday"],"WeekOfMonth": df["WeekOfMonth"],"Month": df["Month"],"Year": df["Year"],"Date": df["Date"]})
week_data = week_data.drop_duplicates(subset=['Date'])
# These assignments align on the row index, so only the retained rows receive values.
week_data["Customers per Week"] = df["Customers"]
week_data["Promo per Week"] = df["Promo"]
# Date is a datetime column and cannot be summed; restrict the aggregation to numeric
# columns so the cell also runs on pandas versions that no longer drop it silently.
week_data.groupby(["Year","Month","WeekOfMonth"]).sum(numeric_only=True)

In [None]:
# Sum "Holiday per week" for every (Year, Month, WeekOfMonth) group and attach it to df
# as a new column. Re-running this cell adds the column again under suffixed names.
weekly_holidays = week_data.groupby(["Year", "Month", "WeekOfMonth"])["Holiday per week"].sum()
df = df.merge(weekly_holidays, on=["Year", "Month", "WeekOfMonth"], how="inner")

In [None]:
# Calendar features derived from the date of the test rows (same recipe as for training).
test_df['Date'] = pd.to_datetime(test_df["Date"])
test_date_parts = test_df["Date"].dt
test_df['Year'] = test_date_parts.year
test_df['Month'] = test_date_parts.month
test_df['Day'] = test_date_parts.day
# NOTE(review): this is the ISO week-of-year modulo 4, not a calendar week-of-month - confirm intent.
test_df['WeekOfMonth'] = test_date_parts.isocalendar().week%4
# Month -> season label; the first matching condition wins, "None" if nothing matches.
test_month = test_df["Month"]
test_df["Season"] = np.select(
    [test_month.isin([3,4]), test_month.isin([5,6,7,8]), test_month.isin([9,10,11]), test_month.isin([12,1,2])],
    ["Spring", "Summer", "Fall", "Winter"],
    default="None",
)

In [None]:
# Helper frame with one row per distinct test Date.
week_test_data = pd.DataFrame({
    "Holiday per week": test_df["SchoolHoliday"],
    "WeekOfMonth": test_df["WeekOfMonth"],
    "Month": test_df["Month"],
    "Year": test_df["Year"],
    "Date": test_df["Date"],
}).drop_duplicates(subset=['Date'])
# Sum the holiday flag per (Year, Month, WeekOfMonth) and attach it to test_df as a new column.
weekly_test_holidays = week_test_data.groupby(["Year", "Month", "WeekOfMonth"])["Holiday per week"].sum()
test_df = test_df.merge(weekly_test_holidays, on=["Year", "Month", "WeekOfMonth"], how="inner")

In [None]:
test_df.info()

# Handling Categorical Data

In [None]:
def column_hist(c):
    """Print the share (in %) of each value of column `c` of the global `df` and draw a bar chart of the counts.

    The percentages are relative to the full column length, so missing values
    (which value_counts skips) lower the total below 100.
    """
    counts = df[c].value_counts()
    total_rows = len(df[c])
    print("Values in %")
    print(100 * counts / total_rows)
    counts.plot(kind='bar', color="#107009AA")
    plt.show()

In [None]:
# Show the value distribution of every object-typed (string) column of the training frame.
categorical_data = df.select_dtypes(include=[object])
for feature in categorical_data.columns:
    column_hist(feature)

In [None]:
# Replace every string column with integer codes, using ONE shared mapping for the
# training and the test frame. Encoding each frame on its own numbers the labels by
# the values present in that frame, so the same label (e.g. a Season that appears in
# both) could receive different codes in train and test.
for feature in df.select_dtypes(include=[object]).columns:
    if feature == 'Date':
        continue
    # Cast to str so values of mixed Python types collapse onto one label.
    train_labels = df[feature].astype(str)
    shared_categories = sorted(train_labels.unique())
    df[feature] = pd.Categorical(train_labels, categories=shared_categories).codes
    # Encode the matching test column with the training categories; labels never seen
    # in training become -1. The dtype guard keeps a re-run from re-encoding integers.
    if feature in test_df.columns and test_df[feature].dtype == object:
        test_df[feature] = pd.Categorical(test_df[feature].astype(str), categories=shared_categories).codes

In [None]:
# Turn every remaining object-typed column of the test frame (except Date) into integer category codes.
# NOTE(review): the codes are derived from the test frame alone - verify they match the training encoding.
test_object_columns = [
    column for column in test_df.select_dtypes(include=[object]).columns
    if column != 'Date'
]
for column in test_object_columns:
    test_df[column] = test_df[column].astype('category').cat.codes

# Data correlation analysis

In [None]:
# Correlation heatmap after feature engineering and categorical encoding.
# Only numeric columns are correlated: the frame now contains the datetime Date column,
# which DataFrame.corr() does not skip automatically on pandas >= 2.0.
plt.figure(figsize=(20,12))
sns.heatmap(df.select_dtypes(include=[np.number]).corr(), linewidths=0.1, vmax=1.0, 
            square=True, cmap=plt.cm.RdBu, linecolor='white', annot=True)
plt.show()

In [None]:
# Remove the columns selected from the correlation analysis.
# DataFrame.drop returns a new frame, so the result has to be assigned back; the
# previous bare call left `df` unchanged. errors="ignore" keeps the cell re-runnable.
drop_corr_columns = ["Open"]
df = df.drop(columns=drop_corr_columns, errors="ignore")
df.head()

In [None]:
df.hist(figsize=(20,12), color="#107009AA", legend=True)
plt.show()

In [None]:
# Remove Open from the test frame as well. The result is assigned back because
# DataFrame.drop does not modify the frame it is called on; errors="ignore" keeps
# the cell re-runnable.
test_df = test_df.drop(columns=["Open"], errors="ignore")
test_df.head()

# StoreType analysis

In [None]:
df.groupby('StoreType')['Sales'].describe()

In [None]:
# Total Customers and Sales per StoreType.
# Several columns must be selected with a list; the bare 'Customers', 'Sales' pair is
# read as a tuple key, which current pandas no longer accepts on a groupby.
df.groupby('StoreType')[['Customers', 'Sales']].sum()

In [None]:
# Point plot of Sales by Month, one panel per StoreType (columns) and Promo value (rows).
# sns.factorplot is no longer available in current seaborn; catplot is its replacement,
# and kind='point' keeps the point-plot type that factorplot drew by default.
sns.catplot(data = df, x = 'Month', y = "Sales", 
               kind = 'point',
               col = 'StoreType',
               palette = 'plasma',
               hue = 'StoreType',
               row = 'Promo',
               color = 'red') 

In [None]:
# Point plot of Customers by Month, one panel per StoreType (columns) and Promo value (rows).
# sns.factorplot is no longer available in current seaborn; catplot is its replacement,
# and kind='point' keeps the point-plot type that factorplot drew by default.
sns.catplot(data = df, x = 'Month', y = "Customers", 
               kind = 'point',
               col = 'StoreType',
               palette = 'plasma',
               hue = 'StoreType',
               row = 'Promo',
               color = 'red') 

In [None]:
# Point plot of Sales by Month, one panel per DayOfWeek (columns) and StoreType (rows).
# sns.factorplot is no longer available in current seaborn; catplot is its replacement,
# and kind='point' keeps the point-plot type that factorplot drew by default.
sns.catplot(data = df, x = 'Month', y = "Sales", 
               kind = 'point',
               col = 'DayOfWeek',
               palette = 'plasma',
               hue = 'StoreType',
               row = 'StoreType',
               color = 'red') 

In [None]:
df.info()

In [None]:
test_df.info()

# Modeling

In [None]:
# Positional 80/20 split: the first 80% of the rows train the models, the rest is held out.
# NOTE(review): the split follows the current row order of df (no shuffling, not
# date-based), and the features keep every column except Date and Sales - confirm
# that all of them are also available in the test frame.
split_at = int(df.shape[0]*80/100)
train_part, holdout_part = df[:split_at], df[split_at:]
X_train = train_part.drop(["Date", "Sales"], axis=1)
Y_train = train_part["Sales"]
X_test = holdout_part.drop(["Date", "Sales"], axis=1)
Y_test = holdout_part["Sales"]

In [None]:
def ToWeight(y):
    """Return the per-sample weights 1 / y**2, with weight 0 wherever y == 0.

    Args:
        y: array-like of target values.

    Returns:
        Float ndarray with the same shape as `y`.
    """
    # Work on a plain float array so the boolean mask is applied by position.
    y = np.asarray(y, dtype=float)
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w

def RMSPE(y_predict, y):
    """Root mean squared percentage error of `y_predict` against the actual values `y`.

    Each squared error is divided by the squared ACTUAL value. Samples whose actual
    value is 0 get weight 0, but still count in the mean's denominator.

    Args:
        y_predict: array-like of predictions.
        y: array-like of actual values, same length as `y_predict`.

    Returns:
        The RMSPE as a float.
    """
    # Positional arrays avoid index alignment between a Series and an ndarray.
    y_predict = np.asarray(y_predict, dtype=float)
    y = np.asarray(y, dtype=float)
    # Weights come from the actual values; deriving them from the predictions
    # (as before) does not measure a percentage error.
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean( w * (y_predict - y)**2 ))
    return rmspe 

In [None]:
def train_and_predict(name, algorithm, train_data, test_data):
    """Fit `algorithm`, evaluate it on the hold-out data and return its RMSPE.

    Args:
        name: label printed in the report.
        algorithm: estimator exposing fit, predict and score.
        train_data: dict with the training features under 'x' and target under 'y'.
        test_data: dict with the hold-out features under 'x' and target under 'y'.

    Returns:
        The RMSPE of the (non-negative) predictions on the hold-out data.
    """
    algorithm.fit(train_data['x'], train_data['y'])
    print(name, " model")
    # Predict with the estimator being evaluated. The global LightGBM model was used
    # here before, so every entry reported that model's error.
    y_pred = algorithm.predict(test_data['x'])
    # Negative predictions are replaced by 0.
    y_pred = np.clip(y_pred, 0, None)
    rsmpe = RMSPE(y_pred, test_data['y'])
    print("RSMPE : ", rsmpe)
    print("Model score : ",algorithm.score(test_data['x'], test_data['y']))
    pred_df=pd.DataFrame({'Predictions':y_pred,'Actual':test_data['y']}).reset_index(drop=True)
    print(pred_df.head())
    print('-'*100)
    return rsmpe

In [None]:
import lightgbm as lgb
# Gradient-boosted tree regressor.
# `num_leaves` is the LightGBM parameter for the maximum number of leaves per tree;
# `n_leaves` (used before) is not one of its names, so the value 50 was not applied.
base_lgb_model = lgb.LGBMRegressor(num_leaves=50,
                               n_estimators=700, 
                               max_depth=-1,  
                               learning_rate=0.3, #0.1
                               subsample=1,
                               colsample_bytree=0.8,
                               reg_alpha=0.1,
                               reg_lambda=1)

In [None]:
from sklearn.linear_model import LinearRegression
linear_reg_model=LinearRegression()

In [None]:
models = {
    "LGBMRegressor": base_lgb_model,
    "LinearRegression": linear_reg_model
}

In [None]:
# Train and evaluate every candidate model; collect the returned RMSPE per model name.
models_rsmpe = {}
for model_name, estimator in models.items():
    fit_split = {'x': X_train, 'y': Y_train}
    holdout_split = {'x': X_test, 'y': Y_test}
    models_rsmpe[model_name] = train_and_predict(model_name, estimator, fit_split, holdout_split)

In [None]:
# Report all scores, then the model with the lowest RMSPE (the first one wins a tie).
print(models_rsmpe)
best_name, best_rsmpe = min(models_rsmpe.items(), key=lambda item: item[1])
print("Min rsmpe :", best_rsmpe, "in  model : ", best_name)

In [None]:
# Final training matrix and target from the full training frame.
Y_final_train = df["Sales"]
X_final_train = df.drop(["Date","Sales"], axis=1)
# Test features without Date; the Id column is kept aside for the submission file.
test_features = test_df.drop(["Date"], axis=1)
test_id = test_features["Id"]
X_final_test = test_features.drop(["Id"], axis=1)

In [None]:
X_final_train, X_final_test = X_final_train.align(X_final_test, join = 'inner', axis = 1)
print('Train data : ', X_final_train.shape)
print('Test data : ', X_final_test.shape)

In [None]:
# Refit the model with the lowest validation RMSPE on the full training data.
best_model_name = min(models_rsmpe.items(), key=lambda x: x[1])[0]
model = models[best_model_name]
model.fit(X_final_train, Y_final_train)
# Predict with the model that was just refitted. The global LightGBM model was used
# here before, regardless of which model had been selected.
y_pred = model.predict(X_final_test)
# Same post-processing as in train_and_predict: negative predictions become 0.
y_pred = np.clip(y_pred, 0, None)

In [None]:
# Build the submission file from the predictions and their Ids.
submit = pd.DataFrame(y_pred, columns = ['Sales'])
# Attach the Ids by position: assigning the Series itself aligns on the index and
# would produce NaN Ids whenever test_id's index is not exactly 0..n-1.
submit['Id'] = test_id.to_numpy()
submit.to_csv('submission.csv', index = False)