In [1]:
import seaborn as sns
import plotly
import matplotlib as plt
import pandas as pd
import numpy as np
import statsmodels as sm
import copy
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt

In [2]:
# Silence every warning for the rest of the session. Note that this also hides
# pandas warnings (for example about chained assignment), so problems in the
# cells below will not be reported.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the input tables. Apart from 'after_vs.csv' (relative to the working
# directory) every path is an absolute path on the author's machine and has to
# be adjusted before the notebook can run anywhere else.
# NOTE(review): 'after_vs.csv' is presumably the output of an earlier preprocessing step — confirm.
data = pd.read_csv('after_vs.csv')
raw_data = pd.read_csv('/home/siarhei/Programming/ML/Data/Predict Future Sales/sales_train.csv')
test = pd.read_csv('/home/siarhei/Programming/ML/Data/Predict Future Sales/test.csv')
items = pd.read_csv('/home/siarhei/Programming/ML/time folder/from_pc/internship/Data/items.csv')
shops = pd.read_csv('/home/siarhei/Programming/ML/time folder/from_pc/internship/Data/shops.csv')
# NOTE(review): items_desc reads the same items.csv as `items` above — confirm whether another file was intended.
items_desc = pd.read_csv('/home/siarhei/Programming/ML/time folder/from_pc/internship/Data/items.csv')

## Data preparation

In [4]:
class Feature_extraction:
    """Incrementally add engineered columns to a sales DataFrame.

    Each ``add_*`` method works on a copy of the current frame and stores the
    result on ``self.data``; the frame handed to the constructor is never
    modified. Retrieve the current frame with ``get_data``.

    Columns read by the methods: ``date_block_num`` (all methods), ``item_id``,
    ``shop_id``, ``item_cnt_day`` and ``date`` (see the individual methods).
    """

    def __init__(self, data):
        """Store ``data`` (a pandas DataFrame) as the starting frame."""
        self.data = data

    @staticmethod
    def _months_since_first(matrix, key):
        """Return ``date_block_num`` minus the smallest ``date_block_num`` of each ``key`` group."""
        first_block = matrix.groupby(key)['date_block_num'].transform('min')
        return matrix['date_block_num'] - first_block

    def add_global_item_age(self):
        """Add ``item_age``: blocks elapsed since the item's first ``date_block_num``.

        The resulting frame gets a fresh 0..n-1 index.
        """
        matrix = self.data.copy()
        matrix['item_age'] = self._months_since_first(matrix, 'item_id')
        self.data = matrix.reset_index(drop=True)

    def add_sales_interval(self, fill_na=True, value = 6666.0):
        """Add ``date_block_num_diff``: per-item difference to the previous row's block.

        The difference follows the current row order, so sort the frame first
        (see ``sort_data``). A difference of exactly 1 is stored as 0; every
        other difference is stored unchanged.

        Parameters
        ----------
        fill_na : bool
            When true, rows without a previous row for the item get ``value``.
        value : float
            Placeholder written for those rows.
        """
        matrix = self.data.copy()
        gap = matrix.groupby('item_id')['date_block_num'].diff()
        # Consecutive blocks (difference of 1) are recorded as 0.
        gap = gap.mask(gap == 1, 0.0)
        if fill_na:
            # fillna on the Series replaces the former chained assignment
            # (matrix[col][idx] = value), which may write to a temporary copy.
            gap = gap.fillna(value)
        matrix['date_block_num_diff'] = gap
        self.data = matrix

    def add_avg_sales(self, nan_values=0.0):
        """Add ``average_prev_sales`` per row.

        For a row in block ``b > 0`` the value is the item's summed
        ``item_cnt_day`` over all rows of earlier blocks divided by ``b``.
        Rows of block 0, and items without earlier rows, get ``nan_values``.

        NOTE(review): as before, the final ``fillna`` is applied to the whole
        frame, so NaNs in every other column are replaced by ``nan_values``
        too — confirm that this is intended.
        """
        df = self.data.copy()
        df['average_prev_sales'] = np.nan

        for block in df['date_block_num'].unique():
            in_block = df['date_block_num'] == block
            if block == 0:
                # No history exists for the first block.
                df.loc[in_block, 'average_prev_sales'] = nan_values
                continue
            history = df[df['date_block_num'] < block]
            avg_per_item = history.groupby('item_id')['item_cnt_day'].sum() / block
            df.loc[in_block, 'average_prev_sales'] = df.loc[in_block, 'item_id'].map(avg_per_item)

        self.data = df.fillna(nan_values)

    def add_shop_age(self):
        """Add ``shop_age_in_months``: blocks elapsed since the shop's first ``date_block_num``.

        The resulting frame gets a fresh 0..n-1 index.
        """
        matrix = self.data.copy()
        matrix['shop_age_in_months'] = self._months_since_first(matrix, 'shop_id')
        self.data = matrix.reset_index(drop=True)

    def add_store_interval(self, fill_na=True, value = 9999.9):
        """Add ``month_from_prev_sale``: per-shop difference to the previous row's block.

        The frame is first sorted by ``shop_id``, ``date_block_num`` and
        ``date`` and stays in that order afterwards.

        Parameters
        ----------
        fill_na : bool
            When true, the first row of every shop gets ``value``.
        value : float
            Placeholder written for those rows.
        """
        # sort_values returns a new frame, so the stored frame is left untouched.
        matrix = self.data.sort_values(by=['shop_id','date_block_num','date'])
        gap = matrix.groupby('shop_id')['date_block_num'].diff()
        if fill_na:
            gap = gap.fillna(value)
        matrix['month_from_prev_sale'] = gap
        self.data = matrix

    def add_monthly_sales(self):
        """Add ``monthly_sales``: ``item_cnt_day`` summed per (block, shop, item)."""
        matrix = self.data.copy()
        keys = ['date_block_num','shop_id','item_id']
        matrix['monthly_sales'] = matrix.groupby(keys)['item_cnt_day'].transform('sum')
        self.data = matrix

    def sort_data(self, column_name):
        """Sort the current frame by ``column_name`` (a label or list of labels)."""
        # Re-bind instead of sorting in place so the caller's frame is not reordered.
        self.data = self.data.sort_values(by=column_name)

    def get_data(self):
        """Return the current frame."""
        return self.data

#See the "DS 2.1 VS" notebook, where the preprocessing was done; the pipeline below is kept for reference only

#a = copy.deepcopy(data)
#FE = Feature_extraction(a)
#FE.sort_data('date_block_num')
#FE.add_sales_interval()
#FE.add_monthly_sales()
#FE.add_shop_age()
#FE.add_store_interval()
#FE.add_avg_sales()
#FE.add_global_item_age()
#FE.sort_data('date_block_num')
#df = FE.get_data()

## Model validation

In [52]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

class ModelValidator:
    """Build train/test splits of a monthly sales frame and tune or evaluate regressors.

    The frame must contain the ``target_name`` column and a ``date_block_num``
    column. Split helpers store their result on ``train_x``/``train_y``/
    ``test_x``/``test_y``; grid-search helpers fill ``best_params_models``.
    No method modifies ``full_data`` or the frame passed to the constructor.
    """

    def __init__(self, data, target_name, param_grid):
        """
        Parameters
        ----------
        data : pandas.DataFrame
            Full frame including the target column.
        target_name : str
            Name of the target column.
        param_grid : dict
            Stored on the instance; the grid-search methods take their grid as
            an explicit argument.
        """
        self.model = None
        self.full_data = data
        self.target_data = data[[target_name]]
        self.target_name = target_name
        # Feature frame: every column except the target.
        self.data = data.drop(target_name, axis = 1)
        self.tscv = None
        self.train_x = None
        self.train_y = None
        self.test_x = None
        self.test_y = None
        self.param_grid = param_grid
        self.best_models = {}
        self.best_params_models = {}

    # ------------------------------------------------------------------ helpers
    def _require_model(self):
        """Fail early with a clear message when no estimator has been assigned."""
        if self.model is None:
            raise ValueError('No model set: assign one with set_model() before training.')

    def _indexed_by_month(self):
        """Return ``full_data`` indexed by ``date_block_num`` without touching ``full_data``."""
        if 'date_block_num' in self.full_data.columns:
            return self.full_data.set_index('date_block_num')
        # Already indexed by the caller: use as is.
        return self.full_data

    def _split_xy(self, frame):
        """Split ``frame`` into (features without the target column, target Series)."""
        return frame.drop(columns=[self.target_name]), frame[self.target_name]

    # ---------------------------------------------------------------- accessors
    def set_model(self, model):
        """Store the estimator used by ``train_model`` and ``train_sliding_windows``."""
        self.model = model

    def get_model(self):
        """Return the stored estimator (``None`` until one is assigned)."""
        return self.model

    def get_test(self):
        """Return the ``(test_x, test_y)`` pair produced by the last split helper."""
        return self.test_x, self.test_y

    def get_train(self):
        """Return the ``(train_x, train_y)`` pair produced by the last split helper."""
        return self.train_x, self.train_y

    # -------------------------------------------------------------- grid search
    def create_k_folds(self, data, n_splits=33):
        """Yield ``(train_data, test_data)`` frames from a KFold over the unique months.

        The folds are built over the array of unique ``date_block_num`` values
        (in order of first appearance); every row of a month lands on the same
        side of a split.
        """
        months = data['date_block_num'].unique()
        splitter = KFold(n_splits=n_splits)
        for train_pos, test_pos in splitter.split(months):
            in_train = data['date_block_num'].isin(months[train_pos])
            in_test = data['date_block_num'].isin(months[test_pos])
            yield (data[in_train], data[in_test])

    def grid_search_with_date_block_num_k_folds(self, models, param_grid, n_splits=33):
        """Run a GridSearchCV per model on every month-based fold and print the scores.

        Parameters
        ----------
        models : dict
            Maps a model name to an estimator.
        param_grid : dict
            Maps the same names to the parameter grid of that estimator.
        n_splits : int
            Number of month folds, forwarded to ``create_k_folds``.

        ``best_params_models[name]`` ends up holding the
        ``(best_params_, best_estimator_)`` pair of the last fold processed.
        """
        self.best_params_models = {}

        for name, estimator in models.items():
            for train_data, test_data in self.create_k_folds(self.full_data, n_splits):
                print(train_data.shape[0])
                train_features, train_target = self._split_xy(train_data)
                test_features, test_target = self._split_xy(test_data)

                grid_search = GridSearchCV(estimator, param_grid[name], return_train_score=True)
                grid_search.fit(train_features, train_target)

                print(f'Model: {name}')
                print("Best score: ", grid_search.best_score_)
                print("Best parameters: ", grid_search.best_params_)
                print("Test Score: ", grid_search.score(test_features, test_target))
                print("="*50)

                # NOTE(review): overwritten on every fold, so only the last fold's result is kept.
                self.best_params_models[name] = (grid_search.best_params_, grid_search.best_estimator_)

    def grid_search_with_time_series_split(self, models, param_grid, n_splits=33):
        """Run a GridSearchCV per model using a ``TimeSeriesSplit`` over the rows.

        ``date_block_num`` is moved into the index, so it is not used as a
        feature. The split is made over row positions, hence rows should be in
        chronological order.

        Parameters
        ----------
        models : dict
            Maps a model name to an estimator.
        param_grid : dict
            Maps the same names to the parameter grid of that estimator.
        n_splits : int
            Number of time-series splits (default 33, the former fixed value).
        """
        features, target = self._split_xy(self._indexed_by_month())
        self.tscv = TimeSeriesSplit(n_splits=n_splits)
        self.best_params_models = {}

        for name, estimator in models.items():
            grid_search = GridSearchCV(estimator, param_grid[name], cv=self.tscv, return_train_score=True)
            grid_search.fit(features, target)

            print(f'Model: {name}')
            print("Best score: ", grid_search.best_score_)
            print("Best parameters: ", grid_search.best_params_)

            self.best_params_models[name] = (grid_search.best_params_, grid_search.best_estimator_)

    # ----------------------------------------------------------------- training
    def train_model(self, report_from=32):
        """Fit and predict on every stored window; print metrics from batch ``report_from`` on.

        Expects the list-of-windows layout produced by ``expanding_windows``.

        Raises
        ------
        ValueError
            If no model has been set or no windows have been prepared.
        """
        self._require_model()
        if self.train_x is None:
            raise ValueError('No splits available: call expanding_windows() first.')

        for i in range(len(self.train_x)):
            self.model.fit(self.train_x[i], self.train_y[i])
            y_pred = self.model.predict(self.test_x[i])
            print(f'Batch : {i}')
            if i >= report_from:
                print(f'Batch : {i}\nMSE : {mean_squared_error(self.test_y[i], y_pred)}\nMAE : {mean_absolute_error(self.test_y[i], y_pred)}\n R2 : {r2_score(self.test_y[i], y_pred)}')

    def split_data(self, n_splits=33):
        """Walk through a ``TimeSeriesSplit`` of the rows and keep the final split.

        Every iteration overwrites ``train_x``/``train_y``/``test_x``/``test_y``,
        so after the call they hold the last split. The target column is not
        part of the feature frames, and ``date_block_num`` is the index.
        """
        self.tscv = TimeSeriesSplit(n_splits=n_splits)
        indexed = self._indexed_by_month()
        features, target = self._split_xy(indexed)

        for train_index, test_index in self.tscv.split(indexed):
            self.train_x, self.train_y = features.iloc[train_index, :], target.iloc[train_index]
            print(self.train_x.shape)
            self.test_x, self.test_y = features.iloc[test_index, :], target.iloc[test_index]

    def train_sliding_windows(self, window_size=7, step=1):
        """Fit on ``window_size`` months, test on the next ``step`` months, then slide by ``step``.

        Windows start at month 0 and stop once the first test month would
        exceed the largest ``date_block_num``. Metrics are printed per window.

        Raises
        ------
        ValueError
            If no model has been set or ``step`` is smaller than 1.
        """
        self._require_model()
        if step < 1:
            raise ValueError('step must be at least 1.')

        # Index a local frame by month (the column itself stays a feature); a stable
        # sort makes label slicing valid for rows that are not in month order.
        month_index = self.full_data['date_block_num'].rename('date_block_num_x')
        frame = self.full_data.set_index(month_index).sort_index(kind='stable')
        features, target = self._split_xy(frame)
        last_month = frame.index.max()

        start_index = 0
        end_index = window_size
        while end_index <= last_month:
            # Label slices are inclusive on both ends.
            train_x = features.loc[start_index:end_index-1]
            train_y = target.loc[start_index:end_index-1]
            test_x = features.loc[end_index:end_index+step-1]
            test_y = target.loc[end_index:end_index+step-1]

            self.model.fit(train_x, train_y)
            y_pred = self.model.predict(test_x)

            print(f'\nMSE : {mean_squared_error(test_y, y_pred)}\nMAE : {mean_absolute_error(test_y, y_pred)}\nR2 : {r2_score(test_y, y_pred)}\n')

            start_index += step
            end_index += step

    # ------------------------------------------------------------------- splits
    # Author's note: works better than the expanding windows.
    def fixed_split(self, months_to_train):
        """Train on months ``<= months_to_train`` and test on the later months.

        Targets are selected with the same row masks as the features, so the
        split is correct for any row order.
        """
        in_train = (self.data['date_block_num'] <= months_to_train).to_numpy()
        in_test = (self.data['date_block_num'] > months_to_train).to_numpy()
        self.train_x = self.data.loc[in_train]
        self.train_y = self.target_data.loc[in_train]
        self.test_x = self.data.loc[in_test]
        self.test_y = self.target_data.loc[in_test]

    # Author's note: overfits to the first month(s).
    def expanding_windows(self, num_of_windows=33):
        """Build expanding windows: window ``i`` trains on months ``0..i`` and tests on month ``i+1``.

        Months are taken in ascending order. The results are lists (one entry
        per window) of nested row lists. At most ``number of months - 1``
        windows are built, even if ``num_of_windows`` is larger.

        Raises
        ------
        ValueError
            If the data contains fewer than two distinct months.
        """
        months = sorted(self.data['date_block_num'].unique())
        if len(months) < 2:
            raise ValueError('expanding_windows needs at least two distinct date_block_num values.')
        # Every window needs one further month to test on.
        window_count = min(num_of_windows, len(months) - 1)

        rows_x_by_month = []
        rows_y_by_month = []
        for month in months:
            in_month = (self.data['date_block_num'] == month).to_numpy()
            rows_x_by_month.append(self.data.loc[in_month].values.tolist())
            rows_y_by_month.append(self.target_data.loc[in_month].values.tolist())

        self.train_x = [rows_x_by_month[0]]
        self.train_y = [rows_y_by_month[0]]
        self.test_x = [rows_x_by_month[1]]
        self.test_y = [rows_y_by_month[1]]

        for i in range(1, window_count):
            self.train_x.append(self.train_x[-1] + rows_x_by_month[i])
            self.train_y.append(self.train_y[-1] + rows_y_by_month[i])
            self.test_x.append(rows_x_by_month[i+1])
            self.test_y.append(rows_y_by_month[i+1])

In [53]:
from sklearn.linear_model import Lasso
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.ensemble import AdaBoostRegressor

In [49]:
# Keep one row per (month, shop, item) to avoid overfitting on duplicates;
# for every key the last non-null value of each column is kept.
# NOTE(review): no active cell visible in this notebook defines `df` (the pipeline
# that produces it is commented out above) — make sure it exists before running.
df = df.groupby(['date_block_num', 'shop_id', 'item_id']).last().reset_index()

# Split by the is_train flag and drop the helper columns on the selected rows.
# Dropping on the filtered result (instead of inplace on a slice of df) avoids
# pandas' SettingWithCopyWarning.
# NOTE: `test` rebinds the name that held the raw test.csv frame loaded earlier.
test = df[df['is_train'] == False].drop(
    columns=['date', 'item_cnt_day', 'is_train', 'monthly_sales', 'Unnamed: 0']
)
train = df[df['is_train'] == True].drop(
    columns=['date', 'item_cnt_day', 'is_train', 'ID', 'Unnamed: 0']
)

In [None]:
# Hyper-parameter grids, keyed by the same names as `models` below.
param_grid = {
    'lr_lasso': {'alpha': [0.1, 0.3, 0.5, 0.7, 1.0]},
    'lr_ridge': {'alpha': [0.1, 0.3, 0.5, 0.7, 1.0]},
    'elastic_net': {'alpha': [0.1, 0.3, 0.5, 0.7, 1.0], 'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]},
    'decision_trees': {'max_depth': [3, 5, 7, 9, 11]},
    'rf': {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7, 9, 11], 'n_jobs': [-1]},
    'svr': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    # NOTE(review): newer scikit-learn releases use `max_iter` instead of `n_iter`
    # for BayesianRidge — confirm against the installed version.
    'bayesian_lr': {'n_iter': [200, 300, 400], 'tol': [0.001, 0.01, 0.1]},
    'xgboost': {'learning_rate': [0.1, 0.01], 'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7, 9, 11], 'n_jobs': [-1]},
    'lightGBM': {'learning_rate': [0.1, 0.01], 'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7, 9, 11], 'n_jobs': [-1]},
    'catboost': {'learning_rate': [0.1, 0.01], 'n_estimators': [50, 100, 200]},
    'adaboost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.1, 0.01, 1]},
}
# Estimators to tune, with default settings.
models = {
    'lr_lasso': Lasso(),  # the comma after this entry was missing (SyntaxError)
    'lr_ridge': Ridge(),
    'elastic_net': ElasticNet(),
    'decision_trees': DecisionTreeRegressor(),
    'rf': RandomForestRegressor(),
    'svr': SVR(),
    'bayesian_lr': BayesianRidge(),
    'xgboost': xgb.XGBRegressor(),
    'lightGBM': lgb.LGBMRegressor(),
    'catboost': CatBoostRegressor(),
    'adaboost': AdaBoostRegressor()
}


# Tune every model on the training frame with a time-series split.
mv = ModelValidator(data=train, target_name='monthly_sales', param_grid=param_grid)
#mv.grid_search_with_date_block_num_k_folds(models, param_grid, n_splits=33)
mv.grid_search_with_time_series_split(models, param_grid)

The cell above produced the following output:

    Model: lr_lasso
    Best score:  8.17786178270079242
    Best parameters:  {'alpha': 0.1}
    
    Model: lr_ridge
    Best score:  8.17358088914778932
    Best parameters:  {'alpha': 1.0}
    
    Model: elastic_net
    Best score:  7.17784508532135762
    Best parameters:  {'alpha': 0.1, 'l1_ratio': 0.9}
    
    Model: decision_trees
    Best score:  0.31183776324220747
    Best parameters:  {'max_depth': 5}
    
    Model: rf
    Best score:  0.40362900835055154
    Best parameters:  {'max_depth': 9, 'n_estimators': 100, 'n_jobs': -1}
    
    Model: catboost
    Best score:  0.47858537853716376
    Best parameters:  {'learning_rate': 0.1, 'n_estimators': 200}
    
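There was clear overfitting here, but the search for suitable parameters is still useful. (Note: if the reported score is R², which is GridSearchCV's default for regressors and cannot exceed 1, the scores above 1 for the linear models must be wrong and should be re-checked.)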

The main work will be done with CatBoost (as we agreed).