In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# Widen the DataFrame display limits for this notebook (each option is set once).
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 500)


In [None]:
df = pd.read_csv('../input/big-mart-sales-prediction/Train.csv')

In [None]:
df.head(100)

In [None]:
df.shape

In [None]:
# number of distinct item identifiers (a missing value, if present, counts as one)
df['Item_Identifier'].nunique(dropna=False)

In [None]:
# number of rows recorded for each outlet type
df['Outlet_Type'].value_counts()

In [None]:
# Correlate numeric columns only: the frame also holds string columns, which
# make a bare df.corr() raise on recent pandas versions.
df.select_dtypes(include='number').corr()

In [None]:
# outlet and sales figure for every row of item 'DRC01'
df.loc[df['Item_Identifier'] == 'DRC01',
       ['Outlet_Identifier', 'Item_Outlet_Sales']].reset_index()

In [None]:
df.isnull().sum()

In [None]:
df.Item_Fat_Content.unique()

In [None]:
# fill missing values: column mean for the weight, most frequent value for the outlet size.
df = df.fillna({
    'Item_Weight': df['Item_Weight'].mean(),
    'Outlet_Size': df['Outlet_Size'].mode()[0],
})

In [None]:
df.isnull().sum()

In [None]:
# replace the establishment year by the outlet's age in years, counted from 2020.
df = df.rename(columns={'Outlet_Establishment_Year': 'No_Of_Years'})
df['No_Of_Years'] = 2020 - df['No_Of_Years']

In [None]:
df.head(10)

In [None]:
df['No_Of_Years'].corr(df['Item_Outlet_Sales'])

In [None]:
df.shape

In [None]:
df.columns

In [None]:
test_df = pd.read_csv('../input/big-mart-sales-prediction/Test.csv')

In [None]:
test_df.head()

In [None]:
test_df.shape

In [None]:
test_df.isnull().sum()

In [None]:
# fill missing values: column mean for the weight, most frequent value for the outlet size.
test_df = test_df.fillna({
    'Item_Weight': test_df['Item_Weight'].mean(),
    'Outlet_Size': test_df['Outlet_Size'].mode()[0],
})

In [None]:
test_df.isnull().sum()

In [None]:
# replace the establishment year by the outlet's age in years, counted from 2020.
test_df = test_df.rename(columns={'Outlet_Establishment_Year': 'No_Of_Years'})
test_df['No_Of_Years'] = 2020 - test_df['No_Of_Years']

In [None]:
# Stack the train rows on top of the test rows so that the encoding and scaling
# steps further down are applied to both at once.
# NOTE(review): ignore_index is not passed, so each frame keeps its own row labels
# and the combined index is presumably non-unique — confirm before any label-based lookup.
combined_df = pd.concat([df, test_df], axis=0, sort=False)

In [None]:
combined_df.head()

In [None]:
# Columns that get one-hot encoded. 'No_Of_Years' is a number (2020 minus the
# establishment year) but is listed here as well, so it is encoded per distinct value.
categorical_columns = ['Item_Fat_Content',
       'Item_Type', 'Outlet_Identifier',
       'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'No_Of_Years']

def category_onehot_multcols(multcolumns, data=None):
    """One-hot encode several columns of a frame and return the widened copy.

    Every column named in ``multcolumns`` is replaced by its indicator columns
    (first level dropped). The indicator columns are appended after the
    remaining original columns, in the order the source columns were listed.

    Parameters
    ----------
    multcolumns : iterable
        Labels of the columns to encode. May be empty, in which case a copy of
        the frame is returned with no columns replaced.
    data : pandas.DataFrame, optional
        Frame to encode. When omitted, the notebook-level ``combined_df`` is used.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified. All column labels are
        strings.
    """
    frame = combined_df if data is None else data
    fields = list(multcolumns)

    # Build every indicator block first and concatenate once, instead of
    # growing the result inside the loop.
    indicator_blocks = [pd.get_dummies(frame[field], drop_first=True) for field in fields]
    encoded = pd.concat([frame.drop(columns=fields)] + indicator_blocks, axis=1)

    # Indicator labels are the raw category values, so a numeric source column
    # yields integer labels. Mixed int/str labels are rejected by recent
    # scikit-learn estimators, hence the uniform cast to strings.
    encoded.columns = encoded.columns.astype(str)
    return encoded

combined_df = category_onehot_multcols(categorical_columns)

In [None]:
combined_df.shape


In [None]:
combined_df.head()

In [None]:
combined_df.drop(['Item_Identifier'], axis=1, inplace=True)

In [None]:
combined_df.head()

In [None]:
scaler = MinMaxScaler()

In [None]:
Item_Outlet_Sales_df = combined_df['Item_Outlet_Sales']

In [None]:
combined_df.drop(['Item_Outlet_Sales'], axis=1, inplace=True)

In [None]:
columns = combined_df.columns

In [None]:
columns

In [None]:
combined_df.head()

In [None]:
combined_df = scaler.fit_transform(combined_df)

In [None]:
combined_df

In [None]:
combined_df = pd.DataFrame(combined_df,columns=columns)

In [None]:
combined_df.head()

In [None]:
# divide train and test data
# The training rows were concatenated first, so they occupy the first len(df)
# positions; derive the split point from the training frame instead of
# hard-coding its row count.
n_train_rows = len(df)
df_train = combined_df.iloc[:n_train_rows, :]
df_test = combined_df.iloc[n_train_rows:, :]

In [None]:
df_train.shape

In [None]:
df_test.shape

In [None]:
X = df_train
# Target values for the training rows only. The slice is explicitly positional
# and sized from X, so features and target always have the same length.
y = Item_Outlet_Sales_df.iloc[:len(X)]

In [None]:
X_test = df_test


In [None]:
X.shape

In [None]:
y.shape

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [None]:
# defining a function which calculates details of each algorithm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


def model_details(model, alg_name, X_eval=None, y_eval=None):
    """Print a short evaluation report for a fitted regressor.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``. May be a hyperparameter-search wrapper with a
        ``best_estimator_`` attribute.
    alg_name : str
        Label used in the printed report.
    X_eval, y_eval : optional
        Evaluation features and target. When omitted, the notebook-level
        ``X_val`` and ``y_val`` are used.

    Notes
    -----
    * The "Accuracy" line is the R^2 score on the evaluation data, in percent.
    * The cross-validation line is the mean and standard deviation of the
      5-fold R^2 scores on the evaluation data, both in percent.
    * Nothing is returned; the report is printed.
    """
    from sklearn.metrics import r2_score

    features = X_val if X_eval is None else X_eval
    target = y_val if y_eval is None else y_eval

    y_pred = model.predict(features)
    rmse = np.sqrt(mean_squared_error(target, y_pred))

    # Compute R^2 explicitly: `score` on a search wrapper returns the search's
    # own `scoring` metric (e.g. negative MSE), which is not an R^2 value.
    acc = round(r2_score(target, y_pred) * 100, 2)

    # Cross-validate the tuned estimator itself. Passing a search wrapper would
    # re-run the entire hyperparameter search inside every fold.
    estimator = getattr(model, 'best_estimator_', model)
    cvs = cross_val_score(estimator, features, target, cv=5, scoring='r2')

    # Mean and spread are reported on the same percent scale.
    mean = round(cvs.mean() * 100, 2)
    std = round(cvs.std() * 100, 2)

    print('Model Report')
    print('Accuracy of {}: {}%'.format(alg_name, acc))
    print('RMSE Value: ', round(rmse, 2))
    print('Cross Validation Score: Mean - {} | Std - {}'.format(mean, std))
    

# XGBOOST IMPLEMENTATION !

In [None]:
!pip install xgboost
import xgboost
regressor = xgboost.XGBRegressor()

In [None]:
# search space for the XGBoost random search: one list of candidates per hyperparameter
hyperparameter_grid = dict(
    n_estimators=[100, 500, 900, 1100, 1500],
    max_depth=[2, 3, 5, 10, 15],
    learning_rate=[0.05, 0.1, 0.15, 0.20],
    min_child_weight=[1, 2, 3, 4],
    booster=['gbtree', 'gblinear'],
    base_score=[0.25, 0.5, 0.75, 1],
)

# keep every candidate list reachable under its own name as well
n_estimators = hyperparameter_grid['n_estimators']
max_depth = hyperparameter_grid['max_depth']
booster = hyperparameter_grid['booster']
learning_rate = hyperparameter_grid['learning_rate']
min_child_weight = hyperparameter_grid['min_child_weight']
base_score = hyperparameter_grid['base_score']

In [None]:
# set up the random search with 5-fold cross validation
# (50 sampled parameter settings, ranked by negative mean absolute error)
from sklearn.model_selection import RandomizedSearchCV
random_cv = RandomizedSearchCV(estimator=regressor,
                              param_distributions=hyperparameter_grid,
                              cv=5, n_iter=50,
                              scoring = 'neg_mean_absolute_error', n_jobs=4,
                              verbose=5,
                              return_train_score = True,
                              random_state  = 42)

In [None]:
#random_cv.fit(X_train, y_train)

In [None]:
#random_cv.best_estimator_

In [None]:
# XGBoost regressor with every setting spelled out: tree booster, 100 trees of
# depth 2, learning rate 0.1.
# NOTE(review): presumably pasted from an earlier `random_cv.best_estimator_`
# output, since the search cells above are commented out — confirm.
regressor = xgboost.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=2,
             min_child_weight=1, missing=None, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
regressor.fit(X_train, y_train)

In [None]:
model_details(regressor, 'XGBoost')

In [None]:
X_test.shape

In [None]:
y_pred = regressor.predict(X_test)
y_pred

In [None]:
x=list(y_pred)
x[30]

# RANDOM FOREST IMPLEMENTATION !

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf_random = RandomForestRegressor()

In [None]:
### hyperparameters

# number of trees in the random forest: 100, 200, ..., 1200
# (integer range, so no float-to-int truncation is involved)
n_estimators = list(range(100, 1201, 100))

# number of features to consider at every split
# 1.0 means "all features", which is what the former 'auto' alias meant for a
# forest regressor; 'auto' itself is no longer accepted by recent scikit-learn.
max_features = [1.0, 'sqrt']

# maximum number of levels in tree: 5, 10, ..., 30
max_depth = list(range(5, 31, 5))

# minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]

# minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [None]:
# search space handed to the random-forest random search
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [None]:
rf = RandomForestRegressor()

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Random search over `random_grid` for the forest `rf`: 10 sampled settings,
# each scored by negative mean squared error with 5-fold cross validation.
# Runs on a single worker (n_jobs = 1); random_state fixes the sampled settings.
rf_random = RandomizedSearchCV(
        estimator = rf, 
        param_distributions = random_grid,
        scoring = 'neg_mean_squared_error',
        n_iter = 10,
        cv = 5,
        verbose = 2,
        random_state = 42,
        n_jobs = 1
)

In [None]:
rf_random.fit(X_train, y_train)

In [None]:
model_details(rf_random, 'Random Forest')

In [None]:
X_test.shape

In [None]:
y_pred = rf_random.predict(X_test)
y_pred

# Writing To The Submission File !

In [None]:
sub = pd.read_csv('../input/big-mart-sales-prediction/Submission.csv')

# Fail loudly on a row-count mismatch. Assigning a DataFrame to a column aligns
# on the index, which would pad or drop rows silently instead of raising.
if len(sub) != len(y_pred):
    raise ValueError(
        'Submission template has {} rows but {} predictions were produced'.format(
            len(sub), len(y_pred)))

pred = pd.DataFrame({'Item_Outlet_Sales': y_pred})

# Positional assignment: row i of the template receives prediction i.
sub['Item_Outlet_Sales'] = pred['Item_Outlet_Sales'].to_numpy()
sub.to_csv('submission_rf.csv', index=False)


In [None]:
sub[sub['Item_Outlet_Sales'] < 0]

In [None]:
sub