##### Load Packages

In [28]:
!pip install dtale

Collecting dtale
  Downloading dtale-3.16.1-py2.py3-none-any.whl.metadata (16 kB)
Collecting dash-colorscales (from dtale)
  Downloading dash_colorscales-0.0.4.tar.gz (62 kB)
     ---------------------------------------- 0.0/62.3 kB ? eta -:--:--
     ---------------------------------------- 0.0/62.3 kB ? eta -:--:--
     -------------------------- ------------- 41.0/62.3 kB 1.9 MB/s eta 0:00:01
     ---------------------------------------- 62.3/62.3 kB 1.1 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting dash-daq (from dtale)
  Downloading dash_daq-0.5.0.tar.gz (642 kB)
     ---------------------------------------- 0.0/642.7 kB ? eta -:--:--
     ------------- ------------------------ 225.3/642.7 kB 6.7 MB/s eta 0:00:01
     -------------------------------------  634.9/642.7 kB 8.0 MB/s eta 0:00:01
     -------------------------------------- 642.7/642.7 kB 6.7 MB/s eta 0:00:00
  Preparing metadata (setup.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 1. Data Loading
train = pd.read_csv('train_v9rqX0R.csv')
test = pd.read_csv('test_AbJTz2l.csv')
submission = pd.read_csv('sample_submission_8RXa3c6.csv')

In [3]:
# 2. Initial Data Exploration
print("Train data shape:", train.shape)
print("Test data shape:", test.shape)
print("\nTrain data info:")
print(train.info())

print("\nTest data info:")
print(test.info())

print("\nMissing values in train data:")
print(train.isnull().sum())

print("\nMissing values in test data:")
print(test.isnull().sum())

Train data shape: (8523, 12)
Test data shape: (5681, 11)

Train data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), obje

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [5]:
train.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [6]:
print(train['Item_Weight'].describe())
print(test['Item_Weight'].describe())

count    7060.000000
mean       12.857645
std         4.643456
min         4.555000
25%         8.773750
50%        12.600000
75%        16.850000
max        21.350000
Name: Item_Weight, dtype: float64
count    4705.000000
mean       12.695633
std         4.664849
min         4.555000
25%         8.645000
50%        12.500000
75%        16.700000
max        21.350000
Name: Item_Weight, dtype: float64


In [7]:
train['Item_Weight'].fillna(train['Item_Weight'].mean(),inplace=True)  #replacing null values with mean values
test['Item_Weight'].fillna(test['Item_Weight'].mean(),inplace=True)

In [8]:
train.isnull().sum()

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [9]:
print(train['Item_Weight'].describe())

count    8523.000000
mean       12.857645
std         4.226124
min         4.555000
25%         9.310000
50%        12.857645
75%        16.000000
max        21.350000
Name: Item_Weight, dtype: float64


In [10]:
train['Outlet_Size']  #it is a categorical value

0       Medium
1       Medium
2       Medium
3          NaN
4         High
         ...  
8518      High
8519       NaN
8520     Small
8521    Medium
8522     Small
Name: Outlet_Size, Length: 8523, dtype: object

In [11]:
train['Outlet_Size'].value_counts()

Outlet_Size
Medium    2793
Small     2388
High       932
Name: count, dtype: int64

In [12]:
train['Outlet_Size'].mode()

0    Medium
Name: Outlet_Size, dtype: object

In [13]:
train['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0],inplace=True)
test['Outlet_Size'].fillna(test['Outlet_Size'].mode()[0],inplace=True)

In [14]:
train.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [15]:
test.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64

In [16]:
# 3. Data Preprocessing and Feature Engineering
# Save identifiers for later
train_identifiers = train[['Item_Identifier', 'Outlet_Identifier']].copy()
test_identifiers = test[['Item_Identifier', 'Outlet_Identifier']].copy()

# Combine train and test for preprocessing
test['Item_Outlet_Sales'] = 0  # temporary column for test data
combined = pd.concat([train, test], axis=0)

In [17]:
# Feature Engineering
# 3.1 Extract item category from Item_Identifier
combined['Item_Category'] = combined['Item_Identifier'].str[:2]
combined['Item_Category'] = combined['Item_Category'].map({
    'FD': 'Food',
    'DR': 'Drinks',
    'NC': 'Non-Consumable'
})

In [18]:
# 3.2 Calculate outlet age
combined['Outlet_Years'] = 2013 - combined['Outlet_Establishment_Year']

In [19]:
# 3.3 Normalize Item_Fat_Content values
fat_content_map = {
    'Low Fat': 'Low Fat',
    'Regular': 'Regular',
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular'
}
combined['Item_Fat_Content'] = combined['Item_Fat_Content'].map(fat_content_map)

In [20]:
# 3.4 Non-consumables should not have fat content
combined.loc[combined['Item_Category'] == 'Non-Consumable', 'Item_Fat_Content'] = 'None'

In [21]:
# 3.5 Item_Visibility - log transform to handle skewness
combined['Item_Visibility_Log'] = np.log1p(combined['Item_Visibility'])

In [22]:
# 3.6 Item_MRP - create price segments
combined['Item_MRP_Segment'] = pd.qcut(combined['Item_MRP'], 4, labels=['Economy', 'Standard', 'Premium', 'Super Premium'])

In [23]:
# 3.7 Encoding Categorical Variables
# Use label encoding for ordinal features and one-hot for nominal
# Label Encoding for ordinal variables
label_encoders = {}
for col in ['Outlet_Size', 'Item_MRP_Segment']:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col].astype(str))
    label_encoders[col] = le

In [24]:
# 3.8 One-hot encoding for nominal variables
# We'll use one-hot encoding for selected columns
nominal_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Category']
combined = pd.get_dummies(combined, columns=nominal_cols, drop_first=True)

In [25]:
# 5. Split back into train and test
train_processed = combined[combined['Item_Outlet_Sales'] > 0].drop('Item_Identifier', axis=1)
test_processed = combined[combined['Item_Outlet_Sales'] == 0].drop('Item_Outlet_Sales', axis=1)
test_ids = test_processed['Item_Identifier']
test_processed = test_processed.drop('Item_Identifier', axis=1)

In [None]:
train_processed

In [29]:
import dtale

In [30]:
dtale.show(train_processed)



In [26]:
# 6. Model Training
# Extract features and target
X = train_processed.drop('Item_Outlet_Sales', axis=1)
y = train_processed['Item_Outlet_Sales']

# Split data for validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
rf_model = RandomForestRegressor(n_estimators=300, max_depth=10, min_samples_split=5, 
                                 min_samples_leaf=2, n_jobs=-1, random_state=42)
rf_model.fit(X_train, y_train)

ValueError: could not convert string to float: 'OUT049'

In [35]:
# 7. Model Training
# Extract features and target
X = train_processed.drop('Item_Outlet_Sales', axis=1)
y = train_processed['Item_Outlet_Sales']

print("\nFeatures used in model:")
print(X.columns.tolist())

# Split data for validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Check for any remaining non-numeric columns
print("\nData types in training data:")
print(X_train.dtypes)


Features used in model:
['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Size', 'Outlet_Years', 'Item_Visibility_Log', 'Item_MRP_Segment', 'Item_Fat_Content_1', 'Item_Fat_Content_2', 'Item_Type_1', 'Item_Type_2', 'Item_Type_3', 'Item_Type_4', 'Item_Type_5', 'Item_Type_6', 'Item_Type_7', 'Item_Type_8', 'Item_Type_9', 'Item_Type_10', 'Item_Type_11', 'Item_Type_12', 'Item_Type_13', 'Item_Type_14', 'Item_Type_15', 'Outlet_Location_Type_1', 'Outlet_Location_Type_2', 'Outlet_Type_1', 'Outlet_Type_2', 'Outlet_Type_3', 'Item_Category_1', 'Item_Category_2']

Data types in training data:
Item_Weight               float64
Item_Visibility           float64
Item_MRP                  float64
Outlet_Size                 int32
Outlet_Years                int64
Item_Visibility_Log       float64
Item_MRP_Segment            int32
Item_Fat_Content_1           bool
Item_Fat_Content_2           bool
Item_Type_1                  bool
Item_Type_2                  bool
Item_Type_3                  bool


In [36]:
# Initialize and train the model
rf_model = RandomForestRegressor(n_estimators=300, max_depth=10, min_samples_split=5, 
                                min_samples_leaf=2, n_jobs=-1, random_state=42)
rf_model.fit(X_train, y_train)

ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values