In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
df = pd.read_csv('data/long_data.csv')
df.head()

Unnamed: 0,States,Regions,latitude,longitude,Dates,Usage
0,Punjab,NR,31.519974,75.980003,02/01/2019 00:00:00,119.9
1,Haryana,NR,28.450006,77.019991,02/01/2019 00:00:00,130.3
2,Rajasthan,NR,26.449999,74.639981,02/01/2019 00:00:00,234.1
3,Delhi,NR,28.669993,77.230004,02/01/2019 00:00:00,85.8
4,UP,NR,27.599981,78.050006,02/01/2019 00:00:00,313.9


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16599 entries, 0 to 16598
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   States     16599 non-null  object 
 1   Regions    16599 non-null  object 
 2   latitude   16599 non-null  float64
 3   longitude  16599 non-null  float64
 4   Dates      16599 non-null  object 
 5   Usage      16599 non-null  float64
dtypes: float64(3), object(3)
memory usage: 778.2+ KB


In [4]:
# Convert dtype of 'Dates' column to datetime
df['Dates'] = pd.to_datetime(df['Dates'], format="%d/%m/%Y %H:%M:%S")

# Add day of the week column
df['DayOfWeek'] = df['Dates'].dt.dayofweek

# Add day of the week column
df['WeekOfMonth'] = (df['Dates'].dt.day - 1) // 7 + 1

# Add month column
df['Month'] = df['Dates'].dt.month
df.sample(10)

Unnamed: 0,States,Regions,latitude,longitude,Dates,Usage,DayOfWeek,WeekOfMonth,Month
12018,HP,NR,31.100025,77.166597,2020-01-13,19.7,0,2,1
3585,Bihar,ER,25.785414,87.479973,2019-04-23,69.0,1,4,4
15868,Manipur,NER,24.799971,93.950017,2020-08-03,2.6,0,1,8
3968,Chandigarh,NR,30.719997,76.780006,2019-05-06,5.3,0,1,5
14725,J&K,NR,33.45,76.24,2020-04-25,46.4,5,4,4
55,Jharkhand,ER,23.800393,86.419986,2019-01-03,25.6,3,1,1
9933,Punjab,NR,31.519974,75.980003,2019-11-02,82.0,5,1,11
5561,Karnataka,SR,12.570381,76.919997,2019-06-24,233.6,0,4,6
6885,Bihar,ER,25.785414,87.479973,2019-07-29,70.6,0,5,7
2106,Assam,NER,26.749981,94.216667,2019-03-08,36.8,4,2,3


In [5]:
population_density_dict = {
    "Punjab": 551, "Haryana": 573, "Rajasthan": 201, "Delhi": 11297, "UP": 828,
    "Uttarakhand": 189, "HP": 123, "J&K": 297, "Chandigarh": 350, "Chhattisgarh": 189,
    "Gujarat": 308, "MP": 236, "Maharashtra": 365, "Goa": 394, "DNH": 970,
    "Andhra Pradesh": 303, "Telangana": 312, "Karnataka": 319, "Kerala": 859,
    "Tamil Nadu": 555, "Pondy": 2598, "Bihar": 1106, "Jharkhand": 414, "Odisha": 269,
    "West Bengal": 1028, "Sikkim": 86, "Arunachal Pradesh": 17, "Assam": 398,
    "Manipur": 122, "Meghalaya": 132, "Mizoram": 52, "Nagaland": 119, "Tripura": 350
}

df['PopulationDensity'] = df['States'].map(population_density_dict)
df[['States', 'PopulationDensity']]

Unnamed: 0,States,PopulationDensity
0,Punjab,551
1,Haryana,573
2,Rajasthan,201
3,Delhi,11297
4,UP,828
...,...,...
16594,Manipur,122
16595,Meghalaya,132
16596,Mizoram,52
16597,Nagaland,119


In [6]:
df.drop('Regions', axis=1, inplace=True)
df.head()

Unnamed: 0,States,latitude,longitude,Dates,Usage,DayOfWeek,WeekOfMonth,Month,PopulationDensity
0,Punjab,31.519974,75.980003,2019-01-02,119.9,2,1,1,551
1,Haryana,28.450006,77.019991,2019-01-02,130.3,2,1,1,573
2,Rajasthan,26.449999,74.639981,2019-01-02,234.1,2,1,1,201
3,Delhi,28.669993,77.230004,2019-01-02,85.8,2,1,1,11297
4,UP,27.599981,78.050006,2019-01-02,313.9,2,1,1,828


In [7]:
df.describe()

Unnamed: 0,latitude,longitude,Dates,Usage,DayOfWeek,WeekOfMonth,Month,PopulationDensity
count,16599.0,16599.0,16599,16599.0,16599.0,16599.0,16599.0,16599.0
mean,23.17822,81.794533,2019-09-25 13:27:18.966202624,103.001862,2.992048,2.652087,5.910537,785.151515
min,8.900373,71.1924,2019-01-02 00:00:00,0.3,0.0,1.0,1.0,17.0
25%,19.82043,76.569993,2019-05-11 00:00:00,6.7,1.0,1.0,3.0,189.0
50%,23.835404,78.570026,2019-09-12 00:00:00,64.4,3.0,3.0,5.0,319.0
75%,27.33333,88.329947,2020-01-26 00:00:00,173.9,5.0,4.0,9.0,555.0
max,33.45,94.216667,2020-12-05 00:00:00,522.1,6.0,5.0,12.0,11297.0
std,6.146575,7.258429,,116.044056,1.987079,1.289324,3.461615,1916.984043


Dates: Captures a span from 2019-01-02 to 2020-12-05.

Usage:
- Min = 0.3, Max = 522.1 (high variance).
- Mean = 103.0, Std = 116.04 — indicates significant variability in consumption.

DayOfWeek: Cyclical (0 to 6)

WeekOfMonth: Cyclical (1 to 5)

Month: Cyclical (1 to 12)

PopulationDensity:
- Highly skewed (min = 17, max = 11,297).
- Needs normalization or log transformation.

Outliers:
The high PopulationDensity (max = 11,297) could impact the model.
Skewed Usage values might need scaling or transformation.

In [8]:
df.drop(['latitude', 'longitude'], axis=1, inplace=True)
df.head()

Unnamed: 0,States,Dates,Usage,DayOfWeek,WeekOfMonth,Month,PopulationDensity
0,Punjab,2019-01-02,119.9,2,1,1,551
1,Haryana,2019-01-02,130.3,2,1,1,573
2,Rajasthan,2019-01-02,234.1,2,1,1,201
3,Delhi,2019-01-02,85.8,2,1,1,11297
4,UP,2019-01-02,313.9,2,1,1,828


In [9]:
# Create lag features
for lag in range(1, 4):  # Lag 1 to 3
    df[f'lag_{lag}'] = df['Usage'].shift(lag)

df.head()

Unnamed: 0,States,Dates,Usage,DayOfWeek,WeekOfMonth,Month,PopulationDensity,lag_1,lag_2,lag_3
0,Punjab,2019-01-02,119.9,2,1,1,551,,,
1,Haryana,2019-01-02,130.3,2,1,1,573,119.9,,
2,Rajasthan,2019-01-02,234.1,2,1,1,201,130.3,119.9,
3,Delhi,2019-01-02,85.8,2,1,1,11297,234.1,130.3,119.9
4,UP,2019-01-02,313.9,2,1,1,828,85.8,234.1,130.3


In [10]:
nan_counts = df.isna().sum()

print("Number of NaN values in each column:")
print(nan_counts)

Number of NaN values in each column:
States               0
Dates                0
Usage                0
DayOfWeek            0
WeekOfMonth          0
Month                0
PopulationDensity    0
lag_1                1
lag_2                2
lag_3                3
dtype: int64


In [11]:
# Drop rows with NaN values (after lag creation)
df = df.dropna().reset_index(drop=True)

print(f"DataFrame shape after dropping NaNs: {df.shape}")

DataFrame shape after dropping NaNs: (16596, 10)


In [12]:
df.sample(5)

Unnamed: 0,States,Dates,Usage,DayOfWeek,WeekOfMonth,Month,PopulationDensity,lag_1,lag_2,lag_3
8245,Nagaland,2019-09-10,2.1,1,2,9,119,1.8,5.8,2.9
2496,West Bengal,2019-03-20,101.9,2,3,3,1028,67.4,21.8,70.2
7880,Meghalaya,2019-08-29,5.4,3,5,8,132,2.6,31.6,2.2
7934,Karnataka,2019-08-31,177.7,5,5,8,319,161.5,159.2,18.7
4126,UP,2019-05-11,244.4,5,2,5,828,84.7,197.9,143.0


In [13]:
df.describe()

Unnamed: 0,Dates,Usage,DayOfWeek,WeekOfMonth,Month,PopulationDensity,lag_1,lag_2,lag_3
count,16596,16596.0,16596.0,16596.0,16596.0,16596.0,16596.0,16596.0,16596.0
mean,2019-09-25 14:36:42.169197312,102.991299,2.992227,2.652386,5.911424,785.213606,103.005206,103.012931,103.020059
min,2019-01-02 00:00:00,0.3,0.0,1.0,1.0,17.0,0.3,0.3,0.3
25%,2019-05-11 00:00:00,6.7,1.0,1.0,3.0,189.0,6.7,6.7,6.7
50%,2019-09-12 00:00:00,64.35,3.0,3.0,5.0,319.0,64.4,64.4,64.45
75%,2020-01-26 00:00:00,173.9,5.0,4.0,9.0,555.0,173.9,173.9,173.9
max,2020-12-05 00:00:00,522.1,6.0,5.0,12.0,11297.0,522.1,522.1,522.1
std,,116.049814,1.987214,1.289249,3.461298,1917.150376,116.051696,116.049246,116.046649


- All numeric columns (Usage, DayOfWeek, WeekOfMonth, Month, PopulationDensity, lag_1, lag_2, lag_3) have counts equal to the total rows 16596
- The `Dates` column is non-numeric, so its NaN status needs to be explicitly checked.
- `lag_1`, `lag_2`, `lag_3` are consistent with the Usage column in terms of mean, min, max, and distribution.
- High variability in `PopulationDensity` column (mean = 785, std = 1917, max = 11297) - Needs to be normalized.
- Wide range in `Usage` column (0.3 to 522.1) - Needs to be normalized.

In [14]:
# Checking for NaNs in Dates column
print(df.isnull().sum())

States               0
Dates                0
Usage                0
DayOfWeek            0
WeekOfMonth          0
Month                0
PopulationDensity    0
lag_1                0
lag_2                0
lag_3                0
dtype: int64


In [15]:
print("Number of rows with negative usage values:", len(df[df['Usage'] < 0]))  # Negative usage values
print("Number of rows with 500+ usage values:", len(df[df['Usage'] > 500]))  # Extremely high usage values (above 99th percentile)

Number of rows with negative usage values: 0
Number of rows with 500+ usage values: 25


In [16]:
# Check for NaNs and validate date range
print(df['Dates'].isna().sum())
print(df['Dates'].min(), df['Dates'].max())

0
2019-01-02 00:00:00 2020-12-05 00:00:00


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16596 entries, 0 to 16595
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   States             16596 non-null  object        
 1   Dates              16596 non-null  datetime64[ns]
 2   Usage              16596 non-null  float64       
 3   DayOfWeek          16596 non-null  int32         
 4   WeekOfMonth        16596 non-null  int32         
 5   Month              16596 non-null  int32         
 6   PopulationDensity  16596 non-null  int64         
 7   lag_1              16596 non-null  float64       
 8   lag_2              16596 non-null  float64       
 9   lag_3              16596 non-null  float64       
dtypes: datetime64[ns](1), float64(4), int32(3), int64(1), object(1)
memory usage: 1.1+ MB


Non-null Count: All columns have 16596 non-null values.

Data Types:
- `States`: Object type (categorical)
- `Dates`: datetime64[ns]
- `Usage`, `lag_1`, `lag_2`, `lag_3`: float64 (appropriate for numerical features)
- `DayOfWeek`, `WeekOfMonth`, `Month`: int32 (ideal for categorical variables that are numeric)
- `PopulationDensity`: int64 (numeric and could be normalized)

## Normalization and One-hot Encoding

In [18]:
# Normalize numeric columns

from sklearn.preprocessing import MinMaxScaler

# Initialize the scaler
scaler = MinMaxScaler()

# Specify the columns to scale
cols_to_scale = ['PopulationDensity', 'lag_1', 'lag_2', 'lag_3']

# Fit and transform the data (i.e., normalize the selected columns)
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])

# Check the updated dataframe
print(df.head())

        States      Dates  Usage  DayOfWeek  WeekOfMonth  Month  \
0        Delhi 2019-01-02   85.8          2            1      1   
1           UP 2019-01-02  313.9          2            1      1   
2  Uttarakhand 2019-01-02   40.7          2            1      1   
3           HP 2019-01-02   30.0          2            1      1   
4          J&K 2019-01-02   52.5          2            1      1   

   PopulationDensity     lag_1     lag_2     lag_3  
0           1.000000  0.448064  0.249138  0.229207  
1           0.071897  0.163856  0.448064  0.249138  
2           0.015248  0.600997  0.163856  0.448064  
3           0.009397  0.077424  0.600997  0.163856  
4           0.024823  0.056918  0.077424  0.600997  


In [19]:
encoded_df = pd.get_dummies(df, columns=['States'], drop_first=True)

In [20]:
print(encoded_df.info())
print(encoded_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16596 entries, 0 to 16595
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Dates                     16596 non-null  datetime64[ns]
 1   Usage                     16596 non-null  float64       
 2   DayOfWeek                 16596 non-null  int32         
 3   WeekOfMonth               16596 non-null  int32         
 4   Month                     16596 non-null  int32         
 5   PopulationDensity         16596 non-null  float64       
 6   lag_1                     16596 non-null  float64       
 7   lag_2                     16596 non-null  float64       
 8   lag_3                     16596 non-null  float64       
 9   States_Arunachal Pradesh  16596 non-null  bool          
 10  States_Assam              16596 non-null  bool          
 11  States_Bihar              16596 non-null  bool          
 12  States_Chandigarh 

Numerical columns:
- `Usage`, `PopulationDensity`, `lag_1`, `lag_2`, `lag_3` are normalized between 0 and 1.

Categorical columns:
- `States` has been one-hot encoded into binary columns for each state (32 new columns).

In [21]:
df = encoded_df

In [22]:
df.head()

Unnamed: 0,Dates,Usage,DayOfWeek,WeekOfMonth,Month,PopulationDensity,lag_1,lag_2,lag_3,States_Arunachal Pradesh,...,States_Pondy,States_Punjab,States_Rajasthan,States_Sikkim,States_Tamil Nadu,States_Telangana,States_Tripura,States_UP,States_Uttarakhand,States_West Bengal
0,2019-01-02,85.8,2,1,1,1.0,0.448064,0.249138,0.229207,False,...,False,False,False,False,False,False,False,False,False,False
1,2019-01-02,313.9,2,1,1,0.071897,0.163856,0.448064,0.249138,False,...,False,False,False,False,False,False,False,True,False,False
2,2019-01-02,40.7,2,1,1,0.015248,0.600997,0.163856,0.448064,False,...,False,False,False,False,False,False,False,False,True,False
3,2019-01-02,30.0,2,1,1,0.009397,0.077424,0.600997,0.163856,False,...,False,False,False,False,False,False,False,False,False,False
4,2019-01-02,52.5,2,1,1,0.024823,0.056918,0.077424,0.600997,False,...,False,False,False,False,False,False,False,False,False,False


In [23]:
# Drop Dates Column (for LSTM model)
df = df.drop(columns=['Dates'])

In [24]:
# Check for any remaining missing values
missing_values = df.isnull().sum()
print(f"Missing values:\n{missing_values}")

# Confirm correct data types
print(df.dtypes)

Missing values:
Usage                       0
DayOfWeek                   0
WeekOfMonth                 0
Month                       0
PopulationDensity           0
lag_1                       0
lag_2                       0
lag_3                       0
States_Arunachal Pradesh    0
States_Assam                0
States_Bihar                0
States_Chandigarh           0
States_Chhattisgarh         0
States_DNH                  0
States_Delhi                0
States_Goa                  0
States_Gujarat              0
States_HP                   0
States_Haryana              0
States_J&K                  0
States_Jharkhand            0
States_Karnataka            0
States_Kerala               0
States_MP                   0
States_Maharashtra          0
States_Manipur              0
States_Meghalaya            0
States_Mizoram              0
States_Nagaland             0
States_Odisha               0
States_Pondy                0
States_Punjab               0
States_Rajasthan        

## Train Test Split

In [25]:
train_size = int(0.8 * len(df))  # 80% for training, 20% for testing

train_df = df.iloc[:train_size]
test_df = df.iloc[train_size:]

In [26]:
print(train_df.isnull().sum())  # Check for missing values in training data
print(test_df.isnull().sum())   # Check for missing values in testing data

Usage                       0
DayOfWeek                   0
WeekOfMonth                 0
Month                       0
PopulationDensity           0
lag_1                       0
lag_2                       0
lag_3                       0
States_Arunachal Pradesh    0
States_Assam                0
States_Bihar                0
States_Chandigarh           0
States_Chhattisgarh         0
States_DNH                  0
States_Delhi                0
States_Goa                  0
States_Gujarat              0
States_HP                   0
States_Haryana              0
States_J&K                  0
States_Jharkhand            0
States_Karnataka            0
States_Kerala               0
States_MP                   0
States_Maharashtra          0
States_Manipur              0
States_Meghalaya            0
States_Mizoram              0
States_Nagaland             0
States_Odisha               0
States_Pondy                0
States_Punjab               0
States_Rajasthan            0
States_Sik

In [27]:
print(train_df.dtypes)  # Check dtypes for training data
print(test_df.dtypes)   # Check dtypes for testing data

Usage                       float64
DayOfWeek                     int32
WeekOfMonth                   int32
Month                         int32
PopulationDensity           float64
lag_1                       float64
lag_2                       float64
lag_3                       float64
States_Arunachal Pradesh       bool
States_Assam                   bool
States_Bihar                   bool
States_Chandigarh              bool
States_Chhattisgarh            bool
States_DNH                     bool
States_Delhi                   bool
States_Goa                     bool
States_Gujarat                 bool
States_HP                      bool
States_Haryana                 bool
States_J&K                     bool
States_Jharkhand               bool
States_Karnataka               bool
States_Kerala                  bool
States_MP                      bool
States_Maharashtra             bool
States_Manipur                 bool
States_Meghalaya               bool
States_Mizoram              

In [28]:
print(train_df.describe())
print(test_df.describe())

              Usage     DayOfWeek   WeekOfMonth         Month  \
count  13276.000000  13276.000000  13276.000000  13276.000000   
mean     103.061555      2.985312      2.755348      5.987270   
std      116.042180      1.997968      1.246415      3.598104   
min        0.300000      0.000000      1.000000      1.000000   
25%        6.700000      1.000000      2.000000      3.000000   
50%       64.300000      3.000000      3.000000      6.000000   
75%      174.000000      5.000000      4.000000      9.000000   
max      522.100000      6.000000      5.000000     12.000000   

       PopulationDensity         lag_1         lag_2         lag_3  
count       13276.000000  13276.000000  13276.000000  13276.000000  
mean            0.068141      0.196899      0.196887      0.196850  
std             0.170079      0.222305      0.222297      0.222251  
min             0.000000      0.000000      0.000000      0.000000  
25%             0.015248      0.012265      0.012265      0.012265  


**Usage Distribution**
- Training Data: Usage values range from 0 to 1, with a mean of ~0.1969. The data appears to be uniformly distributed.
- Testing Data: Usage values also range from 0 to ~1, with a mean of ~0.1963, which is very close to the training data's distribution.

**DayOfWeek, WeekOfMonth, Month**
- Both the DayOfWeek, WeekOfMonth, and Month columns appear to have reasonable distributions and the same ranges across both training and testing datasets.

**PopulationDensity**
- Training Data: The PopulationDensity values range from 0 to 1, with a mean of ~0.0681 and a fairly wide standard deviation (0.17), which suggests some sparsely populated regions.
- Testing Data: The PopulationDensity values have similar characteristics to the training data, with a mean of ~0.0680.

**Lag Features (lag_1, lag_2, lag_3)**
- All three lag features have values ranging from 0 to 1 across both the training and testing datasets, with very similar distributions. This ensures the lags are consistently represented across both splits.

In [29]:
# Separate features (X) and target (y) for training and testing data
X_train = train_df.drop(columns=['Usage'])
y_train = train_df['Usage']
X_test = test_df.drop(columns=['Usage'])
y_test = test_df['Usage']

In [30]:
# Check shapes of training and testing data
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (13276, 39)
X_test shape: (3320, 39)
y_train shape: (13276,)
y_test shape: (3320,)


In [31]:
X_train.columns

Index(['DayOfWeek', 'WeekOfMonth', 'Month', 'PopulationDensity', 'lag_1',
       'lag_2', 'lag_3', 'States_Arunachal Pradesh', 'States_Assam',
       'States_Bihar', 'States_Chandigarh', 'States_Chhattisgarh',
       'States_DNH', 'States_Delhi', 'States_Goa', 'States_Gujarat',
       'States_HP', 'States_Haryana', 'States_J&K', 'States_Jharkhand',
       'States_Karnataka', 'States_Kerala', 'States_MP', 'States_Maharashtra',
       'States_Manipur', 'States_Meghalaya', 'States_Mizoram',
       'States_Nagaland', 'States_Odisha', 'States_Pondy', 'States_Punjab',
       'States_Rajasthan', 'States_Sikkim', 'States_Tamil Nadu',
       'States_Telangana', 'States_Tripura', 'States_UP', 'States_Uttarakhand',
       'States_West Bengal'],
      dtype='object')

# Building RandomForestRegressor Model

In [32]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Initialize the model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

Mean Absolute Error: 7.854741726907631
Mean Squared Error: 389.60710911600785


### Feature Importance

In [33]:
feature_importances = rf_model.feature_importances_
feature_names = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

                     Feature    Importance
3          PopulationDensity  2.410003e-01
23        States_Maharashtra  2.363534e-01
4                      lag_1  2.144452e-01
6                      lag_3  1.973684e-01
5                      lag_2  3.950238e-02
33         States_Tamil Nadu  2.153645e-02
30             States_Punjab  2.082437e-02
29              States_Pondy  5.102284e-03
18                States_J&K  3.338459e-03
36                 States_UP  3.087975e-03
2                      Month  3.060239e-03
14                States_Goa  2.750215e-03
38        States_West Bengal  2.653416e-03
19          States_Jharkhand  2.612758e-03
1                WeekOfMonth  1.895431e-03
13              States_Delhi  1.426719e-03
0                  DayOfWeek  1.198004e-03
15            States_Gujarat  6.989151e-04
37        States_Uttarakhand  3.450495e-04
12                States_DNH  1.705250e-04
16                 States_HP  1.588536e-04
21             States_Kerala  1.272246e-04
11       St

In [34]:
import joblib

# Save the model
joblib.dump(rf_model, 'artifacts/random_forest_model.pkl')

# Save the scaler
joblib.dump(scaler, 'artifacts/scaler.pkl')

# Save the df
X_train.to_csv('data/X_train.csv')
X_test.to_csv('data/X_test.csv')
y_train.to_csv('data/y_train.csv')
y_test.to_csv('data/y_test.csv')

print("Model, dataframes, and scaler saved successfully.")

Model, dataframes, and scaler saved successfully.


In [35]:
X_train.columns

Index(['DayOfWeek', 'WeekOfMonth', 'Month', 'PopulationDensity', 'lag_1',
       'lag_2', 'lag_3', 'States_Arunachal Pradesh', 'States_Assam',
       'States_Bihar', 'States_Chandigarh', 'States_Chhattisgarh',
       'States_DNH', 'States_Delhi', 'States_Goa', 'States_Gujarat',
       'States_HP', 'States_Haryana', 'States_J&K', 'States_Jharkhand',
       'States_Karnataka', 'States_Kerala', 'States_MP', 'States_Maharashtra',
       'States_Manipur', 'States_Meghalaya', 'States_Mizoram',
       'States_Nagaland', 'States_Odisha', 'States_Pondy', 'States_Punjab',
       'States_Rajasthan', 'States_Sikkim', 'States_Tamil Nadu',
       'States_Telangana', 'States_Tripura', 'States_UP', 'States_Uttarakhand',
       'States_West Bengal'],
      dtype='object')

In [59]:
import pickle

# Save the model using pickle
with open('random_forest_model_v2.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

In [65]:
with open('scaler_v2.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)