In [1]:
# Dataset Reference: Alexis Cook, DanB, inversion, Ryan Holbrook. (2021). Store Sales - Time Series Forecasting. Kaggle. https://kaggle.com/competitions/store-sales-time-series-forecasting
import pandas as pd

# Read the CSV file into a DataFrame
store_main_df = pd.read_csv('train.csv')

# Let's take a look at the first few rows of the DataFrame to confirm it's loaded correctly
store_main_df

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [2]:
# Let's load each of the provided CSV files into their respective dataframes
holidays_events_df = pd.read_csv('holidays_events.csv')
oil_df = pd.read_csv('oil.csv')
stores_df = pd.read_csv('stores.csv')

# Checking the first few rows of each dataframe to understand their structure and contents
dfs_preview = {
    "holidays_events": holidays_events_df.head(),
    "oil": oil_df.head(),
    "stores": stores_df.head(),    
}

dfs_preview

{'holidays_events':          date     type    locale locale_name                    description  \
 0  2012-03-02  Holiday     Local       Manta             Fundacion de Manta   
 1  2012-04-01  Holiday  Regional    Cotopaxi  Provincializacion de Cotopaxi   
 2  2012-04-12  Holiday     Local      Cuenca            Fundacion de Cuenca   
 3  2012-04-14  Holiday     Local    Libertad      Cantonizacion de Libertad   
 4  2012-04-21  Holiday     Local    Riobamba      Cantonizacion de Riobamba   
 
    transferred  
 0        False  
 1        False  
 2        False  
 3        False  
 4        False  ,
 'oil':          date  dcoilwtico
 0  2013-01-01         NaN
 1  2013-01-02       93.14
 2  2013-01-03       92.97
 3  2013-01-04       93.12
 4  2013-01-07       93.20,
 'stores':    store_nbr           city                           state type  cluster
 0          1          Quito                       Pichincha    D       13
 1          2          Quito                       Pichincha

In [3]:
# Merging the mock store sales dataframe with the oil prices dataframe on 'date'
combined_df = pd.merge(store_main_df, oil_df, how='left', on='date')

# Merging the result with the holidays and events dataframe on 'date'
combined_df = pd.merge(combined_df, holidays_events_df, how='left', on='date', suffixes=('', '_holidays'))

# Merging the result with the stores dataframe on 'store_nbr'
combined_df = pd.merge(combined_df, stores_df, how='left', on='store_nbr', suffixes=('', '_stores'))

# Now let's check for null values in the combined dataframe
null_values_summary = combined_df.isnull().sum()

null_values_summary, combined_df.head()


(id                   0
 date                 0
 store_nbr            0
 family               0
 sales                0
 onpromotion          0
 dcoilwtico      955152
 type           2551824
 locale         2551824
 locale_name    2551824
 description    2551824
 transferred    2551824
 city                 0
 state                0
 type_stores          0
 cluster              0
 dtype: int64,
    id        date  store_nbr      family  sales  onpromotion  dcoilwtico  \
 0   0  2013-01-01          1  AUTOMOTIVE    0.0            0         NaN   
 1   1  2013-01-01          1   BABY CARE    0.0            0         NaN   
 2   2  2013-01-01          1      BEAUTY    0.0            0         NaN   
 3   3  2013-01-01          1   BEVERAGES    0.0            0         NaN   
 4   4  2013-01-01          1       BOOKS    0.0            0         NaN   
 
       type    locale locale_name         description transferred   city  \
 0  Holiday  National     Ecuador  Primer dia del ano       F

In [4]:
# Handling null values again
# Forward fill for oil prices, ensuring we fill NA values from the start of the series if needed
combined_df['dcoilwtico'].fillna(method='ffill', inplace=True)
combined_df['dcoilwtico'].fillna(method='bfill', inplace=True)  # Backward fill if forward fill didn't work

# Dropping the unnecessary columns as requested
columns_to_drop = ['type', 'locale', 'locale_name', 'description', 'transferred']
combined_df.drop(columns_to_drop, axis=1, inplace=True)

# Checking if all null values are handled and the index of the dataframe
null_values_resolved = combined_df.isnull().sum()
index_is_correct = combined_df.index.is_monotonic_increasing and combined_df.index.is_unique

null_values_resolved, index_is_correct, combined_df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['dcoilwtico'].fillna(method='ffill', inplace=True)
  combined_df['dcoilwtico'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['dcoilwtico'].fillna(method='bfill', inplace=True)  # Backward fill if forward fill didn't w

(id             0
 date           0
 store_nbr      0
 family         0
 sales          0
 onpromotion    0
 dcoilwtico     0
 city           0
 state          0
 type_stores    0
 cluster        0
 dtype: int64,
 True,
    id        date  store_nbr      family  sales  onpromotion  dcoilwtico  \
 0   0  2013-01-01          1  AUTOMOTIVE    0.0            0       93.14   
 1   1  2013-01-01          1   BABY CARE    0.0            0       93.14   
 2   2  2013-01-01          1      BEAUTY    0.0            0       93.14   
 3   3  2013-01-01          1   BEVERAGES    0.0            0       93.14   
 4   4  2013-01-01          1       BOOKS    0.0            0       93.14   
 
     city      state type_stores  cluster  
 0  Quito  Pichincha           D       13  
 1  Quito  Pichincha           D       13  
 2  Quito  Pichincha           D       13  
 3  Quito  Pichincha           D       13  
 4  Quito  Pichincha           D       13  )

In [5]:
combined_df.tail(100)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dcoilwtico,city,state,type_stores,cluster
3054248,3000788,2017-08-15,6,SEAFOOD,47.575,0,47.57,Quito,Pichincha,D,13
3054249,3000789,2017-08-15,7,AUTOMOTIVE,5.000,0,47.57,Quito,Pichincha,D,8
3054250,3000790,2017-08-15,7,BABY CARE,0.000,0,47.57,Quito,Pichincha,D,8
3054251,3000791,2017-08-15,7,BEAUTY,11.000,2,47.57,Quito,Pichincha,D,8
3054252,3000792,2017-08-15,7,BEVERAGES,3999.000,9,47.57,Quito,Pichincha,D,8
...,...,...,...,...,...,...,...,...,...,...,...
3054343,3000883,2017-08-15,9,POULTRY,438.133,0,47.57,Quito,Pichincha,B,6
3054344,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,47.57,Quito,Pichincha,B,6
3054345,3000885,2017-08-15,9,PRODUCE,2419.729,148,47.57,Quito,Pichincha,B,6
3054346,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,47.57,Quito,Pichincha,B,6


In [6]:
# Renaming the 'dcoilwtico' column to 'oilprice'
combined_df.rename(columns={'dcoilwtico': 'oilprice'}, inplace=True)

# Confirming the column is renamed
combined_df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,oilprice,city,state,type_stores,cluster
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,93.14,Quito,Pichincha,D,13
1,1,2013-01-01,1,BABY CARE,0.0,0,93.14,Quito,Pichincha,D,13
2,2,2013-01-01,1,BEAUTY,0.0,0,93.14,Quito,Pichincha,D,13
3,3,2013-01-01,1,BEVERAGES,0.0,0,93.14,Quito,Pichincha,D,13
4,4,2013-01-01,1,BOOKS,0.0,0,93.14,Quito,Pichincha,D,13


In [7]:
combined_df

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,oilprice,city,state,type_stores,cluster
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0,93.14,Quito,Pichincha,D,13
1,1,2013-01-01,1,BABY CARE,0.000,0,93.14,Quito,Pichincha,D,13
2,2,2013-01-01,1,BEAUTY,0.000,0,93.14,Quito,Pichincha,D,13
3,3,2013-01-01,1,BEVERAGES,0.000,0,93.14,Quito,Pichincha,D,13
4,4,2013-01-01,1,BOOKS,0.000,0,93.14,Quito,Pichincha,D,13
...,...,...,...,...,...,...,...,...,...,...,...
3054343,3000883,2017-08-15,9,POULTRY,438.133,0,47.57,Quito,Pichincha,B,6
3054344,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,47.57,Quito,Pichincha,B,6
3054345,3000885,2017-08-15,9,PRODUCE,2419.729,148,47.57,Quito,Pichincha,B,6
3054346,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,47.57,Quito,Pichincha,B,6


In [23]:
# Step 1: Create a DataFrame with unique combinations of date, city, state
weather_query_df = combined_df[['date', 'city', 'state']].drop_duplicates()
weather_query_df

Unnamed: 0,date,city,state
0,2013-01-01,Quito,Pichincha
66,2013-01-01,Cayambe,Pichincha
99,2013-01-01,Latacunga,Cotopaxi
165,2013-01-01,Riobamba,Chimborazo
198,2013-01-01,Ibarra,Imbabura
...,...,...,...
3053589,2017-08-15,Loja,Loja
3053688,2017-08-15,Machala,El Oro
3053787,2017-08-15,Esmeraldas,Esmeraldas
3054117,2017-08-15,Manta,Manabi


In [9]:
import requests

In [21]:
# Step 2: Define a function to make the API call for weather data

def get_weather_data(date, city, state, api_key):
    base_url = "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/forecast"
    params = {
        'locations': f"{city},{state},EC",  # EC for Ecuador
        'aggregateHours': '24',
        'unitGroup': 'metric',  # Use 'metric' or 'us' depending on the desired units
        'shortColumnNames': 'false',
        'contentType': 'json',  # Use 'json' for easier parsing
        'key': api_key
    }
    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        data = response.json()
        location_key = next(iter(data['locations']))  # Get the first location key
        day_data = data['locations'][location_key]['values'][0]
        
        # Include 'city' and 'state' in the returned dictionary, remove specific event filtering
        return {
            'date': date,  # Including the 'date' as well
            'city': city,  # Adding 'city'
            'state': state,  # Adding 'state'
            'temp': day_data['temp'],
            'precip': day_data['precip'],
            'windspeed': day_data['wspd'],
            'cloudcover': day_data['cloudcover'],
            'visibility': day_data['visibility'],
            'events': ','.join(event['description'] for event in day_data.get('events', []))  # All events
        }
    else:
        return {
            'date': date,
            'city': city,
            'state': state,
            'temp': None,
            'precip': None,
            'windspeed': None,
            'cloudcover': None,
            'visibility': None,
            'events': None
        }

In [22]:
# Step 3: Iterate over weather_query_df to make API calls and store results in a new DataFrame
api_key = 'UE39ZM2RD4V87RT2L6AGAJK3C'
weather_data_list = []

In [25]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.exceptions import RequestException
from time import sleep

# Function to safely make API request with error handling
def safe_get_weather_data(date, city, state, api_key):
    try:
        return get_weather_data(date, city, state, api_key)
    except RequestException as e:
        print(f"Request failed for {date}, {city}, {state}: {e}")
        return None

# Define the number of threads based on your system and API's rate limit
num_threads = 10  # Adjust this number based on your system's capability and API's rate limit

weather_data_list = []

# Using ThreadPoolExecutor to make requests in parallel
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    future_to_row = {executor.submit(safe_get_weather_data, row['date'], row['city'], row['state'], api_key): row for index, row in weather_query_df.iterrows()}

    for future in as_completed(future_to_row):
        row = future_to_row[future]
        try:
            weather_data = future.result()
            # Only append if the call was successful (weather_data is not None)
            if weather_data is not None:
                weather_data_list.append(weather_data)
        except Exception as exc:
            print(f"Generated an exception for {row['date']}, {row['city']}, {row['state']}: {exc}")

# Convert the list of dictionaries to a DataFrame
weather_data_df = pd.DataFrame(weather_data_list)

weather_data_df

Unnamed: 0,date,city,state,temp,precip,windspeed,cloudcover,visibility,events
0,2013-01-01,Cayambe,Pichincha,12.1,13.8,6.2,91.9,0.6,
1,2013-01-01,Ambato,Tungurahua,8.6,9.8,5.6,92.6,0.2,
2,2013-01-01,Quevedo,Los Rios,25.7,30.6,5.5,90.9,3.4,
3,2013-01-01,Guayaquil,Guayas,25.8,14.1,5.9,88.4,4.5,
4,2013-01-01,Daule,Guayas,25.7,23.3,4.4,90.0,3.6,
...,...,...,...,...,...,...,...,...,...
37043,2017-08-15,Loja,Loja,16.6,8.6,4.9,73.8,3.2,
37044,2017-08-15,Machala,El Oro,26.3,1.1,8.4,98.9,24.1,
37045,2017-08-15,Esmeraldas,Esmeraldas,26.1,4.7,7.0,95.7,6.6,
37046,2017-08-15,Manta,Manabi,25.4,0.2,7.9,93.0,24.1,


In [26]:
# Exporting the DataFrame to CSV
csv_file_path = 'weather_data.csv'  # Change to your desired path
weather_data_df.to_csv(csv_file_path, index=False)

In [None]:
# for index, row in weather_query_df.iterrows():
#     weather_data = get_weather_data(row['date'], row['city'], row['state'], api_key)
#     weather_data['date'] = row['date']
    
#     weather_data_list.append(weather_data)

# weather_data_df = pd.DataFrame(weather_data_list)

# # Exporting weather_data_df to a CSV file
# weather_data_df.to_csv(csv_file_path, index=False)

# weather_data_df

In [27]:
# Step 4: Merge the weather data back into the original combined_df

combined_df_with_weather = pd.merge(combined_df, weather_data_df, on=['date', 'city', 'state'], how='left')


In [31]:
# Drop the 'events' column from the DataFrame
combined_df_with_weather = combined_df_with_weather.drop('events', axis=1)

In [32]:
combined_df_with_weather

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,oilprice,city,state,type_stores,cluster,temp,precip,windspeed,cloudcover,visibility
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0,93.14,Quito,Pichincha,D,13,12.6,11.4,2.9,93.0,1.2
1,1,2013-01-01,1,BABY CARE,0.000,0,93.14,Quito,Pichincha,D,13,12.6,11.4,2.9,93.0,1.2
2,2,2013-01-01,1,BEAUTY,0.000,0,93.14,Quito,Pichincha,D,13,12.6,11.4,2.9,93.0,1.2
3,3,2013-01-01,1,BEVERAGES,0.000,0,93.14,Quito,Pichincha,D,13,12.6,11.4,2.9,93.0,1.2
4,4,2013-01-01,1,BOOKS,0.000,0,93.14,Quito,Pichincha,D,13,12.6,11.4,2.9,93.0,1.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3054343,3000883,2017-08-15,9,POULTRY,438.133,0,47.57,Quito,Pichincha,B,6,13.1,14.5,3.0,89.2,1.0
3054344,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,47.57,Quito,Pichincha,B,6,13.1,14.5,3.0,89.2,1.0
3054345,3000885,2017-08-15,9,PRODUCE,2419.729,148,47.57,Quito,Pichincha,B,6,13.1,14.5,3.0,89.2,1.0
3054346,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,47.57,Quito,Pichincha,B,6,13.1,14.5,3.0,89.2,1.0


In [33]:
# Exporting the DataFrame to CSV
combined_csv_file_path = 'store_and_weather_data.csv' 
combined_df_with_weather.to_csv(combined_csv_file_path, index=False)

In [34]:
stockprices = pd.read_excel('SLU-StockPrice.xlsx', usecols=['Date','Last Price','Volume'])

In [35]:
stockprices

Unnamed: 0,Date,Last Price,Volume
0,2024-03-28,1.590,145134
1,2024-03-27,1.580,39757
2,2024-03-26,1.580,81600
3,2024-03-25,1.570,11693
4,2024-03-22,1.570,56289
...,...,...,...
2866,2012-04-09,2.110,4339
2867,2012-04-04,2.125,9542
2868,2012-04-03,2.145,4317
2869,2012-04-02,2.151,92702


In [42]:
# Convert the 'date' column in combined_df_with_weather to datetime
combined_df_with_weather['date'] = pd.to_datetime(combined_df_with_weather['date'])

# Assuming stockprices is already loaded and contains the 'Date' column to be renamed to 'date'
# Convert the 'Date' column in stockprices to datetime and rename it
# stockprices.rename(columns={'Date': 'date'}, inplace=True)

stockprices['date'] = pd.to_datetime(stockprices['date'])


# Now, both 'date' columns should be datetime64[ns] type, and you can perform the merge
sales_weather_stock = pd.merge(combined_df_with_weather, stockprices, on='date', how='inner')
sales_weather_stock

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,oilprice,city,state,type_stores,cluster,temp,precip,windspeed,cloudcover,visibility,Last Price,Volume
0,3564,2013-01-03,1,AUTOMOTIVE,3.000,0,92.97,Quito,Pichincha,D,13,12.6,11.4,2.9,93.0,1.2,2.029,16345
1,3565,2013-01-03,1,BABY CARE,0.000,0,92.97,Quito,Pichincha,D,13,12.6,11.4,2.9,93.0,1.2,2.029,16345
2,3566,2013-01-03,1,BEAUTY,0.000,0,92.97,Quito,Pichincha,D,13,12.6,11.4,2.9,93.0,1.2,2.029,16345
3,3567,2013-01-03,1,BEVERAGES,919.000,0,92.97,Quito,Pichincha,D,13,12.6,11.4,2.9,93.0,1.2,2.029,16345
4,3568,2013-01-03,1,BOOKS,0.000,0,92.97,Quito,Pichincha,D,13,12.6,11.4,2.9,93.0,1.2,2.029,16345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017219,3000883,2017-08-15,9,POULTRY,438.133,0,47.57,Quito,Pichincha,B,6,13.1,14.5,3.0,89.2,1.0,1.665,29931
2017220,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,47.57,Quito,Pichincha,B,6,13.1,14.5,3.0,89.2,1.0,1.665,29931
2017221,3000885,2017-08-15,9,PRODUCE,2419.729,148,47.57,Quito,Pichincha,B,6,13.1,14.5,3.0,89.2,1.0,1.665,29931
2017222,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,47.57,Quito,Pichincha,B,6,13.1,14.5,3.0,89.2,1.0,1.665,29931


In [54]:
# Exporting the DataFrame to CSV
final_csv_file_path = 'sales_weather_stock.csv' 
sales_weather_stock.to_csv(final_csv_file_path, index=False)

In [43]:
# Get a list of columns for the sales_weather_stock DataFrame
list_of_columns = sales_weather_stock.columns.tolist()

# Now you can print or return this list
print(list_of_columns)

['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion', 'oilprice', 'city', 'state', 'type_stores', 'cluster', 'temp', 'precip', 'windspeed', 'cloudcover', 'visibility', 'Last Price', 'Volume']


In [45]:
pip install geopy

Defaulting to user installation because normal site-packages is not writeable
Collecting geopy
  Downloading geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Downloading geographiclib-2.0-py3-none-any.whl.metadata (1.4 kB)
Downloading geopy-2.4.1-py3-none-any.whl (125 kB)
   ---------------------------------------- 0.0/125.4 kB ? eta -:--:--
   --- ------------------------------------ 10.2/125.4 kB ? eta -:--:--
   ---------------------------------------  122.9/125.4 kB 2.4 MB/s eta 0:00:01
   ---------------------------------------- 125.4/125.4 kB 1.5 MB/s eta 0:00:00
Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
   ---------------------------------------- 0.0/40.3 kB ? eta -:--:--
   ---------------------------------------- 40.3/40.3 kB 2.0 MB/s eta 0:00:00
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.4.1
Note: you may need to restart the kernel to use updated packages.


In [49]:
pip install geopandas

Defaulting to user installation because normal site-packages is not writeable
Collecting geopandas
  Downloading geopandas-0.14.3-py3-none-any.whl.metadata (1.5 kB)
Collecting fiona>=1.8.21 (from geopandas)
  Downloading fiona-1.9.6-cp312-cp312-win_amd64.whl.metadata (51 kB)
     ---------------------------------------- 0.0/51.5 kB ? eta -:--:--
     ------- -------------------------------- 10.2/51.5 kB ? eta -:--:--
     -------------------------------------  51.2/51.5 kB 660.6 kB/s eta 0:00:01
     -------------------------------------- 51.5/51.5 kB 529.4 kB/s eta 0:00:00
Collecting click-plugins>=1.0 (from fiona>=1.8.21->geopandas)
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl.metadata (6.4 kB)
Collecting cligj>=0.5 (from fiona>=1.8.21->geopandas)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Downloading geopandas-0.14.3-py3-none-any.whl (1.1 MB)
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   ----------- ---------------------------- 0



In [51]:
pip install mgwr

Defaulting to user installation because normal site-packages is not writeable
Collecting mgwr
  Downloading mgwr-2.2.1-py3-none-any.whl.metadata (1.5 kB)
Collecting libpysal>=4.0.0 (from mgwr)
  Downloading libpysal-4.10-py3-none-any.whl.metadata (4.8 kB)
Collecting spglm>=1.0.6 (from mgwr)
  Downloading spglm-1.1.0-py3-none-any.whl.metadata (3.9 kB)
Collecting spreg (from mgwr)
  Downloading spreg-1.4.2-py3-none-any.whl.metadata (1.7 kB)
Collecting beautifulsoup4>=4.10 (from libpysal>=4.0.0->mgwr)
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4>=4.10->libpysal>=4.0.0->mgwr)
  Downloading soupsieve-2.5-py3-none-any.whl.metadata (4.7 kB)
Downloading mgwr-2.2.1-py3-none-any.whl (47 kB)
   ---------------------------------------- 0.0/47.9 kB ? eta -:--:--
   ---------------------------------------- 47.9/47.9 kB 1.2 MB/s eta 0:00:00
Downloading libpysal-4.10-py3-none-any.whl (2.8 MB)
   -----------------------------------

In [53]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Assuming 'sales_weather_stock' is your dataframe
# Select the relevant features and target variable
features = ['store_nbr', 'onpromotion', 'oilprice', 'temp', 'precip', 'windspeed', 'cloudcover', 'visibility', 'Last Price', 'Volume']
target = 'sales'

# Split the data into features (X) and target (y)
X = sales_weather_stock[features]
y = sales_weather_stock[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Get the feature importance
importance = rf.feature_importances_

# Create a DataFrame with feature names and their importance
feature_importance = pd.DataFrame({'Feature': features, 'Importance': importance})

# Sort the feature importance in descending order
feature_importance = feature_importance.sort_values('Importance', ascending=False)

# Print the feature importance
print(feature_importance)

KeyboardInterrupt: 