In [1]:
import pandas as pd

# Load the datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
stores = pd.read_csv('stores.csv')
oil = pd.read_csv('oil.csv')
holidays_events = pd.read_csv('holidays_events.csv')

In [2]:
stores

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4
5,6,Quito,Pichincha,D,13
6,7,Quito,Pichincha,D,8
7,8,Quito,Pichincha,D,8
8,9,Quito,Pichincha,B,6
9,10,Quito,Pichincha,C,15


In [3]:
oil

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.20
...,...,...
1213,2017-08-25,47.65
1214,2017-08-28,46.40
1215,2017-08-29,46.46
1216,2017-08-30,45.96


In [4]:
holidays_events

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False
...,...,...,...,...,...,...
345,2017-12-22,Additional,National,Ecuador,Navidad-3,False
346,2017-12-23,Additional,National,Ecuador,Navidad-2,False
347,2017-12-24,Additional,National,Ecuador,Navidad-1,False
348,2017-12-25,Holiday,National,Ecuador,Navidad,False


In [5]:
print(stores.head())

   store_nbr           city                           state type  cluster
0          1          Quito                       Pichincha    D       13
1          2          Quito                       Pichincha    D       13
2          3          Quito                       Pichincha    D        8
3          4          Quito                       Pichincha    D        9
4          5  Santo Domingo  Santo Domingo de los Tsachilas    D        4


In [11]:
## Check for missing values

In [9]:
print(stores.isnull().sum())

store_nbr    0
city         0
state        0
type         0
cluster      0
dtype: int64


In [6]:
print(stores.tail())

    store_nbr       city       state type  cluster
49         50     Ambato  Tungurahua    A       14
50         51  Guayaquil      Guayas    A       17
51         52      Manta      Manabi    A       11
52         53      Manta      Manabi    D       13
53         54  El Carmen      Manabi    C        3


In [18]:
print(oil.isnull().sum())

date           0
dcoilwtico    43
dtype: int64


In [10]:
print(holidays_events.isnull().sum())

date           0
type           0
locale         0
locale_name    0
description    0
transferred    0
dtype: int64


In [14]:
# Check for missing values
print(train.isnull().sum())
print(test.isnull().sum())

id             0
date           0
store_nbr      0
family         0
sales          0
onpromotion    0
dtype: int64
id             0
date           0
store_nbr      0
family         0
onpromotion    0
dtype: int64


In [12]:
# Convert date columns to datetime format
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])
oil['date'] = pd.to_datetime(oil['date'])
holidays_events['date'] = pd.to_datetime(holidays_events['date'])


# Handle missing values (if any).

In [26]:
# Fill missing oil prices using forward fill
oil['dcoilwtico'] = oil['dcoilwtico'].fillna(method='ffill')

# Check if it worked by displaying the first few rows of the oil DataFrame
print(oil.head())

        date  dcoilwtico
0 2013-01-01         NaN
1 2013-01-02       93.14
2 2013-01-03       92.97
3 2013-01-04       93.12
4 2013-01-07       93.20


In [25]:
# Check the columns in the oil DataFrame
print(oil.columns)

Index(['date', 'dcoilwtico'], dtype='object')


### Merge external data sources (oil and holidays) with the main dataset.

In [28]:
# Merge oil prices with train and test datasets based on the 'date'
train = pd.merge(train, oil[['date', 'dcoilwtico']], on='date', how='left')
test = pd.merge(test, oil[['date', 'dcoilwtico']], on='date', how='left')

In [29]:
# Display the first few rows of the merged train and test data to verify the oil price column is added
print(train.head())
print(test.head())

   id       date  store_nbr      family  sales  onpromotion  dcoilwtico
0   0 2013-01-01          1  AUTOMOTIVE    0.0            0         NaN
1   1 2013-01-01          1   BABY CARE    0.0            0         NaN
2   2 2013-01-01          1      BEAUTY    0.0            0         NaN
3   3 2013-01-01          1   BEVERAGES    0.0            0         NaN
4   4 2013-01-01          1       BOOKS    0.0            0         NaN
        id       date  store_nbr      family  onpromotion  dcoilwtico
0  3000888 2017-08-16          1  AUTOMOTIVE            0        46.8
1  3000889 2017-08-16          1   BABY CARE            0        46.8
2  3000890 2017-08-16          1      BEAUTY            2        46.8
3  3000891 2017-08-16          1   BEVERAGES           20        46.8
4  3000892 2017-08-16          1       BOOKS            0        46.8


In [30]:
# Forward fill missing oil prices
oil['dcoilwtico'] = oil['dcoilwtico'].fillna(method='ffill')

# Verify the first few rows after forward filling
print(oil.head())

        date  dcoilwtico
0 2013-01-01         NaN
1 2013-01-02       93.14
2 2013-01-03       92.97
3 2013-01-04       93.12
4 2013-01-07       93.20


In [31]:
# Merge the oil price data with the train and test datasets based on 'date'
train = pd.merge(train, oil[['date', 'dcoilwtico']], on='date', how='left')
test = pd.merge(test, oil[['date', 'dcoilwtico']], on='date', how='left')

# Verify the merged train and test datasets
print(train.head())
print(test.head())

   id       date  store_nbr      family  sales  onpromotion  dcoilwtico_x  \
0   0 2013-01-01          1  AUTOMOTIVE    0.0            0           NaN   
1   1 2013-01-01          1   BABY CARE    0.0            0           NaN   
2   2 2013-01-01          1      BEAUTY    0.0            0           NaN   
3   3 2013-01-01          1   BEVERAGES    0.0            0           NaN   
4   4 2013-01-01          1       BOOKS    0.0            0           NaN   

   dcoilwtico_y  
0           NaN  
1           NaN  
2           NaN  
3           NaN  
4           NaN  
        id       date  store_nbr      family  onpromotion  dcoilwtico_x  \
0  3000888 2017-08-16          1  AUTOMOTIVE            0          46.8   
1  3000889 2017-08-16          1   BABY CARE            0          46.8   
2  3000890 2017-08-16          1      BEAUTY            2          46.8   
3  3000891 2017-08-16          1   BEVERAGES           20          46.8   
4  3000892 2017-08-16          1       BOOKS          

In [33]:
# Check the columns in the train and test datasets after merging
print(train.columns)
print(test.columns)

Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion',
       'dcoilwtico_x', 'dcoilwtico_y'],
      dtype='object')
Index(['id', 'date', 'store_nbr', 'family', 'onpromotion', 'dcoilwtico_x',
       'dcoilwtico_y'],
      dtype='object')


In [34]:
# Rename 'dcoilwtico_y' to 'dcoilwtico' and drop 'dcoilwtico_x'
train = train.rename(columns={'dcoilwtico_y': 'dcoilwtico'}).drop(columns=['dcoilwtico_x'])
test = test.rename(columns={'dcoilwtico_y': 'dcoilwtico'}).drop(columns=['dcoilwtico_x'])

# Verify the columns now
print(train.columns)
print(test.columns)

# Check the first few rows to confirm
print(train[['date', 'store_nbr', 'family', 'sales', 'dcoilwtico']].head())
print(test[['date', 'store_nbr', 'family', 'dcoilwtico']].head())

Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion',
       'dcoilwtico'],
      dtype='object')
Index(['id', 'date', 'store_nbr', 'family', 'onpromotion', 'dcoilwtico'], dtype='object')
        date  store_nbr      family  sales  dcoilwtico
0 2013-01-01          1  AUTOMOTIVE    0.0         NaN
1 2013-01-01          1   BABY CARE    0.0         NaN
2 2013-01-01          1      BEAUTY    0.0         NaN
3 2013-01-01          1   BEVERAGES    0.0         NaN
4 2013-01-01          1       BOOKS    0.0         NaN
        date  store_nbr      family  dcoilwtico
0 2017-08-16          1  AUTOMOTIVE        46.8
1 2017-08-16          1   BABY CARE        46.8
2 2017-08-16          1      BEAUTY        46.8
3 2017-08-16          1   BEVERAGES        46.8
4 2017-08-16          1       BOOKS        46.8


In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
# --- Day 1: Data Processing and Feature Engineering ---

# 1. Data Loading and Initial Exploration
train = pd.read_csv('train.csv', parse_dates=['date'])
test = pd.read_csv('test.csv', parse_dates=['date'])
stores = pd.read_csv('stores.csv')
oil = pd.read_csv('oil.csv', parse_dates=['date'])
holidays = pd.read_csv('holidays_events.csv', parse_dates=['date'])


In [78]:
# 2. Data Cleaning and Transformation
oil['dcoilwtico'] = oil['dcoilwtico'].fillna(method='ffill')

holidays['transferred'] = holidays['transferred'].fillna(False)
holidays['locale'] = holidays['locale'].fillna('National')
holidays['locale_name'] = holidays['locale_name'].fillna('Ecuador')

train = pd.merge(train, stores, on='store_nbr', how='left')
test = pd.merge(test, stores, on='store_nbr', how='left')

train = pd.merge(train, oil, on='date', how='left')
test = pd.merge(test, oil, on='date', how='left')

train = pd.merge(train, holidays, on='date', how='left', suffixes=('', '_holiday'))
test = pd.merge(test, holidays, on='date', how='left', suffixes=('', '_holiday'))



def feature_engineering(df):
    df['dayofweek'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.isocalendar().week.astype(int)

    df['onpromotion'] = df['onpromotion'].astype(int)
    df['dcoilwtico'] = df['dcoilwtico'].fillna(method='ffill')

    df['holiday'] = (~df['type_holiday'].isnull()).astype(int)

    for col in ['type_holiday', 'locale', 'locale_name', 'type']:
        df[col] = df[col].fillna('None')

    for col in ['type_holiday', 'locale', 'locale_name', 'type', 'family', 'city', 'state', 'cluster']:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

    for i in range(1, 8):
        df[f'sales_lag_{i}'] = df.groupby(['store_nbr', 'family'])['sales'].shift(i)
        df[f'oil_lag_{i}'] = df['dcoilwtico'].shift(i)

    df['sales_rolling_mean_7'] = df.groupby(['store_nbr', 'family'])['sales'].transform(lambda x: x.rolling(window=7).mean())
    df['oil_rolling_mean_7'] = df['dcoilwtico'].rolling(window=7).mean()
    df = df.fillna(0) # handle remaining NaNs from lag/rolling features.

    return df

train = feature_engineering(train)
test = feature_engineering(test)


In [41]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

def feature_engineering(df, is_train=True): #added is_train parameter
    df['dayofweek'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.isocalendar().week.astype(int)

    df['onpromotion'] = df['onpromotion'].astype(int)
    df['dcoilwtico'] = df['dcoilwtico'].fillna(method='ffill')

    df['holiday'] = (~df['type_holiday'].isnull()).astype(int)

    for col in ['type_holiday', 'locale', 'locale_name', 'type']:
        df[col] = df[col].fillna('None')

    for col in ['type_holiday', 'locale', 'locale_name', 'type', 'family', 'city', 'state', 'cluster']:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

    for i in range(1, 8):
        df[f'oil_lag_{i}'] = df['dcoilwtico'].shift(i)

    df['oil_rolling_mean_7'] = df['dcoilwtico'].rolling(window=7).mean()

    if is_train: #add conditional logic based on is_train parameter
        for i in range(1, 8):
            df[f'sales_lag_{i}'] = df.groupby(['store_nbr', 'family'])['sales'].shift(i)

        df['sales_rolling_mean_7'] = df.groupby(['store_nbr', 'family'])['sales'].transform(lambda x: x.rolling(window=7).mean())

    df = df.fillna(0) # handle remaining NaNs from lag/rolling features.

    return df

train = feature_engineering(train)
test = feature_engineering(test, is_train=False) #pass is_train=False for test dataframe.


In [42]:
train

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,...,sales_lag_4,oil_lag_4,sales_lag_5,oil_lag_5,sales_lag_6,oil_lag_6,sales_lag_7,oil_lag_7,sales_rolling_mean_7,oil_rolling_mean_7
0,0,2013-01-01,1,0,0.000,0,12,2,3,10,...,0.000,0.00,0.00000,0.00,0.000,0.00,0.000,0.00,0.000000,0.00
1,1,2013-01-01,1,1,0.000,0,12,2,3,10,...,0.000,0.00,0.00000,0.00,0.000,0.00,0.000,0.00,0.000000,0.00
2,2,2013-01-01,1,27,0.000,0,12,2,3,10,...,0.000,0.00,0.00000,0.00,0.000,0.00,0.000,0.00,0.000000,0.00
3,3,2013-01-01,1,8,0.000,0,12,2,3,10,...,0.000,0.00,0.00000,0.00,0.000,0.00,0.000,0.00,0.000000,0.00
4,4,2013-01-01,1,13,0.000,0,12,2,3,10,...,0.000,0.00,0.00000,0.00,0.000,0.00,0.000,0.00,0.000000,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3054343,3000883,2017-08-15,9,6,438.133,0,12,2,1,4,...,525.224,47.57,291.82098,47.57,333.132,47.57,358.132,47.57,381.083426,47.57
3054344,3000884,2017-08-15,9,7,154.553,1,12,2,1,4,...,112.100,47.57,111.93000,47.57,123.465,47.57,112.954,47.57,121.605856,47.57
3054345,3000885,2017-08-15,9,9,2419.729,148,12,2,1,4,...,1453.078,47.57,1036.43900,47.57,1310.448,47.57,2299.715,47.57,1525.855714,47.57
3054346,3000886,2017-08-15,9,10,121.000,8,12,2,1,4,...,140.000,47.57,148.00000,47.57,83.000,47.57,170.000,47.57,144.571429,47.57


In [43]:
test

Unnamed: 0,id,date,store_nbr,family,onpromotion,city,state,type,cluster,dcoilwtico,...,weekofyear,holiday,oil_lag_1,oil_lag_2,oil_lag_3,oil_lag_4,oil_lag_5,oil_lag_6,oil_lag_7,oil_rolling_mean_7
0,3000888,2017-08-16,1,0,0,12,2,3,10,46.80,...,33,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,3000889,2017-08-16,1,1,0,12,2,3,10,46.80,...,33,1,46.80,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,3000890,2017-08-16,1,27,2,12,2,3,10,46.80,...,33,1,46.80,46.80,0.00,0.00,0.00,0.00,0.00,0.00
3,3000891,2017-08-16,1,8,20,12,2,3,10,46.80,...,33,1,46.80,46.80,46.80,0.00,0.00,0.00,0.00,0.00
4,3000892,2017-08-16,1,13,0,12,2,3,10,46.80,...,33,1,46.80,46.80,46.80,46.80,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28507,3029395,2017-08-31,9,6,1,12,2,1,4,47.26,...,35,1,47.26,47.26,47.26,47.26,47.26,47.26,47.26,47.26
28508,3029396,2017-08-31,9,7,0,12,2,1,4,47.26,...,35,1,47.26,47.26,47.26,47.26,47.26,47.26,47.26,47.26
28509,3029397,2017-08-31,9,9,1,12,2,1,4,47.26,...,35,1,47.26,47.26,47.26,47.26,47.26,47.26,47.26,47.26
28510,3029398,2017-08-31,9,10,9,12,2,1,4,47.26,...,35,1,47.26,47.26,47.26,47.26,47.26,47.26,47.26,47.26


In [45]:
# --- Model Training and Forecasting (LightGBM) ---
features = [col for col in train.columns if col not in ['id', 'date', 'sales']]
features_test = [col for col in test.columns if col not in ['id', 'date']] #create feature list for test dataframe.

target = 'sales'
train_data = train[train['date'] < '2017-08-01']
val_data = train[train['date'] >= '2017-08-01']
X_train, y_train = train_data[features], train_data[target]
X_val, y_val = val_data[features], val_data[target]
X_test = test[features_test] #use features_test here.

In [49]:
pip install lightgbm

Collecting lightgbmNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/5e/23/f8b28ca248bb629b9e08f877dd2965d1994e1674a03d67cd10c5246da248/lightgbm-4.6.0-py3-none-win_amd64.whl.metadata
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   - -------------------------------------- 0.1/1.5 MB 812.7 kB/s eta 0:00:02
   ------ --------------------------------- 0.2/1.5 MB 2.0 MB/s eta 0:00:01
   ---------- ----------------------------- 0.4/1.5 MB 2.4 MB/s eta 0:00:01
   ---------------- ----------------------- 0.6/1.5 MB 3.0 MB/s eta 0:00:01
   ----------------------- ---------------- 0.9/1.5 MB 3.2 MB/s eta 0:00:01
   ---------------------------- ----------- 1.0/1.5 MB 3.5 

In [50]:
pip install --user lightgbm

Note: you may need to restart the kernel to use updated packages.


In [51]:
import lightgbm as lgb
print(lgb.__version__)

4.6.0


In [57]:
import lightgbm as lgb
print(lgb.__version__)

import pandas as pd
import numpy as np

lgb_train = lgb.Dataset(pd.DataFrame({'a':[1,2,3]}), pd.Series([1,2,3]))
lgb_val = lgb.Dataset(pd.DataFrame({'a':[1,2,3]}), pd.Series([1,2,3]), reference=lgb_train)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1
}

model = lgb.train(params, lgb_train, num_boost_round=10, valid_sets=[lgb_train, lgb_val],)

4.6.0


In [62]:
import sys
print(sys.executable)

C:\Users\Admin\.conda\python\python.exe


In [64]:
!pip uninstall lightgbm -y # The -y skips the confirmation prompt.

ERROR: Invalid requirement: '#'


In [65]:
pip install lightgbm==3.3.2 # Or any version above 3.0

Note: you may need to restart the kernel to use updated packages.


ERROR: Invalid requirement: '#'


In [66]:
!pip install lightgbm==3.3.2

Collecting lightgbm==3.3.2
  Obtaining dependency information for lightgbm==3.3.2 from https://files.pythonhosted.org/packages/ba/24/2f83a1008c8add8cd9da03163f911be6b555eb2b9166b5ab74e1ad63ff40/lightgbm-3.3.2-py3-none-win_amd64.whl.metadata
  Downloading lightgbm-3.3.2-py3-none-win_amd64.whl.metadata (15 kB)
Downloading lightgbm-3.3.2-py3-none-win_amd64.whl (1.0 MB)
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
    --------------------------------------- 0.0/1.0 MB 640.0 kB/s eta 0:00:02
   -- ------------------------------------- 0.1/1.0 MB 660.6 kB/s eta 0:00:02
   -- ------------------------------------- 0.1/1.0 MB 660.6 kB/s eta 0:00:02
   -- ------------------------------------- 0.1/1.0 MB 660.6 kB/s eta 0:00:02
   -- ------------------------------------- 0.1/1.0 MB 302.7 kB/s eta 0:00:04
   -- ------------------------------------- 0.1/1.0 MB 302.7 kB/s eta 0:00:04
   -- ------------------------------------- 0.1/1.0 MB 302.7 kB/s eta 0:00:04
   -- ----------

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\Admin\\.conda\\python\\Lib\\site-packages\\~ightgbm\\bin\\lib_lightgbm.dll'
Consider using the `--user` option or check the permissions.



In [67]:
!pip uninstall lightgbm -y

Found existing installation: lightgbm 3.3.2
Uninstalling lightgbm-3.3.2:
  Successfully uninstalled lightgbm-3.3.2


In [68]:
!conda install -c conda-forge lightgbm -y

Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\Admin\.conda\python

  added / updated specs:
    - lightgbm


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2025.1.31  |       h56e8100_0         155 KB  conda-forge
    certifi-2025.1.31          |     pyhd8ed1ab_0         159 KB  conda-forge
    imbalanced-learn-0.12.2    |     pyhd8ed1ab_0         155 KB  conda-forge
    lightgbm-3.3.5             |  py311h12c1d0e_0         821 KB  conda-forge
    openssl-3.4.1              |       ha4e3fda_0         8.1 MB  conda-forge
    python_abi-3.11            |          2_cp311           5 KB  conda-forge
    ucrt-10.0.22621.0          |       h57928b3_1         547 KB  conda-forge
    vc14_runtime-14.42.34433   |      h635


The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/win-64::_anaconda_depends==2023.09=py311_mkl_1


  current version: 23.7.4
  latest version: 25.1.1

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=25.1.1



CondaHTTPError: HTTP 000 CONNECTION FAILED for url <https://conda.anaconda.org/conda-forge/win-64/lightgbm-3.3.5-py311h12c1d0e_0.conda>
Elapsed: -

An HTTP error occurred when trying to retrieve this URL.
HTTP errors are often intermittent, and a simple retry will get you on your way.

CancelledError()
CondaHTTPError: HTTP 000 CONNECTION FAILED for url <https://conda.anaconda.org/conda-forge/noarch/imbalanced-learn-0.12.2-pyhd8ed1ab_0.conda>
Elapsed: -

An HTTP error occurred when trying to retrieve this URL.
HTTP errors are often intermittent, and a simple retry wil

In [72]:
test['description'] = test['description'].astype(str)

In [73]:
test['description'] = test['description'].apply(lambda x: str(x) if not isinstance(x, int) else "unknown")

In [74]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# ... (Your code to load and preprocess data) ...

# Handle mixed data in 'description' column
test['description'] = test['description'].apply(lambda x: str(x) if not isinstance(x, int) else "unknown")

# Vectorize the 'description' column
vectorizer = TfidfVectorizer()
description_tfidf = vectorizer.fit_transform(test['description'])
description_df = pd.DataFrame(description_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
test = pd.concat([test.reset_index(drop=True), description_df.reset_index(drop=True)], axis=1)
test = test.drop('description', axis=1)

# ... (Rest of your code) ...

#### XGBoost (eXtreme Gradient Boosting) is a highly efficient and widely popular machine learning algorithm, particularly effective for structured or tabular data. It's a sophisticated implementation of the gradient boosting framework, known for its speed and performance. Here's a breakdown of its key aspects:

# model evalution

In [75]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
import pandas as pd
import numpy as np

# Example: Regression task (predicting a continuous value)
def xgboost_regression(X, y, test_size=0.2, random_state=42):
    """
    Trains and evaluates an XGBoost regression model.

    Args:
        X: Features (pandas DataFrame or NumPy array).
        y: Target variable (pandas Series or NumPy array).
        test_size: Proportion of the dataset to include in the test split.
        random_state: Seed for random number generation.

    Returns:
        A dictionary containing the trained model and the test RMSE.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    model = xgb.XGBRegressor(objective='reg:squarederror', random_state=random_state) # or reg:linear, reg:logistic for different problems.
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    return {'model': model, 'rmse': rmse}

# Example: Classification task (predicting a categorical value)
def xgboost_classification(X, y, test_size=0.2, random_state=42):
    """
    Trains and evaluates an XGBoost classification model.

    Args:
        X: Features (pandas DataFrame or NumPy array).
        y: Target variable (pandas Series or NumPy array).
        test_size: Proportion of the dataset to include in the test split.
        random_state: Seed for random number generation.

    Returns:
        A dictionary containing the trained model and the test accuracy.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    model = xgb.XGBClassifier(objective='binary:logistic', random_state=random_state) # or multi:softmax for multiple classes.
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return {'model': model, 'accuracy': accuracy}

# Example usage (replace with your data)
if __name__ == "__main__":
    # Example data (replace with your actual data)
    data = {'feature1': np.random.rand(100),
            'feature2': np.random.rand(100),
            'target_reg': np.random.rand(100),
            'target_class': np.random.randint(0, 2, 100)} #Example of binary classification.
    df = pd.DataFrame(data)

    X = df[['feature1', 'feature2']]
    y_reg = df['target_reg']
    y_class = df['target_class']

    # Regression example
    regression_results = xgboost_regression(X, y_reg)
    print(f"Regression RMSE: {regression_results['rmse']}")

    # Classification example
    classification_results = xgboost_classification(X, y_class)
    print(f"Classification Accuracy: {classification_results['accuracy']}")

    #Accessing the model.
    model_reg = regression_results['model']
    model_class = classification_results['model']

    #Example of making a prediction with the regression model.
    new_data = pd.DataFrame({'feature1': [0.5], 'feature2': [0.6]})
    prediction = model_reg.predict(new_data)
    print(f"Regression prediction: {prediction}")

Regression RMSE: 0.2576162480993028
Classification Accuracy: 0.4
Regression prediction: [0.53344864]


# Random forest 

In [76]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
import numpy as np

# Example: Random Forest Classifier (predicting a categorical value)
def random_forest_classification(X, y, test_size=0.2, random_state=42):
    """
    Trains and evaluates a Random Forest classification model.

    Args:
        X: Features (pandas DataFrame or NumPy array).
        y: Target variable (pandas Series or NumPy array).
        test_size: Proportion of the dataset to include in the test split.
        random_state: Seed for random number generation.

    Returns:
        A dictionary containing the trained model and the test accuracy.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return {'model': model, 'accuracy': accuracy}

# Example: Random Forest Regressor (predicting a continuous value)
def random_forest_regression(X, y, test_size=0.2, random_state=42):
    """
    Trains and evaluates a Random Forest regression model.

    Args:
        X: Features (pandas DataFrame or NumPy array).
        y: Target variable (pandas Series or NumPy array).
        test_size: Proportion of the dataset to include in the test split.
        random_state: Seed for random number generation.

    Returns:
        A dictionary containing the trained model and the test RMSE.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    model = RandomForestRegressor(random_state=random_state)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    return {'model': model, 'rmse': rmse}

# Example usage (replace with your data)
if __name__ == "__main__":
    # Example data (replace with your actual data)
    data = {'feature1': np.random.rand(100),
            'feature2': np.random.rand(100),
            'target_reg': np.random.rand(100),
            'target_class': np.random.randint(0, 2, 100)}
    df = pd.DataFrame(data)

    X = df[['feature1', 'feature2']]
    y_reg = df['target_reg']
    y_class = df['target_class']

    # Classification example
    classification_results = random_forest_classification(X, y_class)
    print(f"Classification Accuracy: {classification_results['accuracy']}")

    # Regression example
    regression_results = random_forest_regression(X, y_reg)
    print(f"Regression RMSE: {regression_results['rmse']}")

    #Accessing the model.
    model_reg = regression_results['model']
    model_class = classification_results['model']

    #Example of making a prediction with the regression model.
    new_data = pd.DataFrame({'feature1': [0.5], 'feature2': [0.6]})
    prediction = model_reg.predict(new_data)
    print(f"Regression prediction: {prediction}")

Classification Accuracy: 0.4
Regression RMSE: 0.3893908667447124
Regression prediction: [0.62695778]
