In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OrdinalEncoder 
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
df = pd.read_csv('day_cat.csv')

In [3]:
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,spring,0,1,0,saturday,0,mist,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,spring,0,1,0,sunday,0,mist,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,spring,0,1,0,monday,1,clear,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,spring,0,1,0,tuesday,1,clear,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,spring,0,1,0,wednesday,1,clear,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [4]:
df=df.drop(labels=['dteday','instant','atemp','casual','registered'],axis=1)
df.head()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,hum,windspeed,cnt
0,spring,0,1,0,saturday,0,mist,0.344167,0.805833,0.160446,985
1,spring,0,1,0,sunday,0,mist,0.363478,0.696087,0.248539,801
2,spring,0,1,0,monday,1,clear,0.196364,0.437273,0.248309,1349
3,spring,0,1,0,tuesday,1,clear,0.2,0.590435,0.160296,1562
4,spring,0,1,0,wednesday,1,clear,0.226957,0.436957,0.1869,1600


In [5]:
X = df.drop(labels=['cnt'],axis=1)
Y = df[['cnt']]

In [6]:
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [7]:
categorical_cols

Index(['season', 'weekday', 'weathersit'], dtype='object')

In [8]:
numerical_cols

Index(['yr', 'mnth', 'holiday', 'workingday', 'temp', 'hum', 'windspeed'], dtype='object')

In [9]:
season_map = {'spring':1, 'summer':2, 'fall':3, 'winter':4}
week_map = {'saturday':6, 'sunday':0, 'monday':1, 'tuesday':2, 'wednesday':3, 'thursday':4,
       'friday':5}
weather_map={'mist':2, 'clear':1, 'light_snow':3}

In [10]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical Pipeline
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder', OrdinalEncoder(categories=[season_map, week_map, weather_map]))
    # No scaler here since OrdinalEncoder outputs are already numerical and scaled
])

# Preprocessor Pipeline
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat_pipeline', cat_pipeline, categorical_cols)
])


In [16]:
df['weathersit'].unique()

array(['mist', 'clear', 'light_snow'], dtype=object)

In [11]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [12]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

IndexError: too many indices for array: array is 0-dimensional, but 1 were indexed