In [3]:
from datetime import datetime
from pathlib import Path

import lightgbm as lgb
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read the training set from disk.
train = pd.read_csv('data/train.csv')

# Target first, then the feature matrix: every column except the target
# itself and the row identifier.
y = train['rainfall']
X = train.drop(columns=['rainfall', 'id'])



In [5]:
# Print a summary of the feature frame: column names, non-null counts,
# dtypes and memory usage.
X.info()
# Disabled: the same summary for the target series.
#y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2190 entries, 0 to 2189
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   day            2190 non-null   int64  
 1   pressure       2190 non-null   float64
 2   maxtemp        2190 non-null   float64
 3   temparature    2190 non-null   float64
 4   mintemp        2190 non-null   float64
 5   dewpoint       2190 non-null   float64
 6   humidity       2190 non-null   float64
 7   cloud          2190 non-null   float64
 8   sunshine       2190 non-null   float64
 9   winddirection  2190 non-null   float64
 10  windspeed      2190 non-null   float64
dtypes: float64(10), int64(1)
memory usage: 188.3 KB


In [None]:
# One seed shared by the train/validation split and the model so a re-run
# reproduces the same numbers.
SEED = 42

# Split the data into training and validation sets (20% held out).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Standardize the target variable. The scaler is fitted on the training target
# only and then applied to the validation target, so no hold-out statistics
# leak into training.
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_val_scaled = scaler_y.transform(y_val.values.reshape(-1, 1))

# Assign columns to preprocessing branches by dtype.
# 'number' matches every numeric dtype, not just float64/int64, so a column
# stored as e.g. int32 or float32 is scaled as well instead of falling through
# to `remainder` unscaled.
# The frame currently has no object columns, so the categorical branch selects
# nothing here.
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include='number').columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_pipeline', Pipeline(steps=[
            ('scaler', StandardScaler())
        ]), numerical_cols),
        ('categorigal_pipeline', Pipeline(steps=[
            # Ignore categories that were not seen during fit so that validation
            # or test rows with a new level do not raise at predict time.
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols)
    ],
    # Columns matched by neither branch are forwarded to the model unchanged.
    remainder='passthrough'
)

# Create the pipeline: preprocessing followed by a LightGBM regressor.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', lgb.LGBMRegressor(random_state=SEED))
])

# Train the model on the standardized target (ravel: the estimator expects 1-D y).
# NOTE(review): no eval_set is passed, so the 'rmse' eval metric is presumably
# never evaluated during fit -- confirm against the LightGBM docs.
pipeline.fit(X_train, y_train_scaled.ravel(), model__eval_metric='rmse')

# Predict on validation set (still on the standardized target scale).
y_pred_scaled = pipeline.predict(X_val)

# Inverse transform the predictions to get them back to the original scale.
# Flatten to 1-D so y_pred has the same shape as y_val, matching how the
# test-set predictions are handled.
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

# Calculate RMSE on the original target scale.
rmse = root_mean_squared_error(y_val, y_pred)
print(f'RMSE: {rmse}')

In [9]:

# Load test data
test = pd.read_csv('data/test.csv')

# Predict rainfall (on the standardized target scale). 'id' is dropped because
# it is an identifier, not a feature the pipeline was trained on.
predictions_scaled = pipeline.predict(test.drop('id', axis=1))

# Map the predictions back to the original target scale and flatten to 1-D
# so they can be used as a DataFrame column.
predictions = scaler_y.inverse_transform(predictions_scaled.reshape(-1, 1)).flatten()

# Create a DataFrame with the predictions
results = pd.DataFrame({'id': test['id'], 'rainfall': predictions})

# Generate filename with datetime so earlier submissions are never overwritten
filename = f'submissions/lightGBM_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'

# to_csv does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail at the very last step.
Path(filename).parent.mkdir(parents=True, exist_ok=True)

# Save the results to a CSV file
results.to_csv(filename, index=False)

# Display a message indicating the predictions have been saved
print(f'Predictions saved to {filename}')


Predictions saved to submissions/lightGBM_20250302_095538.csv


