In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Read the raw sales history and the per-store attribute table from the working directory.
train, store = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'store.csv'))

# Preview the first rows of each table: sales history first, then store attributes.
for preview_frame in (train, store):
    print(preview_frame.head())



In [None]:
# Data Cleaning

# Attach the per-store attributes to every sales row.
# validate='many_to_one' raises if a Store id appears more than once in `store`,
# instead of silently duplicating sales rows.
train = train.merge(store, on='Store', validate='many_to_one')

# Convert Date to datetime so the .dt accessors can be used later
train['Date'] = pd.to_datetime(train['Date'])

# Report the number of missing values per column before imputing
print(train.isnull().sum())

# Impute by assignment rather than `train[col].fillna(..., inplace=True)`:
# the chained in-place form operates on a temporary Series, emits a FutureWarning on
# recent pandas versions and has no effect once Copy-on-Write is enabled.
train['CompetitionDistance'] = train['CompetitionDistance'].fillna(
    train['CompetitionDistance'].median()
)
# Every remaining missing value (in any column) is replaced with 0.
train = train.fillna(0)


In [None]:
# Basic Statistics and Visualizations

# Total sales summed across all rows sharing a date, drawn as a time series.
daily_totals = train.groupby('Date')['Sales'].sum()
fig_ts, ax_ts = plt.subplots(figsize=(14, 7))
daily_totals.plot(ax=ax_ts)
ax_ts.set_title('Total Sales over Time')
ax_ts.set_xlabel('Date')
ax_ts.set_ylabel('Sales')
plt.show()

# Histogram (50 bins) of the Sales column with a kernel density overlay.
fig_hist, ax_hist = plt.subplots(figsize=(10, 6))
sns.histplot(train['Sales'], bins=50, kde=True, ax=ax_hist)
ax_hist.set_title('Distribution of Sales')
plt.show()


In [None]:
# Exploring Sales by Store
# One box per Store value showing the spread of its Sales.
fig_box, ax_box = plt.subplots(figsize=(14, 7))
sns.boxplot(data=train, x='Store', y='Sales', ax=ax_box)
ax_box.set_title('Sales Distribution by Store')
# Remove the x tick marks and labels entirely.
ax_box.set_xticks([])
plt.show()


In [None]:
# Creating New Features
# Calendar features derived from the sale date.
train['Year'] = train['Date'].dt.year
train['Month'] = train['Date'].dt.month
train['Day'] = train['Date'].dt.day
# isocalendar().week is returned with pandas' nullable UInt32 extension dtype;
# cast it to plain int64 so every feature column is an ordinary NumPy numeric type
# (some LightGBM versions reject extension dtypes).
train['WeekOfYear'] = train['Date'].dt.isocalendar().week.astype('int64')

# One-hot encode the categorical columns; drop_first removes one level per column
# to avoid a redundant indicator.
train = pd.get_dummies(
    train,
    columns=['DayOfWeek', 'StoreType', 'Assortment', 'PromoInterval'],
    drop_first=True,
)

# The generated dummy names embed the raw category text, and PromoInterval values
# contain commas. LightGBM refuses feature names with JSON special characters
# (", , : [ ] { }), so replace anything outside [0-9A-Za-z_] with an underscore.
train.columns = train.columns.str.replace(r'[^0-9A-Za-z_]', '_', regex=True)

# Drop columns that are not used as model inputs.
# NOTE(review): Customers is presumably removed because it is not known at
# prediction time - confirm against the test file schema.
train = train.drop(columns=['Date', 'Customers', 'StateHoliday'])

# Display the first few rows of the modified dataset
print(train.head())


In [None]:
# Split Data into Training and Testing Sets
# Target is the Sales column; every other remaining column is a feature.
y = train['Sales']
X = train.drop(columns=['Sales'])

# Hold out 20% of the rows for validation (random split, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shapes of the four resulting pieces on one line.
print(*(piece.shape for piece in (X_train, X_val, y_train, y_val)))


In [None]:
# Train LightGBM Model
# Wrap the splits in LightGBM datasets; `reference` makes the validation set reuse
# the feature binning computed for the training set.
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# Define parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# Train model.
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds=` / `verbose_eval=` keyword arguments were deprecated in
# LightGBM 3.3 and removed from lgb.train() in 4.0, where they raise a TypeError.
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, val_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),  # stop after 50 rounds without validation improvement
        lgb.log_evaluation(period=100),          # print metrics every 100 rounds
    ],
)

# Display training completion message
print("Model training completed.")


In [None]:
# Evaluate Model Performance
# Score the held-out rows using the iteration chosen by early stopping.
y_pred = model.predict(X_val, num_iteration=model.best_iteration)

# Root mean squared error on the validation set.
val_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(val_mse)
print(f"RMSE: {rmse:.2f}")

# Scatter of actual against predicted sales.
fig_eval, ax_eval = plt.subplots(figsize=(10, 6))
ax_eval.scatter(y_val, y_pred, alpha=0.3)
ax_eval.set_xlabel('Actual Sales')
ax_eval.set_ylabel('Predicted Sales')
ax_eval.set_title('Actual vs Predicted Sales')
plt.show()


In [None]:
# Load Test Data and Make Predictions
# Load test data
test = pd.read_csv('test.csv')

# Left join so every test row is kept, in file order, even if a store were missing
# from `store`; validate= guards against duplicated store rows multiplying test rows.
# The predictions are later assigned positionally, so row count and order matter.
test = test.merge(store, on='Store', how='left', validate='many_to_one')

# Convert Date to datetime
test['Date'] = pd.to_datetime(test['Date'])

# Calendar features, built the same way as for the training frame.
test['Year'] = test['Date'].dt.year
test['Month'] = test['Date'].dt.month
test['Day'] = test['Date'].dt.day
# Cast the nullable UInt32 week number to plain int64 (see training features).
test['WeekOfYear'] = test['Date'].dt.isocalendar().week.astype('int64')

# Fill missing values BEFORE encoding, in the same order as the training pipeline,
# so that a missing PromoInterval is treated as the same category in both frames.
# Assignment is used instead of the chained `inplace=True` form, which is unreliable
# on recent pandas versions.
# NOTE(review): any other missing value (for example a missing Open flag, if the
# test file has one) also becomes 0 here - confirm that is the intended meaning.
test['CompetitionDistance'] = test['CompetitionDistance'].fillna(
    test['CompetitionDistance'].median()
)
test = test.fillna(0)

# Encode every category level (no drop_first): which level is "first" depends on the
# values present in this particular frame, so the training column list - not this
# frame - decides which indicators are kept.
test = pd.get_dummies(test, columns=['DayOfWeek', 'StoreType', 'Assortment', 'PromoInterval'])

# Align the test matrix with the exact columns (names and order) the model was
# trained on. Names are compared after replacing special characters with '_' so the
# match works whether or not the training column names were normalised for LightGBM.
# Columns the model never saw (Id, Date, StateHoliday, baseline dummies) are dropped
# by the reindex; indicators absent from the test data are added as all-zero columns.
special_chars = r'[^0-9A-Za-z_]'
train_name_for = dict(
    zip(X_train.columns.str.replace(special_chars, '_', regex=True), X_train.columns)
)
test.columns = test.columns.str.replace(special_chars, '_', regex=True)
test = test.rename(columns=train_name_for).reindex(columns=X_train.columns, fill_value=0)

# Make predictions with the iteration selected by early stopping
predictions = model.predict(test, num_iteration=model.best_iteration)

# Display predictions
print(predictions[:10])


In [None]:
# Visualizing Predictions
# Start from the sample submission layout and put the model output in its Sales column
# (values are assigned by position, in the order the predictions were produced).
submission = pd.read_csv('sample_submission.csv').assign(Sales=predictions)

# Persist the submission without the DataFrame index.
submission.to_csv('sales_predictions.csv', index=False)

# Histogram (50 bins) of the predicted sales with a kernel density overlay.
fig_pred, ax_pred = plt.subplots(figsize=(10, 6))
sns.histplot(submission['Sales'], bins=50, kde=True, ax=ax_pred)
ax_pred.set_title('Distribution of Predicted Sales')
plt.show()
