In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
import zipfile

# Load dataset
# The path is relative to the current working directory of the notebook.
df = pd.read_csv('REAL_DATA_LABELS.csv')

# Display first few rows
print(df.head())

# Data Exploration
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())

# Data Preprocessing

# Drop redundant or non-informative columns
df.drop(['index', 'date'], axis=1, inplace=True)

# Handle missing values by discarding every row that contains a NaN.
# Alternatively, forward-fill with df.ffill() (the older
# fillna(method='ffill') spelling is deprecated in recent pandas releases).
df = df.dropna()

# Encode categorical variables
# NOTE(review): store_ID_x and store_ID_y hold the same value in every row
# printed by df.head(); if they are always identical, encoding both only
# duplicates the dummy columns -- confirm against the full dataset.
categorical_cols = ['state_holiday', 'store_ID_x', 'store_ID_y', 'day_of_week']

# state_holiday is normalised to strings before encoding. If the CSV yields a
# mix of the integer 0 and string codes in that column, get_dummies would
# otherwise emit two distinct columns that are both named 'state_holiday_0',
# and duplicate feature names break model fitting. When the column is already
# uniform this cast does not change the resulting dummy columns.
df = pd.get_dummies(
    df.astype({'state_holiday': str}),
    columns=categorical_cols,
    drop_first=True,  # drop one level per feature to avoid redundant columns
)

# Separate features and target
X = df.drop('sales', axis=1)
y = df['sales']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an XGBoost regressor (100 trees, learning rate 0.1) on the training part.
model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
model.fit(X_train, y_train)

# Predict sales for the held-out rows.
predictions = model.predict(X_test)

# Score the held-out predictions with the coefficient of determination.
r2 = r2_score(y_true=y_test, y_pred=predictions)
print(f"R2 Score: {r2:.4f}")

# Prepare Deliverables

# Create CSV with predictions
# Predictions are produced for every row of X (training and held-out rows
# alike) and stored as a new column next to the encoded features.
df['predicted_sales'] = model.predict(X)
df.to_csv('G1.csv', index=False)

# Save R2 score to txt file
# The encoding is stated explicitly so the file does not depend on the
# platform's default locale encoding.
with open('R2_score.txt', 'w', encoding='utf-8') as f:
    f.write(f"R2 Score: {r2:.4f}")

# Create ZIP file with the CSV and R2 score
# ZipFile defaults to ZIP_STORED (no compression); the one-hot encoded CSV is
# wide and highly repetitive, so DEFLATE is requested to keep the archive small.
with zipfile.ZipFile(
    'IronKaggle_Submission.zip', 'w', compression=zipfile.ZIP_DEFLATED
) as zipf:
    zipf.write('G1.csv')
    zipf.write('R2_score.txt')

print("Deliverables created and zipped successfully.")


    index  store_ID_x  store_ID_y  day_of_week        date  \
0  272371         415         415            7  01/03/2015   
1  558468          27          27            7  29/12/2013   
2   76950         404         404            3  19/03/2014   
3   77556         683         683            2  29/01/2013   
4  456344         920         920            3  19/03/2014   

   nb_customers_on_day  open  promotion state_holiday  school_holiday  sales  
0                    0     0          0             0               0      0  
1                    0     0          0             0               0      0  
2                  657     1          1             0               0   5483  
3                  862     1          0             0               0   9325  
4                  591     1          1             0               0   5402  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71205 entries, 0 to 71204
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dt