In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [2]:
# Load Data
train_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/DS/Dataset/Train.csv")
test_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/DS/Dataset/Test.csv")


In [3]:
# Data Exploration
print(train_df.info())
print(train_df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Holiday             40 non-null     float64
 1   Temperature         30000 non-null  float64
 2   Rainfall_last_hour  30000 non-null  float64
 3   Snowfall_last_hour  30000 non-null  float64
 4   Cloud_Cover         30000 non-null  int64  
 5   Weather             30000 non-null  object 
 6   Weather_Desc        30000 non-null  object 
 7   TimeStamp           30000 non-null  object 
 8   Date                30000 non-null  object 
 9   Traffic_Vol         30000 non-null  float64
dtypes: float64(5), int64(1), object(4)
memory usage: 2.3+ MB
None
   Holiday  Temperature  Rainfall_last_hour  Snowfall_last_hour  Cloud_Cover  \
0      NaN       289.28                 0.0                 0.0           40   
1      NaN       290.26                 0.0                 0.0           75

In [4]:
# Handle Missing Values
train_df.fillna(method='ffill', inplace=True)
test_df.fillna(method='ffill', inplace=True)


  train_df.fillna(method='ffill', inplace=True)
  test_df.fillna(method='ffill', inplace=True)


In [5]:
# # Feature Engineering: Extract time-based features
# train_df['Timestamp'] = pd.to_datetime(train_df['TimeStamp'])
# test_df['Timestamp'] = pd.to_datetime(test_df['TimeStamp'])

In [6]:
# Feature Engineering: Extract time-based features
train_df['Timestamp'] = pd.to_datetime(train_df['TimeStamp'], format='%d-%m-%Y %H:%M', errors='coerce') # Specify the correct format
test_df['Timestamp'] = pd.to_datetime(test_df['TimeStamp'], format='%d-%m-%Y %H:%M', errors='coerce') # Specify the correct format

In [7]:

for df in [train_df, test_df]:
    df['Hour'] = df['Timestamp'].dt.hour
    df['Day'] = df['Timestamp'].dt.day
    df['Month'] = df['Timestamp'].dt.month
    df['Year'] = df['Timestamp'].dt.year
    df['Weekday'] = df['Timestamp'].dt.weekday
    df.drop(columns=['TimeStamp', 'Timestamp', 'Date'], inplace=True)

In [10]:
# Encode Categorical Variables
# Handle unseen labels during transform
le = LabelEncoder()
categorical_features = ['Weather', 'Weather_Desc']

for feature in categorical_features:
    # Convert all values to strings before fitting
    train_df[feature] = train_df[feature].astype(str)
    test_df[feature] = test_df[feature].astype(str)

    # Fit on combined unique values from both train and test
    all_values = pd.concat([train_df[feature], test_df[feature]]).unique()
    le.fit(all_values)

    # Transform both train and test
    train_df[feature] = le.transform(train_df[feature])
    test_df[feature] = le.transform(test_df[feature])

In [11]:
# Select Features
X = train_df.drop(columns=['Traffic_Vol'])
y = train_df['Traffic_Vol']
X_test = test_df.copy()

In [12]:
X

Unnamed: 0,Holiday,Temperature,Rainfall_last_hour,Snowfall_last_hour,Cloud_Cover,Weather,Weather_Desc,Hour,Day,Month,Year,Weekday
0,,289.28,0.0,0.0,40,3,19,9,2,10,2015,4
1,,290.26,0.0,0.0,75,3,9,10,2,10,2015,4
2,,290.28,0.0,0.0,90,3,10,11,2,10,2015,4
3,,290.33,0.0,0.0,90,3,10,12,2,10,2015,4
4,,292.14,0.0,0.0,75,3,9,13,2,10,2015,4
...,...,...,...,...,...,...,...,...,...,...,...,...
29995,1.0,302.62,0.0,0.0,5,2,3,20,15,7,2020,2
29996,1.0,300.69,0.0,0.0,6,2,3,21,15,7,2020,2
29997,1.0,298.67,0.0,0.0,1,2,3,22,15,7,2020,2
29998,1.0,297.23,0.0,0.0,1,2,3,23,15,7,2020,2


In [13]:
y

Unnamed: 0,Traffic_Vol
0,7639.32
1,6300.32
2,6621.42
3,6958.12
4,6824.22
...,...
29995,4393.22
29996,4622.02
29997,4566.12
29998,3333.72


In [14]:
X_test

Unnamed: 0,Holiday,Temperature,Rainfall_last_hour,Snowfall_last_hour,Cloud_Cover,Weather,Weather_Desc,Hour,Day,Month,Year,Weekday
0,,295.49,0,0,1,2,3,1,16,7,2020,3
1,,294.49,0,0,2,2,3,2,16,7,2020,3
2,,293.92,0,0,1,2,3,3,16,7,2020,3
3,,292.82,0,0,1,2,3,4,16,7,2020,3
4,,292.02,0,0,2,2,3,5,16,7,2020,3
...,...,...,...,...,...,...,...,...,...,...,...,...
8368,1.0,301.38,0,0,90,3,10,19,30,6,2021,2
8369,1.0,300.28,0,0,91,3,10,20,30,6,2021,2
8370,1.0,300.48,0,0,91,3,10,21,30,6,2021,2
8371,1.0,299.08,0,0,75,3,9,22,30,6,2021,2


In [15]:
# Scale Numeric Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

In [16]:
# Split Data
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [17]:
# Train Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [18]:
# Evaluate Model
y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'Validation RMSE: {rmse}')

Validation RMSE: 568.9323638380174


In [19]:
# Predict on Test Data
test_predictions = model.predict(X_test_scaled)
test_df['Traffic_Vol'] = test_predictions

In [20]:
# Save Predictions
test_df[['Traffic_Vol']].to_csv('submission.csv', index=False)

In [21]:
y_baseline = np.full_like(y_val, y_train.mean())

In [22]:
y_baseline

array([4670.5706125, 4670.5706125, 4670.5706125, ..., 4670.5706125,
       4670.5706125, 4670.5706125])

In [23]:
import numpy as np
from sklearn.metrics import mean_squared_error

y_baseline = np.full_like(y_val, y_train.mean())  # Predicting mean of training data
baseline_rmse = mean_squared_error(y_val, y_baseline) ** 0.5

print(f"Baseline RMSE: {baseline_rmse}")
print(f"Your Model RMSE: {rmse}")


Baseline RMSE: 2598.8383614288905
Your Model RMSE: 568.9323638380174


In [24]:
from xgboost import XGBRegressor
# Train XGBoost Model
model = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=7, random_state=42)
model.fit(X_train, y_train)


In [25]:
# Evaluate Model
y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'Validation RMSE: {rmse}')

Validation RMSE: 559.293090202154


# **Modification**

In [26]:
from sklearn.linear_model import LinearRegression


In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

from google.colab import drive
drive.mount('/content/drive')

# Load Data
train_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/DS/Dataset/Train.csv")
test_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/DS/Dataset/Test.csv")

# Handle Missing Values
train_df.fillna(method='ffill', inplace=True)
test_df.fillna(method='ffill', inplace=True)







Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  train_df.fillna(method='ffill', inplace=True)
  test_df.fillna(method='ffill', inplace=True)


In [28]:
# Convert TimeStamp to Datetime
train_df['Timestamp'] = pd.to_datetime(train_df['TimeStamp'], format='%d-%m-%Y %H:%M', errors='coerce') # Specify the correct format
test_df['Timestamp'] = pd.to_datetime(test_df['TimeStamp'], format='%d-%m-%Y %H:%M', errors='coerce') # Specify the correct format

for df in [train_df, test_df]:
    df['Hour'] = df['Timestamp'].dt.hour
    df['Day'] = df['Timestamp'].dt.day
    df['Month'] = df['Timestamp'].dt.month
    df['Year'] = df['Timestamp'].dt.year
    df['Weekday'] = df['Timestamp'].dt.weekday
    df['Is_Weekend'] = df['Weekday'].apply(lambda x: 1 if x in [5, 6] else 0)
    df['Rush_Hour'] = df['Hour'].apply(lambda x: 1 if (7 <= x <= 10) or (16 <= x <= 19) else 0)
    df['Day_of_Year'] = df['Timestamp'].dt.dayofyear
    df['Week_of_Year'] = df['Timestamp'].dt.isocalendar().week
    df.drop(columns=['TimeStamp', 'Timestamp', 'Date'], inplace=True, errors='ignore')

# Weather Features
for df in [train_df, test_df]:
    df['Temp_Bin'] = pd.cut(df['Temperature'], bins=[-10, 5, 20, 40], labels=['Cold', 'Moderate', 'Hot'])
    df['Rain_Indicator'] = df['Weather_Desc'].apply(lambda x: 1 if 'rain' in x.lower() else 0)
    df['Snow_Indicator'] = df['Weather_Desc'].apply(lambda x: 1 if 'snow' in x.lower() else 0)
    df['Cloud_Cover_Cat'] = pd.cut(df['Cloud_Cover'], bins=[0, 30, 70, 100], labels=['Low', 'Medium', 'High'])

# # Encode Categorical Variables
# le = LabelEncoder()
# categorical_features = ['Weather', 'Weather_Desc', 'Temp_Bin', 'Cloud_Cover_Cat']
# for feature in categorical_features:
#     train_df[feature] = le.fit_transform(train_df[feature].astype(str))
#     test_df[feature] = le.transform(test_df[feature].astype(str))

In [29]:
#df.drop(columns=['TimeStamp', 'Timestamp', 'Date'], inplace=True)

In [30]:
# Encode Categorical Variables
le = LabelEncoder()
categorical_features = ['Weather', 'Weather_Desc', 'Temp_Bin', 'Cloud_Cover_Cat']
for feature in categorical_features:
    # Fit on combined unique values from both train and test
    all_values = pd.concat([train_df[feature].astype(str), test_df[feature].astype(str)]).unique()
    le.fit(all_values)

    # Transform both train and test
    train_df[feature] = le.transform(train_df[feature].astype(str))
    test_df[feature] = le.transform(test_df[feature].astype(str))

In [31]:
# Lag Features
train_df['Lag_Traffic_1'] = train_df['Traffic_Vol'].shift(1)
train_df['Rolling_Mean_3'] = train_df['Traffic_Vol'].rolling(window=3).mean()
train_df['Rolling_Std_3'] = train_df['Traffic_Vol'].rolling(window=3).std()
train_df.fillna(0, inplace=True)  # Fill NaN from rolling stats

# Prepare Features
X = train_df.drop(columns=['Traffic_Vol'])
y = train_df['Traffic_Vol']
X_test = test_df.copy()



In [32]:
# Lag Features for Test Data
X_test['Lag_Traffic_1'] = X_test['Holiday'].shift(1)  # Or any relevant feature for lagging
X_test['Rolling_Mean_3'] = X_test['Holiday'].rolling(window=3).mean()  # Or any relevant feature for rolling mean
X_test['Rolling_Std_3'] = X_test['Holiday'].rolling(window=3).std()  # Or any relevant feature for rolling std

X_test.fillna(0, inplace=True)  # Fill NaN from rolling stats

# Scale Data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# ... rest of your code

In [33]:


# Train-Test Split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train XGBoost Model
model = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=7, random_state=42)
model.fit(X_train, y_train)

# Evaluate Model
y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'Validation RMSE: {rmse}')

Validation RMSE: 195.28335994232103
