In [2]:
import pandas as pd

# Load the uploaded dataset to examine its structure
file_path = "Train.csv"
train_data = pd.read_csv(file_path)

# Print the column summary. DataFrame.info() writes to stdout and returns
# None, so it is called as a statement rather than placed in the displayed
# expression (where it would show up as a stray `None`).
train_data.info()

# Display the first few rows as the cell's output
train_data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38373 entries, 0 to 38372
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Holiday             50 non-null     float64
 1   Temperature         38373 non-null  float64
 2   Rainfall_last_hour  38373 non-null  float64
 3   Snowfall_last_hour  38373 non-null  float64
 4   Cloud_Cover         38373 non-null  int64  
 5   Weather             38373 non-null  object 
 6   Weather_Desc        38373 non-null  object 
 7   TimeStamp           38373 non-null  object 
 8   Date                38373 non-null  object 
 9   Traffic_Vol         38373 non-null  int64  
dtypes: float64(4), int64(2), object(4)
memory usage: 2.9+ MB


(   Holiday  Temperature  Rainfall_last_hour  Snowfall_last_hour  Cloud_Cover  \
 0      NaN       289.28                 0.0                 0.0           40   
 1      NaN       290.26                 0.0                 0.0           75   
 2      NaN       290.28                 0.0                 0.0           90   
 3      NaN       290.33                 0.0                 0.0           90   
 4      NaN       292.14                 0.0                 0.0           75   
 
         Weather         Weather_Desc       TimeStamp      Date  Traffic_Vol  
 0  Cloudy skies  Partly cloudy skies   02/10/08 9:00  02/10/08         5555  
 1  Cloudy skies    Fragmented clouds  02/10/08 10:00  02/10/08         4525  
 2  Cloudy skies     Full cloud cover  02/10/08 11:00  02/10/08         4772  
 3  Cloudy skies     Full cloud cover  02/10/08 12:00  02/10/08         5031  
 4  Cloudy skies    Fragmented clouds  02/10/08 13:00  02/10/08         4928  ,
 None)

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Preprocessing and feature engineering
def preprocess_data(data, categories=None):
    """Build the model-ready feature frame from a raw traffic DataFrame.

    The input frame is not modified; all work happens on a copy.

    Steps:
      * missing 'Holiday' values are replaced with 0,
      * 'TimeStamp' is parsed with the format '%d/%m/%y %H:%M' and expanded
        into 'Hour', 'Day', 'Month', 'Year' and 'Weekday' (Monday = 0),
      * 'Weather' and 'Weather_Desc' are integer-encoded into
        'Weather_Encoded' and 'Weather_Desc_Encoded',
      * 'TimeStamp', 'Date', 'Weather' and 'Weather_Desc' are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing at least 'Holiday', 'TimeStamp', 'Date',
        'Weather' and 'Weather_Desc'. Any other columns pass through.
    categories : dict or None, optional
        Maps 'Weather' and/or 'Weather_Desc' to an ordered list of category
        values; a value's code is its position in that list, and values not
        in the list are encoded as -1. For a column without an entry (or when
        this is None) the categories are the sorted unique values found in
        `data` itself. Pass the training categories when preprocessing other
        data so that both frames share the same codes.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered features.
    """
    # Work on a copy so the caller's DataFrame is left untouched
    data = data.copy()

    # Handle missing values in 'Holiday' (replace NaN with 0)
    data['Holiday'] = data['Holiday'].fillna(0)  # Assuming 0 = No Holiday

    # Convert 'TimeStamp' to datetime and extract time-based features
    data['TimeStamp'] = pd.to_datetime(data['TimeStamp'], format='%d/%m/%y %H:%M')
    timestamp = data['TimeStamp'].dt
    data['Hour'] = timestamp.hour
    data['Day'] = timestamp.day
    data['Month'] = timestamp.month
    data['Year'] = timestamp.year
    data['Weekday'] = timestamp.weekday  # Monday = 0, Sunday = 6

    # Encode categorical variables as integer codes
    for column in ('Weather', 'Weather_Desc'):
        known = None if categories is None else categories.get(column)
        if known is None:
            # No categories supplied: learn them from this frame, in sorted order
            known = sorted(data[column].unique())
        codes = pd.Categorical(data[column], categories=known).codes
        data[f'{column}_Encoded'] = codes.astype('int64')

    # Drop unused columns
    data = data.drop(columns=['TimeStamp', 'Date', 'Weather', 'Weather_Desc'])

    return data

# Run the raw training frame through the preprocessing step
processed_data = preprocess_data(train_data)

# Target is the traffic volume; every remaining column is a predictor
y = processed_data.loc[:, 'Traffic_Vol']
X = processed_data.loc[:, processed_data.columns != 'Traffic_Vol']

# Hold out 20% of the rows for validation (seeded so the split is repeatable)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

(X_train.head(), y_train.head())


(       Holiday  Temperature  Rainfall_last_hour  Snowfall_last_hour  \
 24782      0.0      266.170                 0.0                 0.0   
 26404      0.0      265.729                 0.0                 0.0   
 2331       0.0      274.430                 0.0                 0.0   
 32798      0.0      262.650                 0.0                 0.0   
 10917      0.0      271.130                 0.0                 0.0   
 
        Cloud_Cover  Hour  Day  Month  Year  Weekday  Weather_Encoded  \
 24782           90    23    8     12  2012        5                8   
 26404            0     4   15      2  2013        4                2   
 2331            90    21   11      1  2009        6                7   
 32798            1    22    9     11  2013        5                2   
 10917           75     6    7      3  2010        6                3   
 
        Weather_Desc_Encoded  
 24782                     8  
 26404                     3  
 2331                     25  
 3

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Build a 100-tree random forest (seeded) and fit it on the training split
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Score the held-out validation rows with mean absolute error
y_pred = rf_model.predict(X_val)
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)

# Pair each feature with its importance, most important first
feature_importance = pd.DataFrame(
    {
        'Feature': X_train.columns,
        'Importance': rf_model.feature_importances_,
    }
)
feature_importance = feature_importance.sort_values('Importance', ascending=False)

print("MAE:", mae)
print(feature_importance)


MAE: 233.35093941368078
                 Feature  Importance
5                   Hour    0.831961
9                Weekday    0.105226
1            Temperature    0.018548
6                    Day    0.015754
7                  Month    0.010294
8                   Year    0.005975
4            Cloud_Cover    0.004633
11  Weather_Desc_Encoded    0.003247
10       Weather_Encoded    0.002996
2     Rainfall_last_hour    0.001300
3     Snowfall_last_hour    0.000045
0                Holiday    0.000020


In [7]:
# Read the held-out test set from disk
test_file_path = "Test.csv"
test_data = pd.read_csv(filepath_or_buffer=test_file_path)

# Preview the leading rows to check the column layout
test_data.head()


Unnamed: 0,Holiday,Temperature,Rainfall_last_hour,Snowfall_last_hour,Cloud_Cover,Weather,Weather_Desc,TimeStamp,Date,Traffic_Vol
0,,297.65,0.0,0,1,Clear skies,Clear skies,01/07/14 0:00,01/07/14,
1,,297.45,0.0,0,1,Clear skies,Clear skies,01/07/14 1:00,01/07/14,
2,,296.75,0.0,0,1,Rainfall,Soft rain,01/07/14 2:00,01/07/14,
3,,296.42,0.0,0,1,Rainfall,Steady rain,01/07/14 3:00,01/07/14,
4,,295.56,0.0,0,40,Rainfall,Soft rain,01/07/14 4:00,01/07/14,


In [8]:
# Read the submission template to see which columns it expects
submission_file_path = "Submission.csv"
submission_data = pd.read_csv(filepath_or_buffer=submission_file_path)

# Preview the leading rows of the template
submission_data.head()


Unnamed: 0,Traffic_Vol
0,0
1,0
2,0
3,0
4,0


In [13]:

# Preprocess the test data (reuse the preprocessing function)
processed_test_data = preprocess_data(test_data)

# Re-encode the categorical columns with the categories of the TRAINING data.
# Codes derived from the test set alone can differ from the ones the model was
# trained on whenever the two sets do not contain the same category values.
# Codes are positions in the sorted list of training categories; values that
# never occur in training get -1.
for column in ('Weather', 'Weather_Desc'):
    train_categories = sorted(train_data[column].unique())
    code_by_category = {category: code for code, category in enumerate(train_categories)}
    processed_test_data[f'{column}_Encoded'] = (
        test_data[column].map(code_by_category).fillna(-1).astype('int64')
    )

# Keep exactly the features the model was trained on, in the same order.
# This drops the empty 'Traffic_Vol' column and any extra column (such as a
# saved index) and raises a KeyError if a training feature is missing.
X_test = processed_test_data[list(X_train.columns)]

# Assign the trained rf_model to best_model
best_model = rf_model

# Predict using the trained model
test_predictions = best_model.predict(X_test)

# Overwrite the template's prediction column. The column is named
# 'Traffic_Vol'; writing to any other name would leave the zeros in place.
submission_data['Traffic_Vol'] = test_predictions

# Save the updated submission file
submission_data.to_csv('Submission.csv', index=False)

print("Predictions saved to Submission.csv")

Predictions saved to Submission.csv
