In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
from sklearn.preprocessing import QuantileTransformer, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

In [None]:
# Display settings
# Passing None removes the cap on how many DataFrame columns are displayed.
pd.set_option('display.max_columns', None)
# Apply seaborn's "whitegrid" style to the plots drawn later in the notebook.
sns.set(style="whitegrid")

In [None]:
# Load the two traffic datasets (Traffic.csv and TrafficTwoMonth.csv) from the
# Kaggle input directory.
traffic_df = pd.read_csv('/kaggle/input/traffic-11/Traffic.csv')
traffic_two_month_df = pd.read_csv('/kaggle/input/traffic-22/TrafficTwoMonth.csv')

In [None]:
# Preview the first three rows of the Traffic.csv frame.
traffic_df.head(3)

In [None]:
# Count how many rows fall into each 'Traffic Situation' label (class balance check).
traffic_df['Traffic Situation'].value_counts()

In [None]:
# Combine datasets
# Tag every row with the file it came from so the origin survives the concat.
traffic_df['Source'] = 'OneMonth'
traffic_two_month_df['Source'] = 'TwoMonth'
# Stack the two frames vertically; ignore_index=True gives the result a fresh 0..n-1 index.
# NOTE(review): if the two files cover overlapping periods, the same observation
# may appear twice with different 'Source' values — confirm against the data.
combined_df = pd.concat([traffic_df, traffic_two_month_df], ignore_index=True)

In [None]:
# (rows, columns) of the combined frame.
combined_df.shape

In [None]:
# Distribution of vehicle counts for cars, bikes, buses, and trucks:
# one histogram per vehicle type, laid out on a 2x2 grid.
fig = make_subplots(rows=2, cols=2, subplot_titles=("Car Counts", "Bike Counts", "Bus Counts", "Truck Counts"))

# (source column, trace name, bar colour, grid row, grid column)
histogram_specs = [
    ('CarCount', 'Car Counts', '#1f77b4', 1, 1),
    ('BikeCount', 'Bike Counts', '#ff7f0e', 1, 2),
    ('BusCount', 'Bus Counts', '#2ca02c', 2, 1),
    ('TruckCount', 'Truck Counts', '#d62728', 2, 2),
]
for column, trace_name, colour, grid_row, grid_col in histogram_specs:
    fig.add_trace(
        go.Histogram(x=combined_df[column], name=trace_name, marker_color=colour),
        row=grid_row,
        col=grid_col,
    )

# Shared cosmetics: centred title, no legend, same axis labels on every panel.
fig.update_layout(title_text='Distribution of Vehicle Counts', title_x=0.5, showlegend=False, template='plotly_white')
fig.update_xaxes(title_text="Count")
fig.update_yaxes(title_text="Frequency")
fig.show()

In [None]:
# Distribution of traffic situations
# Pie chart with one slice per distinct 'Traffic Situation' value in the combined data.
fig = px.pie(combined_df, names='Traffic Situation', title='Traffic Situation Distribution', color_discrete_sequence=px.colors.sequential.RdBu)
# Re-states the same title text and centres it (title_x=0.5).
fig.update_layout(title_text='Traffic Situation Distribution', title_x=0.5, template='plotly_white')
fig.show()

In [None]:
# Correlations between different vehicle types
# Pairwise correlation of the four per-vehicle counts and the 'Total' column.
corr_matrix = combined_df[['CarCount', 'BikeCount', 'BusCount', 'TruckCount', 'Total']].corr()
# annot=True writes each coefficient inside its cell.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix of Vehicle Counts')
plt.show()

In [None]:
# Remove outliers using IQR method
def remove_outliers(df, columns, factor=1.5):
    """Drop rows whose value in any of ``columns`` lies outside the IQR fences.

    Columns are processed one after another: the quartiles for each column are
    computed on the rows that survived the previous columns' filters, so the
    result can depend on the order of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a filtered frame is returned.
    columns : iterable of str
        Numeric columns to filter on. If empty, ``df`` is returned unchanged.
    factor : float, default 1.5
        Multiplier applied to the IQR to place the fences at
        ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both inclusive).

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that fall inside the fences for every column. Rows
        with a missing value in a filtered column are dropped, because the
        range comparison is False for NaN.
    """
    for col in columns:
        q1, q3 = df[col].quantile([0.25, 0.75])
        spread = q3 - q1
        # Series.between is inclusive on both ends, i.e. (>= lower) & (<= upper).
        df = df[df[col].between(q1 - factor * spread, q3 + factor * spread)]
    return df
# Columns holding the per-vehicle-type counts; reused by the normalisation cell.
vehicle_cols = ['CarCount', 'BikeCount', 'BusCount', 'TruckCount']
# The filtered result is a boolean-indexed slice of the original frame. Later
# cells assign columns into combined_df, so take an explicit copy to make it an
# independent frame and avoid pandas' SettingWithCopyWarning.
# Note: this rebinds combined_df, so re-running the cell filters the already
# filtered data again.
combined_df = remove_outliers(combined_df, vehicle_cols).copy()

In [None]:
# Check for missing values and duplicates
# Per-column count of null entries in the outlier-filtered frame.
print("Missing values in each column:")
print(combined_df.isnull().sum())

# duplicated() flags rows that repeat an earlier row across ALL columns
# (including 'Source'), so the sum is the number of exact repeats.
print(f"Number of duplicate rows: {combined_df.duplicated().sum()}")

In [None]:
# Normalize vehicle count columns: map each column's empirical distribution
# onto a normal distribution, overwriting the raw counts in place.
# random_state pins the transformer's internal subsampling (used on large
# inputs) so a rerun produces the same values, matching the fixed seeds used
# elsewhere in this notebook.
# NOTE(review): the transformer is fitted on the whole dataset before the
# train/test split, so test rows influence the learned quantiles; consider
# fitting it inside the modelling pipeline instead.
scaler = QuantileTransformer(output_distribution='normal', random_state=42)
combined_df[vehicle_cols] = scaler.fit_transform(combined_df[vehicle_cols])

In [None]:
# Check distribution after normalization: one histogram + KDE per vehicle
# column, arranged on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (column, panel title, colour), listed in row-major panel order.
panel_specs = [
    ('CarCount', 'Normalized Car Count', '#1f77b4'),
    ('BikeCount', 'Normalized Bike Count', '#ff7f0e'),
    ('BusCount', 'Normalized Bus Count', '#2ca02c'),
    ('TruckCount', 'Normalized Truck Count', '#d62728'),
]
# axes.flat walks the grid as [0, 0], [0, 1], [1, 0], [1, 1].
for panel, (column, panel_title, colour) in zip(axes.flat, panel_specs):
    sns.histplot(combined_df[column], ax=panel, kde=True, color=colour)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Feature engineering: derive calendar/time features from the raw columns.
# errors='coerce' turns unparseable entries into NaT, which yields NaN features.
# NOTE(review): this assumes 'Date' holds full, parseable dates. If it only
# stores a day-of-month number, Day/Month will not be meaningful — confirm.
parsed_time = pd.to_datetime(combined_df['Time'], errors='coerce')
parsed_date = pd.to_datetime(combined_df['Date'], errors='coerce')

combined_df['Hour'] = parsed_time.dt.hour
combined_df['Day'] = parsed_date.dt.day
combined_df['Month'] = parsed_date.dt.month

# 1 when the row falls on Friday or Saturday, else 0.
# NOTE(review): treats Friday/Saturday as the weekend — confirm this matches
# the region the data was collected in.
weekend_days = ['Friday', 'Saturday']
combined_df['Weekend'] = combined_df['Day of the week'].isin(weekend_days).astype(int)

In [None]:
# Features
# Columns used as numeric model inputs: the four (normalised) vehicle counts,
# 'Total', and the features derived in the feature-engineering cell.
numerical_features= ['CarCount', 'BikeCount', 'BusCount', 'TruckCount', 'Total', 'Hour', 'Day', 'Month', 'Weekend']
# Columns used as categorical model inputs.
categorical_features = ['Day of the week', 'Source']

In [None]:
# Target encoding
# Encode the 'Traffic Situation' labels as integer codes; the original label
# text stays available through label_encoder.classes_.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(combined_df['Traffic Situation'])
# Feature matrix restricted to the selected numeric + categorical columns.
X = combined_df[numerical_features + categorical_features]

In [None]:
# Split the data into training (80%) and testing (20%) sets.
# stratify=y keeps every traffic-situation class at the same proportion in both
# splits, so rarer classes are not under-represented (or missing) in the test
# set that the evaluation cells rely on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

**MLP**

In [None]:
# 1. Separate features and target
# NOTE(review): these two assignments overwrite the X / y built in the
# target-encoding cell: X now holds every column except the target, and y holds
# the raw (un-encoded) labels. The cells visible below train and evaluate on
# X_train / X_test / y_train / y_test from the earlier split, not on this X / y,
# so within this section the reassignment has no effect on the model.
X = combined_df.drop(columns=['Traffic Situation'])
y = combined_df['Traffic Situation']

In [None]:
# 3. Identify categorical and numerical columns
categorical_cols = ['Day of the week', 'Source']
# Every X_train column that is not categorical is treated as numeric.
numerical_cols = [col for col in X_train.columns if col not in categorical_cols]

In [None]:
# 4. OPTIONAL: remove 'Date' if it's in numerical_cols and not in X_train
# NOTE(review): numerical_cols is derived from X_train.columns in the previous
# cell, so when the cells run in order this condition can never be true and the
# branch is a no-op.
if 'Date' in numerical_cols and 'Date' not in X_train.columns:
    numerical_cols.remove('Date')

In [None]:
# 5. Build preprocessing pipeline
# Numeric columns are standardised; categorical columns are one-hot encoded.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at predict time.
# NOTE(review): the sparse_output argument needs a reasonably recent
# scikit-learn (older releases call it `sparse`) — confirm the environment.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_cols)
    ]
)

In [None]:
# 6. Define MLP Classifier
# max_iter caps the number of training iterations; random_state fixes the seed
# so reruns are reproducible.
mlp = MLPClassifier(max_iter=2000, random_state=42)

# 7. Create full pipeline
# Preprocessing is part of the estimator, so it is re-fitted on the training
# portion of each cross-validation fold during the grid search.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', mlp)
])

In [None]:
# 8. Define parameter grid for GridSearchCV
# Keys use the '<pipeline step>__<parameter>' form to reach the MLP inside the pipeline.
# 'classifier__learning_rate' is deliberately not searched: MLPClassifier only
# uses that setting with solver='sgd', so with 'adam' the two values
# ('constant', 'adaptive') doubled the number of fits without changing any model.
param_grid = {
    'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'classifier__activation': ['relu', 'tanh'],
    'classifier__solver': ['adam'],
    'classifier__alpha': [0.0001, 0.001],
}

In [None]:
# 9. Debugging prints (optional)
# Sanity check that the column lists handed to the preprocessor match the
# columns actually present in X_train.
print("X_train columns:", X_train.columns.tolist())
print("Numerical columns:", numerical_cols)
print("Categorical columns:", categorical_cols)

In [None]:

# 10. Run GridSearchCV
# 3-fold cross-validation over every combination in param_grid, scored by
# accuracy; n_jobs=-1 runs the fits in parallel on all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# Print best parameters
# best_params_ is the combination with the highest mean cross-validated accuracy.
print("Best parameters found:")
print(grid_search.best_params_)

In [None]:

# Evaluate on test set
y_pred = grid_search.predict(X_test)

# y_test / y_pred hold the integer codes produced by label_encoder. Pass the
# full code range together with the decoded names so the report rows read as
# traffic situations, and so the matrix rows/columns follow the same class
# order as label_encoder.classes_.
class_codes = np.arange(len(label_encoder.classes_))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_codes, target_names=label_encoder.classes_))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=class_codes))

In [None]:
# Check for missing values in numerical columns
missing_values = combined_df[numerical_cols].isnull().sum()
print("Missing values per column:")
print(missing_values)

# Check for non-numeric values in supposed numeric columns
# The float cast is attempted only to see whether it raises; its result is
# discarded, so combined_df is not modified.
for col in numerical_cols:
    try:
        combined_df[col].astype(float)
    except ValueError:
        print(f"Non-numeric values found in column: {col}")

In [None]:


# Predict on test data using the best estimator
# NOTE(review): with GridSearchCV's default refit behaviour, grid_search.predict
# already delegates to best_estimator_, so this should reproduce the y_pred
# computed in the evaluation cell above.
y_pred = grid_search.best_estimator_.predict(X_test)

In [None]:


# Evaluate the model performance with various metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# average='weighted' computes each metric per class and averages them weighted
# by the number of true test samples in each class.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Generate confusion matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)

# Get the original class labels from LabelEncoder
# NOTE(review): the tick labels only line up with the matrix if every encoded
# class appears in y_test or y_pred — confirm.
class_labels = label_encoder.classes_

# Plot the heatmap with dynamic class labels
plt.figure(figsize=(8, 6))
# fmt='d' prints each cell as an integer count.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

In [None]:

from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score

# Get the predicted probabilities from the trained MLP model
y_score = grid_search.best_estimator_.predict_proba(X_test)

# Binarize the output labels (required for multi-class AUC-ROC).
# Use the classes the model was fitted on rather than np.unique(y_test):
# predict_proba emits one column per fitted class, in that order, so the
# columns of y_test_bin and y_score always correspond one-to-one.
y_test_bin = label_binarize(y_test, classes=grid_search.best_estimator_.classes_)

# Compute the AUC-ROC score (multi-class, one-vs-rest, weighted by class support)
auc_roc = roc_auc_score(y_test_bin, y_score, average='weighted', multi_class='ovr')

print("AUC-ROC (Multi-class):", auc_roc)