# TRAFFIC FLOW PREDICTION

## Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
df= pd.read_csv("Traffic.csv")
df

Unnamed: 0,Time,Date,Day of the week,CarCount,BikeCount,BusCount,TruckCount,Total,Traffic Situation
0,12:00:00 AM,10,Tuesday,31,0,4,4,39,low
1,12:15:00 AM,10,Tuesday,49,0,3,3,55,low
2,12:30:00 AM,10,Tuesday,46,0,3,6,55,low
3,12:45:00 AM,10,Tuesday,51,0,2,5,58,low
4,1:00:00 AM,10,Tuesday,57,6,15,16,94,normal
...,...,...,...,...,...,...,...,...,...
2971,10:45:00 PM,9,Thursday,16,3,1,36,56,normal
2972,11:00:00 PM,9,Thursday,11,0,1,30,42,normal
2973,11:15:00 PM,9,Thursday,15,4,1,25,45,normal
2974,11:30:00 PM,9,Thursday,16,5,0,27,48,normal


In [3]:
df.head()

Unnamed: 0,Time,Date,Day of the week,CarCount,BikeCount,BusCount,TruckCount,Total,Traffic Situation
0,12:00:00 AM,10,Tuesday,31,0,4,4,39,low
1,12:15:00 AM,10,Tuesday,49,0,3,3,55,low
2,12:30:00 AM,10,Tuesday,46,0,3,6,55,low
3,12:45:00 AM,10,Tuesday,51,0,2,5,58,low
4,1:00:00 AM,10,Tuesday,57,6,15,16,94,normal


In [4]:
df.shape

(2976, 9)

In [5]:
df.tail()

Unnamed: 0,Time,Date,Day of the week,CarCount,BikeCount,BusCount,TruckCount,Total,Traffic Situation
2971,10:45:00 PM,9,Thursday,16,3,1,36,56,normal
2972,11:00:00 PM,9,Thursday,11,0,1,30,42,normal
2973,11:15:00 PM,9,Thursday,15,4,1,25,45,normal
2974,11:30:00 PM,9,Thursday,16,5,0,27,48,normal
2975,11:45:00 PM,9,Thursday,14,3,1,15,33,normal


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2976 entries, 0 to 2975
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Time               2976 non-null   object
 1   Date               2976 non-null   int64 
 2   Day of the week    2976 non-null   object
 3   CarCount           2976 non-null   int64 
 4   BikeCount          2976 non-null   int64 
 5   BusCount           2976 non-null   int64 
 6   TruckCount         2976 non-null   int64 
 7   Total              2976 non-null   int64 
 8   Traffic Situation  2976 non-null   object
dtypes: int64(6), object(3)
memory usage: 209.4+ KB


In [7]:
df.isnull().sum()

Time                 0
Date                 0
Day of the week      0
CarCount             0
BikeCount            0
BusCount             0
TruckCount           0
Total                0
Traffic Situation    0
dtype: int64

In [8]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
2971    False
2972    False
2973    False
2974    False
2975    False
Length: 2976, dtype: bool

In [9]:
df.columns

Index(['Time', 'Date', 'Day of the week', 'CarCount', 'BikeCount', 'BusCount',
       'TruckCount', 'Total', 'Traffic Situation'],
      dtype='object')

In [10]:
df.describe()

Unnamed: 0,Date,CarCount,BikeCount,BusCount,TruckCount,Total
count,2976.0,2976.0,2976.0,2976.0,2976.0,2976.0
mean,16.0,68.696573,14.917339,15.27957,15.324933,114.218414
std,8.945775,45.850693,12.847518,14.341986,10.603833,60.190627
min,1.0,6.0,0.0,0.0,0.0,21.0
25%,8.0,19.0,5.0,1.0,6.0,55.0
50%,16.0,64.0,12.0,12.0,14.0,109.0
75%,24.0,107.0,22.0,25.0,23.0,164.0
max,31.0,180.0,70.0,50.0,40.0,279.0


In [11]:
df.value_counts

<bound method DataFrame.value_counts of              Time  Date Day of the week  CarCount  BikeCount  BusCount  \
0     12:00:00 AM    10         Tuesday        31          0         4   
1     12:15:00 AM    10         Tuesday        49          0         3   
2     12:30:00 AM    10         Tuesday        46          0         3   
3     12:45:00 AM    10         Tuesday        51          0         2   
4      1:00:00 AM    10         Tuesday        57          6        15   
...           ...   ...             ...       ...        ...       ...   
2971  10:45:00 PM     9        Thursday        16          3         1   
2972  11:00:00 PM     9        Thursday        11          0         1   
2973  11:15:00 PM     9        Thursday        15          4         1   
2974  11:30:00 PM     9        Thursday        16          5         0   
2975  11:45:00 PM     9        Thursday        14          3         1   

      TruckCount  Total Traffic Situation  
0              4     39    

In [12]:
df.size

26784

In [13]:
df.sum()

Time                 12:00:00 AM12:15:00 AM12:30:00 AM12:45:00 AM1:...
Date                                                             47616
Day of the week      TuesdayTuesdayTuesdayTuesdayTuesdayTuesdayTues...
CarCount                                                        204441
BikeCount                                                        44394
BusCount                                                         45472
TruckCount                                                       45607
Total                                                           339914
Traffic Situation    lowlowlowlownormallowlowlowlowlowlowlowlowlowh...
dtype: object

## Model

In [14]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [15]:
#To change, the times from categorical to numerical in 24hr format
df['Time'] = pd.to_datetime(df['Time']).dt.hour * 60 + pd.to_datetime(df['Time']).dt.minute
df['Time'] = df['Time'] / 60.0

  df['Time'] = pd.to_datetime(df['Time']).dt.hour * 60 + pd.to_datetime(df['Time']).dt.minute
  df['Time'] = pd.to_datetime(df['Time']).dt.hour * 60 + pd.to_datetime(df['Time']).dt.minute


In [16]:
df.head()

Unnamed: 0,Time,Date,Day of the week,CarCount,BikeCount,BusCount,TruckCount,Total,Traffic Situation
0,0.0,10,Tuesday,31,0,4,4,39,low
1,0.25,10,Tuesday,49,0,3,3,55,low
2,0.5,10,Tuesday,46,0,3,6,55,low
3,0.75,10,Tuesday,51,0,2,5,58,low
4,1.0,10,Tuesday,57,6,15,16,94,normal


In [17]:
missing_percentage = (df.isnull().sum() / len(df)) * 100
missing_percentage

Time                 0.0
Date                 0.0
Day of the week      0.0
CarCount             0.0
BikeCount            0.0
BusCount             0.0
TruckCount           0.0
Total                0.0
Traffic Situation    0.0
dtype: float64

In [18]:
day_of_week_mapping = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 7}
df['Day of the week'].replace(day_of_week_mapping, inplace=True)

traffic_situation_mapping = {'low': 0, 'normal': 1, 'high': 2, 'heavy':3}
df['Traffic Situation'].replace(traffic_situation_mapping, inplace=True)

In [19]:
df.head()

Unnamed: 0,Time,Date,Day of the week,CarCount,BikeCount,BusCount,TruckCount,Total,Traffic Situation
0,0.0,10,2,31,0,4,4,39,0
1,0.25,10,2,49,0,3,3,55,0
2,0.5,10,2,46,0,3,6,55,0
3,0.75,10,2,51,0,2,5,58,0
4,1.0,10,2,57,6,15,16,94,1


In [20]:
# Define X (features) and y (target)
X = df.drop(columns=['Traffic Situation'])  # Features
y = df['Traffic Situation']  # Target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply feature selection using SelectKBest with chi-squared test

k =6 # Replace 'k' with the desired number of features to select
selector = SelectKBest(score_func=chi2, k=k)
X_new = selector.fit_transform(X_train, y_train)
print(X_new)
# Get the indices of the selected features
selected_feature_indices = selector.get_support(indices=True)

# Print the selected feature indices
selected_feature_indices

# Now, 'X_new' contains the selected features.

[[  5.25  64.    27.     5.     5.   101.  ]
 [  7.25 150.    13.    31.     4.   198.  ]
 [  7.   134.    10.    44.     0.   188.  ]
 ...
 [ 18.5  116.    22.    23.     7.   168.  ]
 [ 11.5   40.     9.    15.    28.    92.  ]
 [ 23.    20.     1.     0.    27.    48.  ]]


array([0, 3, 4, 5, 6, 7])

In [21]:
df.columns

Index(['Time', 'Date', 'Day of the week', 'CarCount', 'BikeCount', 'BusCount',
       'TruckCount', 'Total', 'Traffic Situation'],
      dtype='object')

In [22]:
X = df[['Time','CarCount', 'BusCount','BikeCount','TruckCount','Total']]
y = df['Traffic Situation']

In [23]:
from sklearn.metrics import accuracy_score,mean_absolute_error,mean_squared_error,precision_score, recall_score, f1_score,confusion_matrix,classification_report

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
#RandomForest classification

RForest_clf = RandomForestClassifier(n_estimators = 100)
RForest_clf.fit(X_train, y_train)
y_pred1 = RForest_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred1)*100
print(f'Accuracy: {accuracy}')

Accuracy: 98.99328859060402


In [26]:
#OUTPUT.............

y=RForest_clf.predict( X_test)
print(y)   #{'low': 0, 'normal': 1, 'high': 2, 'heavy':3}

[0 2 1 2 1 0 1 1 1 1 3 1 0 1 0 2 2 1 1 1 1 1 3 1 3 1 3 1 3 2 1 1 1 0 1 3 2
 0 1 3 1 1 1 1 3 1 1 1 0 0 1 3 1 1 0 1 1 1 1 1 1 3 1 1 0 1 1 1 2 1 0 1 0 1
 1 0 1 2 2 1 1 1 3 1 3 1 1 3 3 3 1 1 2 1 3 3 3 1 1 1 1 1 3 1 1 0 3 1 1 1 3
 1 2 1 3 3 1 3 0 0 2 0 3 0 1 1 0 1 3 0 2 2 1 3 3 1 1 1 1 3 1 2 3 2 3 2 3 3
 0 3 1 3 1 0 0 2 1 1 1 2 3 1 0 2 3 3 3 1 1 1 1 1 1 1 1 2 3 1 1 1 2 0 1 1 1
 1 3 1 3 2 1 1 3 3 1 1 0 3 1 1 1 0 0 1 1 1 0 3 1 1 1 1 1 3 1 3 1 1 1 1 2 1
 1 0 3 1 1 1 1 3 3 1 3 3 1 1 3 3 3 3 3 3 1 3 1 1 1 3 2 1 1 1 0 3 1 1 2 3 2
 3 1 1 1 1 2 1 1 1 1 3 3 3 2 0 0 1 3 1 3 1 1 1 1 1 2 1 2 1 0 3 1 3 2 3 1 1
 2 1 3 3 1 1 2 1 0 3 1 3 1 1 0 1 3 1 1 1 1 1 0 3 3 1 1 1 2 0 3 1 0 1 1 1 3
 3 1 3 0 1 1 2 3 2 2 1 0 2 1 1 1 1 1 1 1 1 1 0 1 1 1 1 3 1 1 2 0 3 3 1 1 2
 1 1 0 1 2 1 3 1 1 1 2 0 1 3 3 1 1 1 1 1 1 1 3 1 3 1 2 0 3 1 3 2 3 1 3 1 2
 2 1 3 0 1 1 1 3 1 1 1 1 1 1 3 1 3 3 1 1 1 1 1 0 1 1 1 1 0 1 3 3 1 1 1 1 1
 1 1 0 2 3 1 0 1 1 3 1 1 1 3 1 1 1 1 3 1 0 1 1 1 2 1 1 1 3 2 3 1 3 1 2 1 1
 1 1 3 3 1 2 1 1 3 2 1 1 

In [27]:
#XGBoost classification

xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train,y_train)
y_pred = xgb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)*100
print(f'Accuracy: {accuracy}')


Accuracy: 100.0


In [28]:
#Output..
y_pred = xgb_classifier.predict( X_test)
print(y)   #{'low': 0, 'normal': 1, 'high': 2, 'heavy':3}

[0 2 1 2 1 0 1 1 1 1 3 1 0 1 0 2 2 1 1 1 1 1 3 1 3 1 3 1 3 2 1 1 1 0 1 3 2
 0 1 3 1 1 1 1 3 1 1 1 0 0 1 3 1 1 0 1 1 1 1 1 1 3 1 1 0 1 1 1 2 1 0 1 0 1
 1 0 1 2 2 1 1 1 3 1 3 1 1 3 3 3 1 1 2 1 3 3 3 1 1 1 1 1 3 1 1 0 3 1 1 1 3
 1 2 1 3 3 1 3 0 0 2 0 3 0 1 1 0 1 3 0 2 2 1 3 3 1 1 1 1 3 1 2 3 2 3 2 3 3
 0 3 1 3 1 0 0 2 1 1 1 2 3 1 0 2 3 3 3 1 1 1 1 1 1 1 1 2 3 1 1 1 2 0 1 1 1
 1 3 1 3 2 1 1 3 3 1 1 0 3 1 1 1 0 0 1 1 1 0 3 1 1 1 1 1 3 1 3 1 1 1 1 2 1
 1 0 3 1 1 1 1 3 3 1 3 3 1 1 3 3 3 3 3 3 1 3 1 1 1 3 2 1 1 1 0 3 1 1 2 3 2
 3 1 1 1 1 2 1 1 1 1 3 3 3 2 0 0 1 3 1 3 1 1 1 1 1 2 1 2 1 0 3 1 3 2 3 1 1
 2 1 3 3 1 1 2 1 0 3 1 3 1 1 0 1 3 1 1 1 1 1 0 3 3 1 1 1 2 0 3 1 0 1 1 1 3
 3 1 3 0 1 1 2 3 2 2 1 0 2 1 1 1 1 1 1 1 1 1 0 1 1 1 1 3 1 1 2 0 3 3 1 1 2
 1 1 0 1 2 1 3 1 1 1 2 0 1 3 3 1 1 1 1 1 1 1 3 1 3 1 2 0 3 1 3 2 3 1 3 1 2
 2 1 3 0 1 1 1 3 1 1 1 1 1 1 3 1 3 3 1 1 1 1 1 0 1 1 1 1 0 1 3 3 1 1 1 1 1
 1 1 0 2 3 1 0 1 1 3 1 1 1 3 1 1 1 1 3 1 0 1 1 1 2 1 1 1 3 2 3 1 3 1 2 1 1
 1 1 3 3 1 2 1 1 3 2 1 1 

### Metrics

In [29]:
from sklearn.metrics import accuracy_score,mean_absolute_error,mean_squared_error,precision_score, recall_score, f1_score,confusion_matrix,classification_report

In [30]:
#accuracy score for Randormforest classification

data_accuracy = accuracy_score(y_test,y_pred1)
print("Accuracy:", data_accuracy)

#confusion matrix for Randormforest classification

conf_matrix = confusion_matrix(y_test,y_pred1)
print('confusion matrix',conf_matrix)

#precision score for Randormforest classification

precision = precision_score(y_test, y_pred1,average='macro')
print("Precision:", precision)

#Classification Report for Randormforest classification

print('\nClassification Report for Randormforest classification:')
print("\n")
print(classification_report(y_test, y_pred1))

Accuracy: 0.9899328859060402
confusion matrix [[ 66   0   0   0]
 [  0 336   2   2]
 [  0   2  61   0]
 [  0   0   0 127]]
Precision: 0.9867082331304156

Classification Report for Randormforest classification:


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        66
           1       0.99      0.99      0.99       340
           2       0.97      0.97      0.97        63
           3       0.98      1.00      0.99       127

    accuracy                           0.99       596
   macro avg       0.99      0.99      0.99       596
weighted avg       0.99      0.99      0.99       596



In [31]:
#accuracy score for XGB classification

data_accuracy = accuracy_score(y_test,y_pred)
print("Accuracy:", data_accuracy)

#confusion matrix for XGB classification

conf_matrix = confusion_matrix(y_test,y_pred)
print('confusion matrix',conf_matrix)

#precision score for XGB classification

precision = precision_score(y_test, y_pred,average='macro')
print("Precision:", precision)

#Classification Report for XGB classification

print('\nClassification Report for XGB classification:')
print("\n")
print(classification_report(y_test, y_pred))

Accuracy: 1.0
confusion matrix [[ 66   0   0   0]
 [  0 340   0   0]
 [  0   0  63   0]
 [  0   0   0 127]]
Precision: 1.0

Classification Report for XGB classification:


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        66
           1       1.00      1.00      1.00       340
           2       1.00      1.00      1.00        63
           3       1.00      1.00      1.00       127

    accuracy                           1.00       596
   macro avg       1.00      1.00      1.00       596
weighted avg       1.00      1.00      1.00       596



In [72]:
file='traffic_pred.pkl'

In [73]:
pickle.dump(xgb_classifier,open(file,'wb'))

In [74]:
loaded_model=pickle.load(open('traffic_pred.pkl','rb'))

In [75]:
X_test.columns

Index(['Time', 'CarCount', 'BusCount', 'BikeCount', 'TruckCount', 'Total'], dtype='object')

In [76]:
loaded_model