# LINEAR REGRESSION FOR TRAFFIC

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn import metrics
import pandas as pd
import numpy as np
import sys

In [2]:
own_eqn_df = pd.read_csv('own_eqn_subdataset.csv')
own_eqn_df

Unnamed: 0,e,day,time,location,traffic_density
0,2.220446e-16,0.05,1.0,0.1,0.383333
1,2.220446e-16,0.05,1.0,0.1,0.383333
2,2.220446e-16,0.05,0.1,1.0,0.383333
3,2.220446e-16,1.00,0.1,1.0,0.700000
4,2.220446e-16,0.05,1.0,0.1,0.383333
...,...,...,...,...,...
54125939,2.220446e-16,1.00,0.1,0.1,0.400000
54125940,2.220446e-16,0.05,0.1,0.1,0.083333
54125941,2.220446e-16,1.00,0.1,0.1,0.400000
54125942,2.220446e-16,0.05,0.1,0.1,0.083333


In [3]:
traffic_labels = own_eqn_df[['traffic_density']]
traffic_labels

Unnamed: 0,traffic_density
0,0.383333
1,0.383333
2,0.383333
3,0.700000
4,0.383333
...,...
54125939,0.400000
54125940,0.083333
54125941,0.400000
54125942,0.083333


In [4]:
del own_eqn_df['traffic_density']
own_eqn_df

Unnamed: 0,e,day,time,location
0,2.220446e-16,0.05,1.0,0.1
1,2.220446e-16,0.05,1.0,0.1
2,2.220446e-16,0.05,0.1,1.0
3,2.220446e-16,1.00,0.1,1.0
4,2.220446e-16,0.05,1.0,0.1
...,...,...,...,...
54125939,2.220446e-16,1.00,0.1,0.1
54125940,2.220446e-16,0.05,0.1,0.1
54125941,2.220446e-16,1.00,0.1,0.1
54125942,2.220446e-16,0.05,0.1,0.1


https://www.kdnuggets.com/2019/03/beginners-guide-linear-regression-python-scikit-learn.html

In [5]:
X_train, X_test, y_train, y_test = train_test_split(own_eqn_df, traffic_labels, test_size=0.2, random_state=42)

print("Length of X_train:", len(X_train))
print("Lenth of X_test:", len(X_test))

Length of X_train: 43300755
Lenth of X_test: 10825189


In [6]:
sgdr = SGDRegressor()
sgdr

SGDRegressor()

In [7]:
sgdr.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SGDRegressor()

In [8]:
score = sgdr.score(X_train, y_train)
print(score)

0.9999994853549009


In [9]:
sgdr.get_params()

{'alpha': 0.0001,
 'average': False,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.01,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'invscaling',
 'loss': 'squared_error',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'penalty': 'l2',
 'power_t': 0.25,
 'random_state': None,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [10]:
weights = sgdr.coef_
bias = sgdr.intercept_

print('Weights:', weights)
print('Bias:', bias)

Weights: [8.32199613e-23 3.33126264e-01 3.33042499e-01 3.33102844e-01]
Bias: [0.00025911]


### Generating new values for Traffic Density

#### Importing checkpoint 5

In [None]:
#Loading the checkpoint5 
checkpoint5 = pd.read_csv('../checkpoint-5.csv')
checkpoint5.head(5)

In [None]:
#converting the weekend column from boolean to binary
checkpoint5.replace({False: 0, True: 1}, inplace=True)
checkpoint5

#### Generating Traffic Density using the new weights

In [13]:
#function to calculate the equation
def calc_traffic_density(weekend, public_hol, peak_hr, hotspot, weights):
    #defining the weights and the small constant
    WEIGHT_A = weights[1]
    WEIGHT_B = weights[2]
    WEIGHT_C = weights[3]
    EPSILON = sys.float_info.epsilon
    
    #Getting the Unweighted values
    day = max((weekend or public_hol),0.05)
    time = max(peak_hr,0.1)
    place = max(hotspot,0.1)
    
    #Getting the Weighted Values
    weighted_day = WEIGHT_A * day
    weighted_time = WEIGHT_B * time
    weighted_place = WEIGHT_C * place
    
    #Summing up all the variables
    traffic = weighted_day + weighted_place + weighted_time + EPSILON
    
    return traffic

In [16]:
#Iterating through the dataset
traffic_density = []
for index, row in checkpoint5.iterrows():
    traffic_value = calc_traffic_density(weekend=row['weekend'], public_hol=row['holiday'], peak_hr=row['peak_hour'], hotspot=row['hotspot'], weights=weights)
    traffic_density.insert(index,traffic_value)

In [17]:
traffic_density

[0.3830090964479443,
 0.3830090964479443,
 0.3830634073985316,
 0.6995333582836794,
 0.3830090964479443,
 0.08327084751698283,
 0.08327084751698283,
 0.3830090964479443,
 0.3830634073985316,
 0.08327084751698283,
 0.3997407984021306,
 0.3997407984021306,
 0.08327084751698283,
 0.3830090964479443,
 0.6828016563294931,
 0.3830090964479443,
 0.08327084751698283,
 0.3997407984021306,
 0.08327084751698283,
 0.08327084751698283,
 0.3830090964479443,
 0.08327084751698283,
 0.6828016563294931,
 0.3830090964479443,
 0.3830634073985316,
 0.3997407984021306,
 0.3997407984021306,
 0.6828016563294931,
 0.3997407984021306,
 0.6994790473330921,
 0.3830634073985316,
 0.08327084751698283,
 0.3997407984021306,
 0.3830090964479443,
 0.08327084751698283,
 0.3997407984021306,
 0.08327084751698283,
 0.6828016563294931,
 0.3997407984021306,
 0.6994790473330921,
 0.3830634073985316,
 0.6994790473330921,
 0.08327084751698283,
 0.08327084751698283,
 0.6828016563294931,
 0.3830634073985316,
 0.3830634073985316,


In [18]:
traffic_density_df = pd.DataFrame(traffic_density)
traffic_density_df.to_csv('SGDR_traffic_density.csv')

In [19]:
checkpoint5_sgdr_traffic = checkpoint5.join(traffic_density_df)
checkpoint5_sgdr_traffic

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,time,day,weekend,holiday,peak_hour,hotspot,dist,0
0,4.5,-73.844311,40.721319,-73.841610,40.712278,1,17:26:21,0,0,0,1,0,1.030764,0.383009
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1,16:52:16,1,0,0,1,0,8.450134,0.383009
2,5.7,-73.982738,40.761270,-73.991242,40.750562,2,00:35:00,3,0,0,0,1,1.389525,0.383063
3,7.7,-73.987130,40.733143,-73.991567,40.758092,1,04:30:42,5,1,0,0,1,2.799270,0.699533
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1,07:51:00,1,0,0,1,0,1.999157,0.383009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54125939,14.0,-74.005272,40.740027,-73.963280,40.762555,1,03:28:00,5,1,0,0,0,4.334397,0.399741
54125940,4.2,-73.957784,40.765530,-73.951640,40.773959,1,20:46:20,1,0,0,0,0,1.070590,0.083271
54125941,14.1,-73.970505,40.752325,-73.960537,40.797342,1,22:04:24,5,1,0,0,0,5.075548,0.399741
54125942,28.9,-73.980901,40.764629,-73.870605,40.773963,1,05:57:51,2,0,0,0,0,9.346157,0.083271


In [20]:
checkpoint5_sgdr_traffic.to_csv('checkpoint5_SGDR_traffic.csv', index=False)