In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline


In [5]:
# Read the dataset from data.csv (resolved against the working directory) into `df`.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [6]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Duration,Distance,PLong,PLatd,DLong,DLatd,Haversine,Pmonth,Pday,...,Dmin,DDweek,Temp,Precip,Wind,Humid,Solar,Snow,GroundTemp,Dust
0,0,3,50,37.544666,126.888359,37.544666,126.888359,0.0,1,1,...,4,0,-3.2,0.0,0.5,40.0,0.0,0.0,-2.2,25.0
1,1,24,7670,37.506199,127.003944,37.55125,127.035103,5.713529,1,1,...,25,0,-3.2,0.0,0.5,40.0,0.0,0.0,-2.2,25.0
2,2,8,1390,37.54459,127.057083,37.537014,127.061096,0.913702,1,1,...,9,0,-3.2,0.0,0.5,40.0,0.0,0.0,-2.2,25.0
3,3,8,1820,37.571102,127.02356,37.561447,127.03492,1.468027,1,1,...,10,0,-3.2,0.0,0.5,40.0,0.0,0.0,-2.2,25.0
4,4,4,850,37.573242,127.015907,37.565849,127.016403,0.823227,1,1,...,6,0,-3.2,0.0,0.5,40.0,0.0,0.0,-2.2,25.0


In [7]:
# Remove the columns that are not used in the rest of the notebook.
# `columns=` is the keyword spelling of `labels=..., axis=1`. The drop is still
# done in place, so re-running this cell on the already-trimmed frame raises
# KeyError; reload `df` first if it needs to be executed again.
df.drop(
    columns=[
        "Unnamed: 0", "Snow", "Precip",
        "PLatd", "Pmonth", "PLong",
        "DLatd", "PDweek", "DLong",
        "Dday", "Dmonth", "DDweek",
        "Temp", "Wind",
    ],
    inplace=True,
)


In [8]:
# Preview the first five rows after the column drop.
df.head(n=5)

Unnamed: 0,Duration,Distance,Haversine,Pday,Phour,Pmin,Dhour,Dmin,Humid,Solar,GroundTemp,Dust
0,3,50,0.0,1,0,0,0,4,40.0,0.0,-2.2,25.0
1,24,7670,5.713529,1,0,0,0,25,40.0,0.0,-2.2,25.0
2,8,1390,0.913702,1,0,0,0,9,40.0,0.0,-2.2,25.0
3,8,1820,1.468027,1,0,1,0,10,40.0,0.0,-2.2,25.0
4,4,850,0.823227,1,0,1,0,6,40.0,0.0,-2.2,25.0


In [9]:
# Keep only the rows whose Dust * Haversine * Solar product is not zero, i.e.
# discard every row in which at least one of the three columns is exactly 0.
# Rows where the product is NaN compare as "not equal to 0" and are kept.
# The index is then renumbered from 0 so it is contiguous again.
df = (
    df.loc[(df["Dust"] * df["Haversine"] * df["Solar"]).ne(0.0)]
      .reset_index(drop=True)
)

In [10]:
# (rows, columns) of `df` at this point in the notebook.
df.shape

(5366130, 12)

In [11]:
# Preview the first five rows of the filtered, re-indexed frame.
df.head(n=5)

Unnamed: 0,Duration,Distance,Haversine,Pday,Phour,Pmin,Dhour,Dmin,Humid,Solar,GroundTemp,Dust
0,72,13620,3.951107,1,9,0,10,12,39.0,0.15,-3.6,28.0
1,4,1130,0.864268,1,9,0,9,5,39.0,0.15,-3.6,28.0
2,30,6850,3.787676,1,9,1,9,31,39.0,0.15,-3.6,28.0
3,4,1080,0.705526,1,9,1,9,6,39.0,0.15,-3.6,28.0
4,3,630,0.540025,1,9,1,9,5,39.0,0.15,-3.6,28.0


### Distribution of Target Variables

In [None]:
# Histogram of the target column with a KDE overlay.
# `sns.distplot` is deprecated (since seaborn 0.11) and scheduled for removal;
# `histplot(..., kde=True, stat="density")` is the documented replacement and
# keeps the density-normalised bars plus smooth curve that distplot drew.
plt.figure(figsize=(9, 8))
sns.histplot(df['Duration'], color='b', bins=100, kde=True, stat='density');

In [None]:
# Horizontal box plot of the target column.
# The Series is passed as `x=` explicitly: a bare positional argument triggers
# a deprecation warning on seaborn 0.11 and is interpreted as `data` on 0.12+,
# which changes how the plot is laid out. The keyword form behaves the same on
# every version.
plt.figure(figsize=(9, 8))
sns.boxplot(x=df['Duration']);

In [None]:
# One histogram per column of `df`, 50 bins each, with small tick labels so the
# grid of subplots stays readable. The trailing `;` hides the array-of-axes repr.
df.hist(
    bins=50,
    figsize=(16, 20),
    xlabelsize=8,
    ylabelsize=8,
);

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [None]:
# Correlation of every column with the target: take the 'Duration' column of
# the full pairwise correlation matrix (pandas' default method).
df_num_corr = df.corr().loc[:, 'Duration']

In [None]:
# Show the correlation of each feature with the target, leaving out the
# target's correlation with itself (always 1.0).
# The previous positional slice `[:-1]` removed the *last* entry instead, but
# 'Duration' is the first column of `df`, so it hid a real feature and kept the
# trivial self-correlation. Dropping by label is independent of column order.
df_num_corr.drop(labels='Duration')

In [12]:
# Target: the 'Duration' column.
y = df['Duration']

# Features: every remaining column except the target itself and 'Pday'.
# NOTE(review): in the rows previewed earlier, (Dhour, Dmin) minus (Phour, Pmin)
# reproduces Duration almost exactly, so keeping both pairs as features looks
# like target leakage - confirm whether these columns are really available at
# prediction time.
X = df.drop(columns=['Duration', 'Pday'])

In [13]:
# (rows, columns) of the feature matrix.
X.shape

(5366130, 10)

In [14]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,Distance,Haversine,Phour,Pmin,Dhour,Dmin,Humid,Solar,GroundTemp,Dust
0,13620,3.951107,9,0,10,12,39.0,0.15,-3.6,28.0
1,1130,0.864268,9,0,9,5,39.0,0.15,-3.6,28.0
2,6850,3.787676,9,1,9,31,39.0,0.15,-3.6,28.0
3,1080,0.705526,9,1,9,6,39.0,0.15,-3.6,28.0
4,630,0.540025,9,1,9,5,39.0,0.15,-3.6,28.0


In [15]:
# Length of the target vector; should match the row count of X.
y.shape

(5366130,)

In [16]:
# NOTE(review): this cell repeats the `y.shape` check from the previous cell
# and can be deleted from the notebook.
y.shape

(5366130,)

In [17]:
# Preview the first five target values.
y.head(n=5)

0    72
1     4
2    30
3     4
4     3
Name: Duration, dtype: int64

## Splitting Data For Training And Testing

In [20]:
# Hold out 25% of the rows for testing. The split operates on plain NumPy
# arrays (not DataFrames), and the fixed random_state makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(),
    y.to_numpy(),
    test_size=0.25,
    random_state=100,
)

In [None]:
# Baseline model: ordinary least squares on the raw features.
# The `normalize=` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises TypeError. For an unpenalised LinearRegression
# it only rescaled the inputs internally and does not change the fitted
# predictions, so it is simply omitted here. (If scaling is needed for a
# regularised model, use a Pipeline with StandardScaler instead.)
lr = LinearRegression()

lr.fit(X_train, y_train)
# Predictions on the held-out split, used by the metrics cell that follows.
pred_lr_value = lr.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error, r2_score

# R^2 of the linear model on the data it was fitted on.
print(f'Training score : {lr.score(X_train, y_train)}')

print()
# Held-out metrics. The MSE is computed once and reused for the RMSE line.
lr_test_mse = mean_squared_error(y_test, pred_lr_value)
print('r2 score:', r2_score(y_test, pred_lr_value))
print('MAE:', mean_absolute_error(y_test, pred_lr_value))
print('MSE:', lr_test_mse)
print('RMSE:', np.sqrt(lr_test_mse))

In [21]:
# Gradient-boosted regressor with library-default hyperparameters.
#
# The estimator is built from the `xgboost` package directly rather than from
# the `xgb` alias: this cell rebinds the name `xgb` to the fitted model (the
# following cells read it under that name), so the original
# `xgb = xgb.XGBRegressor()` only worked on the first execution. On any re-run
# `xgb` was already a regressor instance, not the module, and the attribute
# lookup failed. Importing the package here keeps the cell re-runnable.
import xgboost

xgb = xgboost.XGBRegressor()

xgb.fit(X_train, y_train)
# Predictions on the held-out split, used by the metrics cell that follows.
pred_xg_value = xgb.predict(X_test)

In [22]:
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error, r2_score

# R^2 of the boosted model on the data it was fitted on.
print(f'Training score : {xgb.score(X_train, y_train)}')

print()
# Held-out metrics. The MSE is computed once and reused for the RMSE line.
xg_test_mse = mean_squared_error(y_test, pred_xg_value)
print('r2 score:', r2_score(y_test, pred_xg_value))
print('MAE:', mean_absolute_error(y_test, pred_xg_value))
print('MSE:', xg_test_mse)
print('RMSE:', np.sqrt(xg_test_mse))

Training score : 0.9687397851194608

r2 score: 0.967441718249938
MAE: 2.3576723091641245
MSE: 18.459705397531344
RMSE: 4.296475927726274


In [23]:
import joblib

# Persist the fitted regressor held in `xgb` to best_model.joblib in the
# working directory, using compression level 3. The file is opened in binary
# write mode, so an existing file of the same name is overwritten.
with open("best_model.joblib", mode="wb") as model_file:
    joblib.dump(xgb, model_file, compress=3)