In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in names):
        print(file_path)

        
import sklearn
import matplotlib.pyplot as plt
import seaborn as sb

In [None]:
# Read the rideshare CSV into `data` and preview the first rows.
csv_path = '../input/uber-and-lyft-dataset-boston-ma/rideshare_kaggle.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Show the (rows, columns) shape of the loaded frame.
data.shape

# --Data set has 693071 observations and 57 features.

In [None]:
# Print the per-column dtype and non-null count summary.
data.info()

Removing the features (id, timestamps, long textual summary) on which the target (price) has no dependency. As we already have distance and the date-time components (hour, day, month), latitude, longitude and datetime are also removed from the dataframe.

In [None]:
# Drop the identifier, raw timestamp columns, the long text summary and the
# coordinates. Every label appears exactly once in the list.
unused_cols = [
    'id', 'timestamp', 'datetime', 'long_summary',
    'apparentTemperatureHighTime', 'apparentTemperatureLowTime',
    'windGustTime', 'sunriseTime', 'sunsetTime', 'uvIndexTime',
    'temperatureMinTime', 'temperatureMaxTime', 'apparentTemperatureMinTime',
    'temperatureLowTime', 'apparentTemperatureMaxTime',
    'latitude', 'longitude',
]
data = data.drop(columns=unused_cols)
print(data.shape)
data.head()

Checking the correlation between Price and Temperature related features

In [None]:
# Price plus every temperature-related column, for the correlation check below.
temp_vs_price_cols = [
    'price', 'temperature', 'apparentTemperature', 'temperatureHigh',
    'temperatureLow', 'apparentTemperatureHigh', 'apparentTemperatureLow',
    'temperatureMin', 'temperatureHighTime', 'temperatureMax',
    'apparentTemperatureMin', 'apparentTemperatureMax',
]
data_new = data[temp_vs_price_cols]
data_new.head()

In [None]:
# Annotated correlation heatmap of data_new; the mask is built from the upper
# triangle (diagonal included) of the correlation matrix.
corr_matrix = data_new.corr()
upper_triangle = np.triu(corr_matrix)
plt.figure(figsize=(10, 10))
sb.heatmap(corr_matrix, annot=True, mask=upper_triangle)

Removing all the temperature related features from the dataframe as they have very weak correlation with the predictand(price)

In [None]:
# Temperature-related columns to discard (price itself is kept).
temp_cols = [
    'temperature', 'apparentTemperature', 'temperatureHigh', 'temperatureLow',
    'apparentTemperatureHigh', 'apparentTemperatureLow', 'temperatureMin',
    'temperatureHighTime', 'temperatureMax', 'apparentTemperatureMin',
    'apparentTemperatureMax',
]
data_new = data[temp_cols]

data = data.drop(columns=data_new.columns)
data.head()

Analyzing data in the categorical features

In [None]:
# Names of the object/category-typed columns, and a view of just those columns.
categorical_frame = data.select_dtypes(include=['object', 'category'])
cat_col = categorical_frame.columns.tolist()
data_new = data[cat_col]
data_new.head()

Checking for unique values in all categorical columns

In [None]:
# For every column in data_new, print its distinct values followed by a blank line.
for col_name, series in data_new.items():
    print(f'{col_name} : {series.unique()}')
    print()
    

In [None]:
# Frequency of each distinct product_id value.
data_new['product_id'].value_counts()

Removing the timezone column, as it has only one value across all the observations, and product_id, as more than 50% of its values are junk data

In [None]:
# Discard product_id and timezone, then preview the result.
data = data.drop(columns=['product_id', 'timezone'])
data.head()

Analyzing features with numeric data types

In [None]:
# Names of the int64/float64 columns, and a view restricted to them.
numeric_frame = data.select_dtypes(include=['int64', 'float64'])
num_col = numeric_frame.columns.tolist()
data_new = data[num_col]
data_new.columns

Analyzing climate related features with price column

In [None]:
# Price together with the climate measurements, for the correlation check.
cli_col = [
    'price',
    'precipIntensity', 'precipProbability', 'humidity', 'windSpeed', 'windGust',
    'visibility', 'dewPoint', 'pressure', 'windBearing', 'cloudCover',
    'uvIndex', 'visibility.1', 'ozone', 'moonPhase', 'precipIntensityMax',
]
data_new = data[cli_col]
data_new.head()

In [None]:
# Annotated correlation heatmap of data_new; the mask is built from the upper
# triangle (diagonal included) of the correlation matrix.
corr_matrix = data_new.corr()
upper_triangle = np.triu(corr_matrix)
plt.figure(figsize=(12, 12))
sb.heatmap(corr_matrix, annot=True, mask=upper_triangle)

All the climate-related features have a correlation of almost 0 with the price column. Dropping these columns doesn't affect the accuracy of the price prediction

In [None]:
# Climate measurements to discard from the working frame.
cli_col = [
    'precipIntensity', 'precipProbability', 'humidity', 'windSpeed', 'windGust',
    'visibility', 'dewPoint', 'pressure', 'windBearing', 'cloudCover',
    'uvIndex', 'visibility.1', 'ozone', 'moonPhase', 'precipIntensityMax',
]
data = data.drop(columns=cli_col)
data.head()


In [None]:
# Show the current (rows, columns) shape.
data.shape

Encoding all the category columns with the Onehot Encoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Collect the object/category columns that still need encoding.
cat_col = data.select_dtypes(include=['object','category']).columns.tolist()
print(cat_col)

# One-hot encode each categorical column and replace it with its indicator columns.
for col in cat_col:
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoded = encoder.fit_transform(data[[col]]).toarray()
    # get_feature_names was removed in scikit-learn 1.2; use its replacement when
    # present and fall back only on releases that predate it.
    if hasattr(encoder, 'get_feature_names_out'):
        feature_names = encoder.get_feature_names_out([col])
    else:
        feature_names = encoder.get_feature_names([col])
    # Reuse data's index so the column-wise concat lines up row for row.
    enc_df = pd.DataFrame(encoded, columns=feature_names, index=data.index)
    data = data.drop(col, axis=1)
    data = pd.concat([data, enc_df], axis=1)

In [None]:
# List the column labels of the current frame.
data.columns

In [None]:
# Preview the first rows of the current frame.
data.head()

Analyzing source column values with price

In [None]:
# Price together with the one-hot source indicators only; the destination
# indicator that had slipped into this list is analysed with the destinations.
src_col = [
    'price',
    'source_Back Bay', 'source_Beacon Hill', 'source_Boston University',
    'source_Fenway', 'source_Financial District', 'source_Haymarket Square',
    'source_North End', 'source_North Station',
    'source_Northeastern University', 'source_South Station',
    'source_Theatre District', 'source_West End',
]
data_new = data[src_col]
data_new.head()

In [None]:
# Annotated correlation heatmap of data_new; the mask is built from the upper
# triangle (diagonal included) of the correlation matrix.
corr_matrix = data_new.corr()
upper_triangle = np.triu(corr_matrix)
plt.figure(figsize=(12, 12))
sb.heatmap(corr_matrix, annot=True, mask=upper_triangle)

Even source doesn't have any significant impact on the price. Similarly checking the same for the destination columns

In [None]:
# Price together with the one-hot destination indicators.
dst_col = ['price','destination_Back Bay',
       'destination_Beacon Hill', 'destination_Boston University',
       'destination_Fenway', 'destination_Financial District',
       'destination_Haymarket Square', 'destination_North End',
       'destination_North Station', 'destination_Northeastern University',
       'destination_South Station', 'destination_Theatre District',
       'destination_West End']
# Select the destination columns (not src_col) so the next heatmap actually
# shows destination-vs-price correlations.
data_new = data[dst_col]
data_new.head()

In [None]:
# Annotated correlation heatmap of data_new; the mask is built from the upper
# triangle (diagonal included) of the correlation matrix.
corr_matrix = data_new.corr()
upper_triangle = np.triu(corr_matrix)
plt.figure(figsize=(12, 12))
sb.heatmap(corr_matrix, annot=True, mask=upper_triangle)

From the above heatmaps, the correlation values of source and destination are very low (almost 0). So removing them and restructuring the data frame

In [None]:
# All one-hot source and destination indicator columns, each listed once.
cols = [
    'source_Back Bay', 'source_Beacon Hill', 'source_Boston University',
    'source_Fenway', 'source_Financial District', 'source_Haymarket Square',
    'source_North End', 'source_North Station',
    'source_Northeastern University', 'source_South Station',
    'source_Theatre District', 'source_West End',
    'destination_Back Bay', 'destination_Beacon Hill',
    'destination_Boston University', 'destination_Fenway',
    'destination_Financial District', 'destination_Haymarket Square',
    'destination_North End', 'destination_North Station',
    'destination_Northeastern University', 'destination_South Station',
    'destination_Theatre District', 'destination_West End',
]
data = data.drop(columns=cols)
print(data.shape)
data.head()


In [None]:
# List the column labels of the current frame.
data.columns

Checking correlation of summary column with price

In [None]:
# Price together with the one-hot short_summary indicators.
summ_cols = [
    'price',
    'short_summary_ Drizzle ',
    'short_summary_ Foggy ',
    'short_summary_ Light Rain ',
    'short_summary_ Mostly Cloudy ',
    'short_summary_ Overcast ',
    'short_summary_ Partly Cloudy ',
    'short_summary_ Possible Drizzle ',
    'short_summary_ Rain ',
    'short_summary_ Clear ',
]
data_new = data[summ_cols]
data_new.head()

In [None]:
# Annotated correlation heatmap of data_new; the mask is built from the upper
# triangle (diagonal included) of the correlation matrix.
corr_matrix = data_new.corr()
upper_triangle = np.triu(corr_matrix)
plt.figure(figsize=(12, 12))
sb.heatmap(corr_matrix, annot=True, mask=upper_triangle)

Summary has no impact on Price, so removing the summary columns from the data frame

In [None]:
# One-hot short_summary indicators to discard.
summ_cols = [
    'short_summary_ Drizzle ',
    'short_summary_ Foggy ',
    'short_summary_ Light Rain ',
    'short_summary_ Mostly Cloudy ',
    'short_summary_ Overcast ',
    'short_summary_ Partly Cloudy ',
    'short_summary_ Possible Drizzle ',
    'short_summary_ Rain ',
    'short_summary_ Clear ',
]
data = data.drop(columns=summ_cols)
print(data.shape)
data.head()

Analyzing ICON Columns

In [None]:
# Price together with the one-hot icon indicators.
ico_cols = [
    'price',
    'icon_ clear-day ',
    'icon_ clear-night ',
    'icon_ cloudy ',
    'icon_ fog ',
    'icon_ partly-cloudy-day ',
    'icon_ partly-cloudy-night ',
    'icon_ rain ',
]
data_new = data[ico_cols]
data_new.head()

In [None]:
# Annotated correlation heatmap of data_new; the mask is built from the upper
# triangle (diagonal included) of the correlation matrix.
corr_matrix = data_new.corr()
upper_triangle = np.triu(corr_matrix)
plt.figure(figsize=(12, 12))
sb.heatmap(corr_matrix, annot=True, mask=upper_triangle)

Deleting all ICON-related columns, as they do not have any effect on Price

In [None]:
# NOTE(review): this cell selects the name columns, not the icon columns; the same
# selection is repeated further down, right before the name heatmap.
nme_cols = [
    'price',
    'name_Black', 'name_Black SUV', 'name_Lux', 'name_Lux Black',
    'name_Lux Black XL', 'name_Lyft', 'name_Lyft XL', 'name_Shared',
    'name_Taxi', 'name_UberPool', 'name_UberX', 'name_UberXL', 'name_WAV',
]
data_new = data[nme_cols]
data_new.head()

In [None]:
# One-hot icon indicators to discard.
ico_cols = [
    'icon_ clear-day ',
    'icon_ clear-night ',
    'icon_ cloudy ',
    'icon_ fog ',
    'icon_ partly-cloudy-day ',
    'icon_ partly-cloudy-night ',
    'icon_ rain ',
]
data = data.drop(columns=ico_cols)
print(data.shape)
data.head()

Analyzing name cols with price

In [None]:
# Price together with the one-hot ride-name indicators.
nme_cols = [
    'price',
    'name_Black', 'name_Black SUV', 'name_Lux', 'name_Lux Black',
    'name_Lux Black XL', 'name_Lyft', 'name_Lyft XL', 'name_Shared',
    'name_Taxi', 'name_UberPool', 'name_UberX', 'name_UberXL', 'name_WAV',
]
data_new = data[nme_cols]
data_new.head()

In [None]:
# Annotated correlation heatmap of data_new; the mask is built from the upper
# triangle (diagonal included) of the correlation matrix.
corr_matrix = data_new.corr()
upper_triangle = np.triu(corr_matrix)
plt.figure(figsize=(12, 12))
sb.heatmap(corr_matrix, annot=True, mask=upper_triangle)

Some values of names do have effect on Price value.

Analyzing remaining columns with Price

In [None]:
# Price together with the time, distance, surge and cab-type columns.
rem_cols = [
    'price',
    'hour', 'day', 'month',
    'distance', 'surge_multiplier',
    'cab_type_Lyft', 'cab_type_Uber',
]
data_new = data[rem_cols]
data_new.head()

In [None]:
# Annotated correlation heatmap of data_new; the mask is built from the upper
# triangle (diagonal included) of the correlation matrix.
corr_matrix = data_new.corr()
upper_triangle = np.triu(corr_matrix)
plt.figure(figsize=(12, 12))
sb.heatmap(corr_matrix, annot=True, mask=upper_triangle)

From the above heatmap, month, day, hour, cab_type_Uber and cab_type_Lyft are weakly correlated with Price, but distance and surge_multiplier have good correlation with Price.
So dropping month, day and hour, along with the two cab_type columns.

In [None]:
# Discard the time components and both cab-type indicators.
weak_cols = ['month', 'day', 'hour', 'cab_type_Lyft', 'cab_type_Uber']
data = data.drop(columns=weak_cols)
print(data.shape)
data.head()

Checking for null values in all features

In [None]:
# Count the missing values in each column.
data.isnull().sum()

The price column has 55095 empty values. We are not imputing them, because imputing the target values would introduce more error and reduce accuracy. So removing all the observations where price is empty

In [None]:
# Remove rows with no price and renumber the rows from 0.
# drop=True discards the old index instead of inserting it as an 'index' column,
# which would otherwise end up in the model's feature matrix.
data = data.dropna(subset=['price']).reset_index(drop=True)

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Show the current (rows, columns) shape.
data.shape

Checking for outliers. Checking Min and Max threshold values and Plotting box plot on Price column..

In [None]:
# 99th percentile of price, used as the upper threshold below.
max_threshold = data['price'].quantile(0.99)
max_threshold

In [None]:
# Rows whose price is strictly above the upper threshold.
data.loc[data['price'].gt(max_threshold)]

In [None]:
# 1st percentile of price, used as the lower threshold below.
min_threshold = data['price'].quantile(0.01)
min_threshold

In [None]:
# Rows whose price is strictly below the lower threshold.
data.loc[data['price'].lt(min_threshold)]

In [None]:
# Box plot of price. The series is passed as x= because seaborn deprecated the
# positional form in 0.11 and later releases treat the first positional argument
# as `data`.
sb.boxplot(x=data['price'])

Values above 43 are the outliers of our dataset. Checking the indexes and count of those observations:

In [None]:
# Row positions where price exceeds 43; np.where returns a 1-tuple of index arrays.
out = np.where(data['price']>43)
print(out[0])
# Count the matches by the array's length. Counting non-zero entries of the index
# array would miss an outlier located at position 0.
print(out[0].size)

Removing all the 5357 rows in that array, as if these outliers are included, error value will increase

In [None]:
# Keep only rows whose price is not above 43 (the same cutoff used to find the
# outliers). A boolean mask avoids using row positions as index labels and makes
# the cell safe to re-run; .copy() keeps later column assignments warning-free.
data = data.loc[~(data['price'] > 43)].copy()
data.shape

Checking for skewness across all features

In [None]:
from scipy.stats import skew

# For each feature: print its name and skewness, then draw its distribution
# in a figure of its own.
cols = ['distance','surge_multiplier']
for feature in cols:
    feature_values = data[feature]
    print(feature, skew(feature_values), sep='\n')
    plt.figure()
    sb.distplot(feature_values)
    plt.show()

Distance and surge multiplier are both skewed, with values of 0.77 (moderate) and 8.84 (very high) respectively. Cross-checking the correlation values of these columns with the target (price).

In [None]:
# Annotated correlation heatmap for price, distance and surge_multiplier; the mask
# is built from the upper triangle (diagonal included) of the correlation matrix.
cols = ['price','distance','surge_multiplier']
data_new = data[cols]

corr_matrix = data_new.corr()
upper_triangle = np.triu(corr_matrix)
plt.figure(figsize=(8, 8))
sb.heatmap(corr_matrix, annot=True, mask=upper_triangle)

Surge multiplier has a very high skew value and less correlation with Price, whereas distance has decent correlation. So removing skewness from Surge multiplier alone using BoxCox transform

In [None]:
from scipy import stats

# Replace surge_multiplier with its Box-Cox transform (the fitted lambda is
# discarded) and report the skewness of the transformed column.
surge_transformed, _fitted_lambda = stats.boxcox(data['surge_multiplier'])
data['surge_multiplier'] = surge_transformed
data['surge_multiplier'].skew()

Skewness of Surge_multiplier reduced from 8.84 to 5.64

Splitting data for training and testing

In [None]:
from sklearn.model_selection import train_test_split

# Target is price; every other remaining column is a feature.
feature_frame = data.drop(columns=['price'])
y = data['price'].values
X = feature_frame.values

# Hold out 33% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0
)

Performing linear regression on the training data

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on the training split (fit returns the estimator itself, so `model` and
# `lin_reg` refer to the same object) and predict on the test split.
lin_reg = LinearRegression()
model = lin_reg.fit(X_train, y_train)
y_pred = model.predict(X_test)

Checking R2 value for linear regression

In [None]:
from sklearn.metrics import r2_score
# R² of the current predictions against the held-out targets.
r2_score(y_test, y_pred)

Calculating the root mean squared error for linear regression

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the test split and its square root (RMSE), one per line.
mse = mean_squared_error(y_test, y_pred)
rootmeansq = np.sqrt(mse)
print(mse, rootmeansq, sep='\n')

Performing XGBoost regression on the training data

Checking R2 value for xg boost regression

In [None]:
import xgboost as xg

# Squared-error regression with 10 boosting rounds and a fixed seed.
# 'reg:squarederror' is the current name for the deprecated 'reg:linear'
# objective, and random_state is the scikit-learn-style seed parameter.
xgb_r = xg.XGBRegressor(objective='reg:squarederror', n_estimators=10, random_state=123)
xgb_r.fit(X_train, y_train)
y_pred = xgb_r.predict(X_test)

In [None]:
# R² of the current predictions against the held-out targets.
r2_score(y_test, y_pred)

Calculating the root mean squared error for XGBoost regression

In [None]:
# Mean squared error on the test split and its square root (RMSE), one per line.
mse = mean_squared_error(y_test, y_pred)
rootmeansq = np.sqrt(mse)
print(mse, rootmeansq, sep='\n')

**XGBoost regressor** gave the best R² score of 0.9454 and the minimum RMSE value of 1.9870