# Uber and Lyft Dataset Boston, MA - Feature Engineering 

In [None]:
# Work with Arrays and linear algebra
import numpy as np 

# Data processing
import pandas as pd 

# Data visualizations 
import matplotlib.pyplot as plt
import seaborn as sb

In [None]:
# Load the raw Kaggle export of the Boston Uber/Lyft rides into a DataFrame.
rideshare = pd.read_csv(
    filepath_or_buffer='/kaggle/input/uber-and-lyft-dataset-boston-ma/rideshare_kaggle.csv'
)
# Recorded result for this file: 693071 observations and 57 features.
rideshare.shape

In [None]:
# Print the per-column dtype / non-null summary of the freshly loaded frame.
rideshare.info()

# Null values or duplicated rows?

 Checking if there are duplicated rows

In [None]:
# Share of rows that are exact duplicates of an earlier row, as a percentage.
# `duplicated()` yields one boolean per ROW, so the denominator must be the row
# count; dividing by `DataFrame.size` (rows * columns) understated the share by
# a factor equal to the number of columns. `.mean()` is count / number of rows.
print('{} of data is duplicated rows'.format(
    str(round(rideshare.duplicated().mean() * 100, 5)) + '%'))

Checking if there are null values

In [None]:
# Missing cells as a percentage of all cells (rows x columns) in the frame.
null_share = rideshare.isnull().sum().sum() / rideshare.size * 100
print('{} of data is NuLL'.format(str(round(null_share, 3)) + '%'))

In [None]:
# Heatmap of the boolean missing-value mask: one row per observation, one
# column per feature.
cols = rideshare.columns
sb.heatmap(rideshare.loc[:, cols].isna())

Drop Null Values

In [None]:
# Discard, in place, every row that contains at least one missing value.
rideshare.dropna(axis='index', how='any', inplace=True)

In [None]:
# Re-check the missing-cell percentage now that incomplete rows are gone.
null_share_after = rideshare.isnull().sum().sum() / rideshare.size * 100
print('{} of data is NuLL after dropping'.format(str(round(null_share_after, 3)) + '%'))

# Delete some features using Domain knowledge

We want to predict the price, so we remove the following features because they have no effect on the price of the trip.

In [None]:
# Drop identifiers, raw timestamps, the long text summary and coordinates:
# features judged (by domain knowledge) to have no effect on the trip price.
# 'apparentTemperatureLowTime' used to be listed twice; each label now appears
# exactly once. 'temperatureHighTime' is intentionally kept here because the
# temperature-correlation step further down still selects it.
rideshare = rideshare.drop(columns=[
    'id', 'timestamp', 'long_summary',
    'apparentTemperatureHighTime', 'apparentTemperatureLowTime',
    'windGustTime', 'sunriseTime', 'sunsetTime', 'uvIndexTime',
    'temperatureMinTime', 'temperatureMaxTime',
    'apparentTemperatureMinTime', 'temperatureLowTime',
    'apparentTemperatureMaxTime',
    'latitude', 'longitude', 'product_id',
])
print(rideshare.shape)
rideshare.head()

In [None]:
# The year is the text before the first '-' of each `datetime` string.
rideshare['year'] = rideshare['datetime'].map(lambda stamp: stamp.split('-', 1)[0])

In [None]:
# List the distinct values of the `year` column.
rideshare["year"].unique()

Delete `datetime` and `year` because all the data is from the same year.

In [None]:
# Remove the raw datetime string and the `year` column.
rideshare = rideshare.drop(columns=['datetime', 'year'])
print(rideshare.shape)
rideshare.head()

# Checking the correlation between Price and Temperature related features

In [None]:
# Price plus every temperature-related feature; 'price' stays first because a
# later cell drops `rideshare_new.columns[1:]`.
rideshare_new = rideshare.loc[:, [
    'price',
    'temperature', 'apparentTemperature',
    'temperatureHigh', 'temperatureLow',
    'apparentTemperatureHigh', 'apparentTemperatureLow',
    'temperatureMin', 'temperatureHighTime', 'temperatureMax',
    'apparentTemperatureMin', 'apparentTemperatureMax',
]]
rideshare_new.head()

In [None]:
# Bar chart of each rideshare_new column's correlation with price.
correlation = rideshare_new.corr().loc[:, ['price']]
# Keep coefficients whose magnitude exceeds 0.0001, positive or negative.
price_corr = correlation[correlation['price'].abs() > 0.0001]
price_corr.plot(kind='bar')

### Removing all the temperature related features from the dataframe as they have very weak correlation with the predictand (price)

In [None]:
# Everything in rideshare_new except its first column ('price') is removed.
rideshare = rideshare.drop(columns=rideshare_new.columns[1:])
rideshare.head(n=2)

# Analyzing features with numeric data types

In [None]:
# Names of the int64/float64 columns, and a view of the frame limited to them.
numeric_part = rideshare.select_dtypes(include=['int64', 'float64'])
num_col = numeric_part.columns.tolist()
rideshare_new = rideshare.loc[:, num_col]
rideshare_new.columns

## Analyzing climate related features with price column

In [None]:
# 'price' first, followed by the climate features; a later cell drops
# cli_col[1:], so the order of the first element matters.
cli_col = ['price'] + [
    'precipIntensity', 'precipProbability', 'humidity',
    'windSpeed', 'windGust', 'visibility',
    'dewPoint', 'pressure', 'windBearing',
    'cloudCover', 'uvIndex', 'visibility.1',
    'ozone', 'moonPhase', 'precipIntensityMax',
]
rideshare_new = rideshare.loc[:, cli_col]
rideshare_new.head()

In [None]:
# Correlation of each climate feature in rideshare_new with price, as a bar chart.
correlation = rideshare_new.corr().loc[:, ['price']]
strong_enough = correlation['price'].abs() > 0.0001  # same as > 0.0001 OR < -0.0001
price_corr = correlation[strong_enough]
price_corr.plot(kind='bar')

### Removing all the climate related features from the dataframe as they have very weak correlation with the predictand (price)

In [None]:
# Remove every climate feature; cli_col[0] is 'price' and is kept.
rideshare = rideshare.drop(columns=cli_col[1:])

rideshare.head()

In [None]:
# (rows, columns) remaining after the climate features were dropped.
rideshare.shape

# Analyzing data in the categorical features

In [None]:
# Names of the object/category columns, and the frame restricted to them.
categorical_part = rideshare.select_dtypes(include=['object', 'category'])
cat_col = categorical_part.columns.tolist()
rideshare_new = rideshare.loc[:, cat_col]
rideshare_new.head()

In [None]:
# Show the distinct values of every categorical column, one blank line apart.
for col in rideshare_new.columns:
    print(f'{col} : {rideshare_new[col].unique()}', end='\n\n')

In [None]:
# `timezone` holds a single value across the whole dataset, so it is removed.
rideshare = rideshare.drop(columns=['timezone'])

## Encoding all the category columns with the Onehot Encoder

In [None]:
# Reset the index before one-hot encoding: rows were removed by the earlier
# dropna, and the encoding cell builds each encoded frame from a plain array
# (default 0..n-1 index) and joins it with pd.concat(axis=1), which aligns on
# index labels. drop=True discards the old index instead of keeping it as a column.
rideshare = rideshare.reset_index(drop=True)

In [None]:
from sklearn.preprocessing import OneHotEncoder
cat_col = rideshare.select_dtypes(include=['object','category']).columns.tolist()
print(cat_col)

# One-hot encode each categorical column and swap it for its indicator columns.
for col in cat_col:
    encoder = OneHotEncoder(handle_unknown='ignore')
    # Give the encoded frame the SAME index as `rideshare`. pd.concat(axis=1)
    # aligns on index labels, so a default 0..n-1 index on the encoded frame
    # would silently create NaN-filled rows whenever `rideshare` has index gaps
    # (e.g. if the reset_index cell was skipped after dropna).
    enc_rideshare = pd.DataFrame(
        encoder.fit_transform(rideshare[[col]]).toarray(),
        columns=encoder.get_feature_names_out([col]),
        index=rideshare.index,
    )
    rideshare = rideshare.drop(col, axis=1)
    rideshare = pd.concat([rideshare, enc_rideshare], axis=1)

In [None]:
# Total number of missing cells after encoding; a non-zero value here would
# point to index misalignment in the concat of the encoding cell.
rideshare.isna().sum().sum()

In [None]:
# Column names after one-hot encoding (used to build the column lists below).
rideshare.columns

In [None]:
# Preview the first rows of the encoded frame.
rideshare.head()

### Analyzing source column values with price

In [None]:
# 'price' followed by the one-hot indicator column of every pickup location.
src_col = ['price'] + ['source_' + place for place in (
    'Back Bay', 'Beacon Hill', 'Boston University', 'Fenway',
    'Financial District', 'Haymarket Square', 'North End', 'North Station',
    'Northeastern University', 'South Station', 'Theatre District', 'West End',
)]
rideshare_new = rideshare.loc[:, src_col]
rideshare_new.head()

In [None]:
# Bar chart: correlation of each source indicator in rideshare_new with price.
correlation = rideshare_new.corr().loc[:, ['price']]
# Retain coefficients outside the band [-0.0001, 0.0001].
price_corr = correlation.loc[correlation['price'].abs() > 0.0001]
price_corr.plot(kind='bar')

### Source has only a small impact on the price, so we will drop it

In [None]:
# Remove all source indicator columns (src_col[0] is 'price' and is kept).
rideshare = rideshare.drop(columns=src_col[1:])
print(rideshare.shape)
rideshare.head()

### Analyzing destination column values with price

In [None]:
# 'price' followed by the one-hot indicator column of every drop-off location.
dst_col = ['price'] + ['destination_' + place for place in (
    'Back Bay', 'Beacon Hill', 'Boston University', 'Fenway',
    'Financial District', 'Haymarket Square', 'North End', 'North Station',
    'Northeastern University', 'South Station', 'Theatre District', 'West End',
)]
rideshare_new = rideshare.loc[:, dst_col]
rideshare_new.head()

In [None]:
# Bar chart: correlation of each destination indicator in rideshare_new with price.
correlation = rideshare_new.corr().loc[:, ['price']]
magnitude = correlation['price'].abs()
price_corr = correlation[magnitude > 0.0001]  # equivalent to (> 0.0001) OR (< -0.0001)
price_corr.plot(kind='bar')

### Correlation values of destination are very low (almost 0), so we remove them.

In [None]:
# Remove all destination indicator columns (dst_col[0] is 'price' and is kept).
rideshare = rideshare.drop(columns=dst_col[1:])
print(rideshare.shape)
rideshare.head()

In [None]:
# Columns left after removing the source and destination indicators.
rideshare.columns

### Checking correlation of summary column with price

In [None]:
# 'price' plus the short_summary indicator columns. The category values carry a
# leading and a trailing space, which is reproduced in the column names.
summ_cols = ['price'] + [f'short_summary_ {label} ' for label in (
    'Drizzle', 'Foggy', 'Light Rain', 'Mostly Cloudy', 'Overcast',
    'Partly Cloudy', 'Possible Drizzle', 'Rain', 'Clear',
)]
rideshare_new = rideshare.loc[:, summ_cols]
rideshare_new.head()

In [None]:
# Bar chart: correlation of each short_summary indicator in rideshare_new with price.
correlation = rideshare_new.corr().loc[:, ['price']]
# Only coefficients with |r| > 0.0001 are plotted.
price_corr = correlation[correlation['price'].abs() > 0.0001]
price_corr.plot(kind='bar')

### Summary has no impact on price, so we remove it from the data frame

In [None]:
# Remove all short_summary indicator columns (summ_cols[0] is 'price' and is kept).
rideshare = rideshare.drop(columns=summ_cols[1:])
print(rideshare.shape)
rideshare.head()

## Analyzing ICON Columns

In [None]:
# 'price' plus the icon indicator columns; as with short_summary, the category
# values are wrapped in single spaces.
ico_cols = ['price'] + [f'icon_ {label} ' for label in (
    'clear-day', 'clear-night', 'cloudy', 'fog',
    'partly-cloudy-day', 'partly-cloudy-night', 'rain',
)]
rideshare_new = rideshare.loc[:, ico_cols]
rideshare_new.head()

In [None]:
# Bar chart: correlation of each icon indicator in rideshare_new with price.
correlation = rideshare_new.corr().loc[:, ['price']]
keep = correlation['price'].abs() > 0.0001  # i.e. > 0.0001 or < -0.0001
price_corr = correlation[keep]
price_corr.plot(kind='bar')

Deleting all icon-related columns, as they do not have any effect on price

In [None]:
# Remove all icon indicator columns (ico_cols[0] is 'price' and is kept).
rideshare = rideshare.drop(columns=ico_cols[1:])
print(rideshare.shape)
rideshare.head()

## Analyzing name cols with price

In [None]:
# 'price' followed by the one-hot indicator column of every ride product name.
nme_cols = ['price'] + ['name_' + product for product in (
    'Black', 'Black SUV', 'Lux', 'Lux Black', 'Lux Black XL', 'Lyft',
    'Lyft XL', 'Shared', 'UberPool', 'UberX', 'UberXL', 'WAV',
)]
rideshare_new = rideshare.loc[:, nme_cols]
rideshare_new.head()

In [None]:
# Bar chart: correlation of each name indicator in rideshare_new with price.
correlation = rideshare_new.corr().loc[:, ['price']]
# Drop coefficients that fall inside [-0.0001, 0.0001].
price_corr = correlation.loc[correlation['price'].abs() > 0.0001]
price_corr.plot(kind='bar')

Some values of `name` do have an effect on the price.

In [None]:
# Columns still present; the name indicators are kept.
rideshare.columns

### Analyzing remaining columns with Price

In [None]:
# Line plot of price against trip distance.
sb.lineplot(data=rideshare, x='distance', y='price')
plt.show()

In [None]:
# Price together with the time, distance, surge and cab-type columns.
rem_cols = [
    'price',
    'hour', 'day', 'month',
    'distance', 'surge_multiplier',
    'cab_type_Lyft', 'cab_type_Uber',
]
rideshare_new = rideshare.loc[:, rem_cols]
rideshare_new.head()

In [None]:
# Bar chart: correlation of each remaining column in rideshare_new with price.
correlation = rideshare_new.corr().loc[:, ['price']]
outside_band = correlation['price'].abs() > 0.0001  # > 0.0001 or < -0.0001
price_corr = correlation[outside_band]
price_corr.plot(kind='bar')

From the above figure, month, day, and hour are weakly correlated with price, whereas distance and surge_multiplier have a good correlation with price, so we will drop the weakly correlated columns.

In [None]:
# Remove the month, day and hour columns.
rideshare = rideshare.drop(columns=['month', 'day', 'hour'])
print(rideshare.shape)
rideshare.head()

## Checking for outliers: inspecting the min and max threshold values and drawing a box plot of the price column

<p align="center"><img src='https://i.stack.imgur.com/AXEzg.png' width="300"/></p>

In [None]:
# 99th percentile of price.
max_threshold = rideshare.loc[:, 'price'].quantile(q=0.99)
max_threshold

In [None]:
# 1st percentile of price.
min_threshold = rideshare.loc[:, 'price'].quantile(q=0.01)
min_threshold

In [None]:
 # Horizontal box plot of price to expose outliers.
sb.boxplot(x='price', data=rideshare, orient='h', palette='rainbow')

Values above 43 are the outliers of our dataset. Checking the indexes and count of those observations:

In [None]:
# Positions (0-based row offsets, not index labels) of rides priced above 43.
# `out` stays a 1-tuple of arrays, as returned by np.where, because the next
# cell reads `out[0]`.
out = np.where(rideshare['price'] > 43)

# Count the outliers by the number of positions found. The previous
# np.count_nonzero(np.where(...)) counted NON-ZERO POSITIONS, so an outlier in
# row 0 was silently not counted.
print(len(out[0]))

# Share of rides above the threshold: divide by the number of rows (not by
# DataFrame.size, which is rows * columns) and scale by 100 so the value
# matches the "%" in the message.
print("Precentage of rides that above 43$ is {}%".format(
    round(len(out[0]) / len(rideshare) * 100, 6)))

Removing all the 5357 rows in that array because including these outliers would increase the error value.

In [None]:
# Remove the rides priced above 43 (the same threshold used to build `out`).
# np.where yields row POSITIONS while DataFrame.drop expects index LABELS; the
# two only coincide when the index is exactly 0..n-1. A boolean mask selects
# the same rows regardless of the index, and re-running this cell is harmless
# because nothing is mutated in place.
rideshare = rideshare.loc[~(rideshare['price'] > 43)]
rideshare.shape

In [None]:
# Bar chart: correlation of every column left in `rideshare` with price.
correlation = rideshare.corr().loc[:, ['price']]
# Keep coefficients whose magnitude exceeds 0.0001, positive or negative.
price_corr = correlation[correlation['price'].abs() > 0.0001]
price_corr.plot(kind='bar')

In [None]:
# Preview the frame that is about to be exported.
rideshare.head()

In [None]:
# Write the engineered feature set to disk without the index column.
rideshare.to_csv(path_or_buf='./Final_rideshare.csv', index=False)