Importing the required libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Importing Dataset

In [None]:
# Load the raw data. This cell uses the Colab upload helper, so it only runs
# inside Google Colab.
from google.colab import files
# Opens the upload widget; the return value is kept in `uploaded` but is not
# read again in this cell.
uploaded = files.upload()
# NOTE(review): the file name is hard-coded — this presumably relies on the
# upload having been saved to the working directory as "UberDataset.csv".
dataset = pd.read_csv("UberDataset.csv")
# Preview the first rows as the cell output.
dataset.head()

In [None]:
# Dimensions of the freshly loaded frame as (rows, columns).
dataset.shape

In [None]:
# Column names, dtypes and non-null counts — used to spot missing values
# before the preprocessing cells below.
dataset.info()

Data Preprocessing

In [None]:
# Replace missing trip purposes with an explicit placeholder category.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection: the in-place call on a column
# selection is chained assignment, which emits a FutureWarning on pandas 2.x
# and no longer updates the parent frame under Copy-on-Write.
dataset['PURPOSE'] = dataset['PURPOSE'].fillna("Not Mentioned")

In [None]:
# Convert both trip timestamp columns to datetime values.
# errors='coerce' turns every value that cannot be parsed into NaT instead of
# raising.
# NOTE(review): coerced NaT rows are removed by the later dropna cell — worth
# checking how many rows fail to parse.
for date_col in ('START_DATE', 'END_DATE'):
    dataset[date_col] = pd.to_datetime(dataset[date_col], errors='coerce')

In [None]:
from datetime import datetime  # NOTE(review): not referenced in the visible cells

# Derive the calendar date and the hour of day of each trip start.
# Rows whose START_DATE is NaT yield missing values in both columns.
dataset['date'] = dataset['START_DATE'].dt.date
dataset['time'] = dataset['START_DATE'].dt.hour

# Bucket the start hour into parts of the day. Both the first and the last
# interval are labelled 'Night', hence ordered=False (duplicate labels).
# include_lowest=True closes the first interval on the left: without it the
# bins are (0, 6], (6, 12], ... and hour 0 falls outside every bin, so rides
# starting between 00:00 and 00:59 would get NaN here and then be deleted by
# the dropna cell that follows.
dataset['day-night'] = pd.cut(x=dataset['time'],
                              bins=[0, 6, 12, 16, 20, 24],
                              labels=['Night', 'Morning', 'Afternoon', 'Evening', 'Night'],
                              ordered=False,
                              include_lowest=True)

In [None]:
# Keep only the rows that have a value in every column (a missing value in
# any single column removes the whole row).
dataset = dataset.dropna()

In [None]:
# Remove rows that are exact duplicates of an earlier row, keeping the first
# occurrence.
dataset = dataset.drop_duplicates()

In [None]:
# Display the frame after the preprocessing cells above (filled PURPOSE,
# parsed dates, derived date/time/day-night, nulls and duplicates dropped).
dataset

Data Visualization

In [None]:
# Names of the columns whose dtype is the generic 'object' dtype.
object_cols = [name for name, dtype in dataset.dtypes.items() if dtype == 'object']

# Number of distinct values found in each of those columns.
unique_values = {name: dataset[name].unique().size for name in object_cols}
unique_values  # shown as the cell output

In [None]:
# Bar chart of the number of rides in each CATEGORY.
plt.figure(figsize=(10, 3))

# Draw into the left panel of a 1x2 grid; no second panel is drawn in this cell.
plt.subplot(1, 2, 1)
# Pass the frame through data= and name the column with x= (the same form
# the PURPOSE-by-CATEGORY plot later in this notebook uses). A Series passed
# positionally lands in the `data` parameter on current seaborn releases,
# where the bar orientation is no longer guaranteed to be vertical.
sns.countplot(data=dataset, x='CATEGORY')
plt.show()

In [None]:
# Number of rides per CATEGORY value, most frequent first.
dataset['CATEGORY'].value_counts()

In [None]:
# Plot how many rides fall into each value of the 'CATEGORY' column.
# The column is named explicitly with x= (and the frame passed via data=) so
# the bars are vertical with categories on the x-axis on every seaborn
# version; a positionally passed Series is interpreted as `data` on current
# releases.
sns.countplot(data=dataset, x='CATEGORY')
plt.show()

In [None]:
# Number of rides per PURPOSE value, most frequent first.
dataset['PURPOSE'].value_counts()

In [None]:
# Plot how many rides fall into each value of the 'PURPOSE' column.
# The column is named explicitly with x= (and the frame passed via data=) so
# the bars are vertical with purposes on the x-axis on every seaborn version;
# a positionally passed Series is interpreted as `data` on current releases.
sns.countplot(data=dataset, x='PURPOSE')
plt.show()

In [None]:
# Number of rides in each part-of-day bucket of the 'day-night' column.
dataset['day-night'].value_counts()

In [None]:
# Plot how many rides fall into each part-of-day bucket of 'day-night'.
# The column is named explicitly with x= (and the frame passed via data=) so
# the bars are vertical with the buckets on the x-axis on every seaborn
# version; a positionally passed Series is interpreted as `data` on current
# releases.
sns.countplot(data=dataset, x='day-night')
plt.show()

In [None]:
# Rides per PURPOSE, with one bar colour per CATEGORY, on a 12x5 inch canvas.
fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(data=dataset, x='PURPOSE', hue='CATEGORY', ax=ax)

# Tilt the purpose labels slightly so neighbouring labels do not overlap.
plt.xticks(rotation=15)
plt.show()

One-Hot Encoding (categorical variables -> binary indicator columns)

In [None]:
# Shape of the preprocessed frame before any encoding, as a baseline for the
# encoded shapes shown in the following cells.
dataset.shape

In [None]:
# Shape the frame would have if pandas dummy-encoded it as a whole; the
# result is only inspected here, `dataset` itself is not modified.
pd.get_dummies(dataset).shape

In [None]:
# Preview the indicator columns produced for PURPOSE alone (one column per
# distinct purpose); `dataset` itself is not modified.
pd.get_dummies(dataset['PURPOSE'])

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns to replace with one indicator column per category.
object_cols = ['CATEGORY', 'PURPOSE']

# Keep the encoder's default sparse output and densify it explicitly with
# .toarray(). The original `sparse=False` keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, so passing it
# raises TypeError on current releases; this form works on old and new ones.
OH_encoder = OneHotEncoder()
encoded = OH_encoder.fit_transform(dataset[object_cols]).toarray()

# Wrap the encoded matrix in a frame that shares the dataset's row index (so
# the concat below lines up row by row) and carries the generated names.
OH_cols = pd.DataFrame(encoded,
                       index=dataset.index,
                       columns=OH_encoder.get_feature_names_out())

# Swap the two original columns for their indicator columns.
# NOTE: `dataset` is rebound here, so this cell cannot be re-run on its own —
# CATEGORY and PURPOSE no longer exist in the frame afterwards.
dataset_final = dataset.drop(object_cols, axis=1)
dataset = pd.concat([dataset_final, OH_cols], axis=1)

In [None]:
# Display the frame after encoding: CATEGORY and PURPOSE have been replaced
# by the indicator columns appended in the previous cell.
dataset

In [None]:
plt.figure(figsize=(12, 6))

# Correlate only the numeric columns. The frame still holds string and
# datetime columns, and DataFrame.corr() raises on non-numeric data on
# pandas >= 2.0, where such columns are no longer dropped silently.
numeric_corr = dataset.select_dtypes(include='number').corr()

# Annotated heatmap of the pairwise correlations.
sns.heatmap(numeric_corr,
            cmap='BrBG',    # diverging colormap for values on both sides of zero
            fmt='.2f',      # show coefficients with two decimals
            linewidths=2,
            annot=True)

plt.show()

In [None]:
# Extract the month number of each trip start.
dataset['MONTH'] = pd.DatetimeIndex(dataset['START_DATE']).month

# Month number -> display label. The dict is written in calendar order, and
# that insertion order is reused below to order the plot.
month_label = {1.0: 'Jan', 2.0: 'Feb', 3.0: 'Mar', 4.0: 'April',
               5.0: 'May', 6.0: 'June', 7.0: 'July', 8.0: 'Aug',
               9.0: 'Sep', 10.0: 'Oct', 11.0: 'Nov', 12.0: 'Dec'}
dataset["MONTH"] = dataset.MONTH.map(month_label)

# Number of rides in each month, indexed by month label.
mon = dataset.MONTH.value_counts(sort=False)

# Per-month summary, indexed by month label:
#   "MONTHS"      -> number of rides in the month
#   "VALUE COUNT" -> longest single ride (max MILES) in the month
# Both columns are passed as label-indexed Series so pandas aligns them by
# month. The original combined a positional array (mon.values) with an
# indexed Series, which is only correct if both happen to share one order.
month_order = [label for label in month_label.values() if label in mon.index]
df = pd.DataFrame({"MONTHS": mon,
                   "VALUE COUNT": dataset.groupby('MONTH',
                                                  sort=False)['MILES'].max()}
                  ).reindex(month_order)  # calendar order, months present only

# sort=False keeps the calendar order set above; by default lineplot sorts
# the x values, which for string labels would not be calendar order.
p = sns.lineplot(data=df, sort=False)
p.set(xlabel="MONTHS", ylabel="VALUE COUNT")

In [None]:
# Extract the weekday number of each trip start (0 = Monday ... 6 = Sunday,
# matching the mapping below).
dataset['DAY'] = pd.DatetimeIndex(dataset['START_DATE']).weekday

# Weekday number -> display label used on the plots.
# 'Thurs' replaces the original 'Thus', a typo for Thursday.
day_label = {
    0: 'Mon', 1: 'Tues', 2: 'Wed', 3: 'Thurs', 4: 'Fri', 5: 'Sat', 6: 'Sun'
}
dataset['DAY'] = dataset['DAY'].map(day_label)

In [None]:
# Number of rides per weekday label, most frequent first.
# Stored under its own name: the original rebound `day_label`, replacing the
# number -> label dict from the previous cell with a Series of counts.
day_counts = dataset.DAY.value_counts()

# One bar per weekday, bar height = number of rides.
sns.barplot(x=day_counts.index, y=day_counts)
plt.xlabel('DAY')
plt.ylabel('COUNT')
plt.show()

In [None]:
# Distribution of ride distance ('MILES'): density-scaled histogram with a
# KDE curve on top.
# histplot replaces sns.distplot, which is deprecated and scheduled for
# removal; kde=True, stat='density' and cut=3 are the settings seaborn's
# migration guidance gives for reproducing distplot's default output.
sns.histplot(data=dataset, x='MILES',
             kde=True, stat='density', kde_kws={'cut': 3})
plt.show()

In [None]:
# Restrict the data to rides shorter than 50 miles so the bulk of the
# distribution is not compressed by a few very long trips.
short_rides = dataset[dataset['MILES'] < 50]

# Density-scaled histogram with a KDE curve on top.
# histplot replaces sns.distplot, which is deprecated and scheduled for
# removal; kde=True, stat='density' and cut=3 are the settings seaborn's
# migration guidance gives for reproducing distplot's default output.
sns.histplot(data=short_rides, x='MILES',
             kde=True, stat='density', kde_kws={'cut': 3})
plt.show()