In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Load the Uber rides CSV from its relative path into the working DataFrame.
csv_path = "./../datasets/UberDrives.csv"
uber = pd.read_csv(csv_path)

uber

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
0,1/1/2016 21:11,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,1/2/2016 1:25,1/2/2016 1:37,Business,Fort Pierce,Fort Pierce,5.0,
2,1/2/2016 20:25,1/2/2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,1/5/2016 17:31,1/5/2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,1/6/2016 14:42,1/6/2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit
...,...,...,...,...,...,...,...
1151,12/31/2016 13:24,12/31/2016 13:42,Business,Kar?chi,Unknown Location,3.9,Temporary Site
1152,12/31/2016 15:03,12/31/2016 15:38,Business,Unknown Location,Unknown Location,16.2,Meeting
1153,12/31/2016 21:32,12/31/2016 21:50,Business,Katunayake,Gampaha,6.4,Temporary Site
1154,12/31/2016 22:08,12/31/2016 23:51,Business,Gampaha,Ilukwatta,48.2,Temporary Site


### General info about dataset

In [9]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
uber.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1156 entries, 0 to 1155
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   START_DATE*  1156 non-null   object 
 1   END_DATE*    1155 non-null   object 
 2   CATEGORY*    1155 non-null   object 
 3   START*       1155 non-null   object 
 4   STOP*        1155 non-null   object 
 5   MILES*       1156 non-null   float64
 6   PURPOSE*     653 non-null    object 
dtypes: float64(1), object(6)
memory usage: 63.3+ KB


### - Find `null` or missing values :

In [3]:
# Count the missing (NaN) entries in each column.
uber.isnull().sum()

START_DATE*      0
END_DATE*        1
CATEGORY*        1
START*           1
STOP*            1
MILES*           0
PURPOSE*       503
dtype: int64

### - Drop duplicate rows :

In [4]:
# Show every row that exactly repeats an earlier row (a single record here).
duplicate_mask = uber.duplicated()
uber.loc[duplicate_mask]

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
492,6/28/2016 23:34,6/28/2016 23:59,Business,Durham,Cary,9.9,Meeting


In [5]:
# Remove exact duplicate rows in place, keeping the first occurrence of each.
uber.drop_duplicates(keep="first", inplace=True)

In [6]:
# Re-check for duplicates after the drop; an empty frame means none are left.
uber[uber.duplicated()]

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*


In [7]:
# Drop the unneeded last row by what makes it unusable -- it is the only record
# with a missing END_DATE* -- instead of by the hard-coded label 1155.
# Re-running this cell is then a no-op rather than a KeyError, and it keeps
# working if the number of rows in the CSV changes.
# Note: this uses the original "*"-suffixed column name, so it must run before
# the columns are renamed.
uber.dropna(subset=["END_DATE*"], inplace=True)

In [8]:
# Strip the trailing "*" from every column name. Unlike slicing off the last
# character, rstrip("*") leaves already-clean names untouched, so re-running
# this cell cannot truncate them (e.g. "START_DATE" -> "START_DAT").
uber.columns = uber.columns.str.rstrip("*")

uber

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE
0,1/1/2016 21:11,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,1/2/2016 1:25,1/2/2016 1:37,Business,Fort Pierce,Fort Pierce,5.0,
2,1/2/2016 20:25,1/2/2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,1/5/2016 17:31,1/5/2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,1/6/2016 14:42,1/6/2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit
...,...,...,...,...,...,...,...
1150,12/31/2016 1:07,12/31/2016 1:14,Business,Kar?chi,Kar?chi,0.7,Meeting
1151,12/31/2016 13:24,12/31/2016 13:42,Business,Kar?chi,Unknown Location,3.9,Temporary Site
1152,12/31/2016 15:03,12/31/2016 15:38,Business,Unknown Location,Unknown Location,16.2,Meeting
1153,12/31/2016 21:32,12/31/2016 21:50,Business,Katunayake,Gampaha,6.4,Temporary Site


<br>Now, our data is a bit edited as per our needs and column names look fine. Lets work on this data.

In [9]:
# Summary statistics for every column; include="all" adds the non-numeric ones.
uber.describe(include="all")

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE
count,1154,1154,1154,1154,1154,1154.0,652
unique,1154,1154,2,177,188,,10
top,1/1/2016 21:11,1/1/2016 21:17,Business,Cary,Cary,,Meeting
freq,1,1,1077,201,202,,186
mean,,,,,,10.567418,
std,,,,,,21.588452,
min,,,,,,0.5,
25%,,,,,,2.9,
50%,,,,,,6.0,
75%,,,,,,10.4,


### - Convert "START_DATE" and "END_DATE" columns to `datetime` formats

In [19]:
uber["START_DATE"].info()

## dtype of the START_DATE column BEFORE conversion (object, i.e. still text)

<class 'pandas.core.series.Series'>
Int64Index: 1154 entries, 0 to 1154
Series name: START_DATE
Non-Null Count  Dtype 
--------------  ----- 
1154 non-null   object
dtypes: object(1)
memory usage: 18.0+ KB


In [20]:
# Parse START_DATE with an explicit month/day/year hour:minute format. The raw
# values are month-first (e.g. "12/31/2016 13:24"); stating the format removes
# any day/month guessing and makes a malformed entry raise instead of being
# silently mis-parsed.
uber["START_DATE"] = pd.to_datetime(uber["START_DATE"], format="%m/%d/%Y %H:%M")

uber.head()

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE
0,2016-01-01 21:11:00,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,2016-01-02 01:25:00,1/2/2016 1:37,Business,Fort Pierce,Fort Pierce,5.0,
2,2016-01-02 20:25:00,1/2/2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,2016-01-05 17:31:00,1/5/2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,2016-01-06 14:42:00,1/6/2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit


In [21]:
uber["START_DATE"].info()

## dtype of the START_DATE column AFTER conversion (datetime64[ns])

<class 'pandas.core.series.Series'>
Int64Index: 1154 entries, 0 to 1154
Series name: START_DATE
Non-Null Count  Dtype         
--------------  -----         
1154 non-null   datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 18.0 KB


In [22]:
# Convert END_DATE from text to datetime64. The raw values are month-first
# (e.g. "12/31/2016 13:42"), so the format is given explicitly to rule out
# day/month ambiguity and to fail loudly on a malformed entry.
uber["END_DATE"] = pd.to_datetime(uber["END_DATE"], format="%m/%d/%Y %H:%M")

In [23]:
# Preview the first rows to confirm both date columns now show full timestamps.
uber.head()

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE
0,2016-01-01 21:11:00,2016-01-01 21:17:00,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,2016-01-02 01:25:00,2016-01-02 01:37:00,Business,Fort Pierce,Fort Pierce,5.0,
2,2016-01-02 20:25:00,2016-01-02 20:38:00,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,2016-01-05 17:31:00,2016-01-05 17:45:00,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,2016-01-06 14:42:00,2016-01-06 15:49:00,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit


### - Determine invalid/erroneous data

1. One such case:

if `START_DATE == END_DATE` while the other columns still hold values, the record is misleading: it claims that miles were covered even though, according to the timestamps, the ride took no time at all.

So, we should eliminate such records.

In [35]:
# Rides whose start and end timestamps are identical (zero recorded duration).
uber[uber["START_DATE"] == uber["END_DATE"]]

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE
751,2016-09-06 17:49:00,2016-09-06 17:49:00,Business,Unknown Location,Unknown Location,69.1,
761,2016-09-16 07:08:00,2016-09-16 07:08:00,Business,Unknown Location,Unknown Location,1.6,
798,2016-10-08 15:03:00,2016-10-08 15:03:00,Business,Karachi,Karachi,3.6,
807,2016-10-13 13:02:00,2016-10-13 13:02:00,Business,Islamabad,Islamabad,0.7,


In [38]:
# Remove, in place, the rides whose start and end timestamps are identical.
zero_duration = uber["START_DATE"] == uber["END_DATE"]
uber.drop(index=uber.loc[zero_duration].index, inplace=True)

In [39]:
# Re-check after the drop; an empty frame means no zero-duration rides remain.
uber[uber["START_DATE"] == uber["END_DATE"]]

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE


### - Basics around Timestamp values

In [41]:
# Take the first ride's start time by position. A bare [0] is a label lookup,
# and rows have been dropped from this frame, so it would raise KeyError if the
# row labelled 0 were ever among them; .iloc[0] always returns the first row.
ts = uber["START_DATE"].iloc[0]

ts

Timestamp('2016-01-01 21:11:00')

In [44]:
# Calendar date only (a datetime.date), with the time of day dropped.
ts.date()

datetime.date(2016, 1, 1)

In [46]:
# Day of the month.
ts.day

1

In [49]:
# Month as a number together with its name.
ts.month, ts.month_name()

(1, 'January')

In [51]:
# Name of the weekday.
ts.day_name()

'Friday'

In [52]:
# Calendar year.
ts.year

2016

In [55]:
# Day of the week as an integer, counting from Monday = 0.
ts.dayofweek

4

Monday is annotated as `0`, and so on.

- Mon => 0
- Tue => 1
- Wed => 2
- Thu => 3
- Fri => 4
- Sat => 5
- Sun => 6

In [56]:
# Hour of the day on a 24-hour clock.
ts.hour

21

In [58]:
# Minute within the hour.
ts.minute

11

In [61]:
# Second within the minute.
ts.second

0

#### Access month values of all records in START_DATE column :

- use the `.dt` accessor, which exposes the datetime properties (year, month, day, ...) of every value in a datetime column.

In [65]:
uber["START_DATE"].dt.month

## .dt.month gives the month number of every record in this column

0        1
1        1
2        1
3        1
4        1
        ..
1150    12
1151    12
1152    12
1153    12
1154    12
Name: START_DATE, Length: 1150, dtype: int64

#### Create and separate out `year` and `month` values as new columns :

In [66]:
# Derive the calendar year and month of each ride from its start timestamp
# and store them as two new columns.
start_parts = uber["START_DATE"].dt
uber["year"] = start_parts.year
uber["month"] = start_parts.month

uber.head()

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE,year,month
0,2016-01-01 21:11:00,2016-01-01 21:17:00,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain,2016,1
1,2016-01-02 01:25:00,2016-01-02 01:37:00,Business,Fort Pierce,Fort Pierce,5.0,,2016,1
2,2016-01-02 20:25:00,2016-01-02 20:38:00,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies,2016,1
3,2016-01-05 17:31:00,2016-01-05 17:45:00,Business,Fort Pierce,Fort Pierce,4.7,Meeting,2016,1
4,2016-01-06 14:42:00,2016-01-06 15:49:00,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit,2016,1


### - Analysis of the "MILES" column

In [68]:
# Largest value in the MILES column.
uber["MILES"].max()

310.3

In [69]:
# Smallest value in the MILES column.
uber["MILES"].min()

0.5

In [70]:
# Spread of the MILES values (pandas' .std() is the sample standard deviation, ddof=1).
uber["MILES"].std()

21.55235968026448

In [71]:
# Median of the MILES column (the middle value once sorted).
uber["MILES"].median()

6.0

In [72]:
# Mean (arithmetic average) of the MILES column.
uber["MILES"].mean()

10.538956521739129

In [74]:
np.percentile(uber["MILES"], 25)


## The second argument (25) asks for the 25th percentile, i.e. the 1st quartile (Q1).
## Roughly 25% of the values in the MILES column fall below the returned value.

2.9

Insight : 25% of the rides cover less than 2.9 miles.

<br>

In [75]:
np.percentile(uber["MILES"], 75)

## The second argument (75) asks for the 75th percentile, i.e. the 3rd quartile (Q3).
## Roughly 75% of the values in the MILES column fall below the returned value.

10.4

Insight : 75% of the rides cover less than 10.4 miles.

<br>

In [76]:
# Inter-quartile range (IQR) of the MILES column: Q3 - Q1.
q1, q3 = np.percentile(uber["MILES"], [25, 75])
IQR = q3 - q1

IQR

7.5

In [83]:
# The 50th percentile is the median of the column.
np.percentile(uber["MILES"], 50)

6.0

> The 50th percentile is the Median value (6.0 here), not the Mean (about 10.54).

#### Lets calculate Outlier records :

In [85]:
# Upper outlier fence: Q3 + 1.5 * IQR. Q3 is recomputed from the data instead of
# being typed in as 10.4, so the fence stays correct if the rows change.
np.percentile(uber["MILES"], 75) + 1.5 * IQR

21.65

So, by the 1.5 × IQR rule, every record whose MILES value is greater than 21.65 (Q3 + 1.5 × IQR) is treated as an outlier.

The 76 rows below are our outliers.

In [86]:
# Rides above the upper fence (Q3 + 1.5 * IQR) are treated as outliers. The
# fence is derived from the data rather than hard-coded as 21.65, so it cannot
# drift out of sync with the cleaned frame.
upper_fence = np.percentile(uber["MILES"], 75) + 1.5 * IQR
uber[uber["MILES"] > upper_fence]

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE,year,month
4,2016-01-06 14:42:00,2016-01-06 15:49:00,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit,2016,1
25,2016-01-14 16:29:00,2016-01-14 17:05:00,Business,Houston,Houston,21.9,Customer Visit,2016,1
36,2016-01-20 13:25:00,2016-01-20 14:19:00,Business,Raleigh,Cary,40.2,Customer Visit,2016,1
62,2016-02-01 12:10:00,2016-02-01 12:43:00,Business,Chapel Hill,Cary,23.3,Customer Visit,2016,2
108,2016-02-16 03:21:00,2016-02-16 04:13:00,Business,Katunayaka,Unknown Location,43.7,Customer Visit,2016,2
...,...,...,...,...,...,...,...,...,...
979,2016-11-20 10:27:00,2016-11-20 11:32:00,Business,Cary,Cary,39.2,Between Offices,2016,11
1088,2016-12-21 20:56:00,2016-12-21 23:42:00,Business,Rawalpindi,Unknown Location,103.0,Meeting,2016,12
1089,2016-12-22 15:40:00,2016-12-22 16:38:00,Business,Unknown Location,Unknown Location,32.3,Meeting,2016,12
1092,2016-12-22 17:56:00,2016-12-22 18:29:00,Business,Unknown Location,Unknown Location,23.2,Meeting,2016,12
