# Py training w/ Ride share data

## **Pandas**

In [3]:
import pandas as pd

### Getting data

In [5]:
drivers = pd.read_csv(r"drivers.csv")
ratings = pd.read_csv(r"ratings.csv")
rides = pd.read_csv(r"rides.csv")
users = pd.read_csv(r"users.csv")
vehicles = pd.read_csv(r"vehicles.csv")

In [6]:
drivers.head()

Unnamed: 0,driver_id,name,vehicle_id,rating,total_rides,available
0,1,Virginia Johnson,284,3.44,30,True
1,2,Daniel Maddox,18,2.59,61,False
2,3,Patricia Salazar,298,4.93,75,True
3,4,Cameron Hinton DDS,49,2.59,12,True
4,5,Jennifer Ayala,138,4.69,71,True


In [7]:
ratings.head()

Unnamed: 0,rating_id,ride_id,user_id,rating_value,comments,rating_date
0,1,1,3046,3,Million onto whom everything.,2024-07-18 23:43:41
1,2,2,3352,5,,2024-04-15 07:00:17
2,3,3,658,4,Tonight another goal lead.,2024-04-21 20:25:52
3,4,4,7893,3,,2024-03-22 21:24:44
4,5,5,2466,4,,2024-01-11 21:13:34


In [8]:
rides.head()

Unnamed: 0,ride_id,user_id,start_location,end_location,ride_start_time,ride_end_time,distance_km,fare_amount,driver_id
0,1,4067,Larryborough,Norrisborough,2024-04-30 10:31:56,2024-04-30 11:05:56,2.03,3.33,145
1,2,9160,Williamston,Robertburgh,2024-09-11 12:07:26,2024-09-11 13:34:26,13.66,30.03,244
2,3,1990,Howeshire,Christopherland,2024-04-30 10:21:25,2024-04-30 10:40:25,5.5,15.67,234
3,4,1268,North Annton,East Heather,2024-04-12 04:06:58,2024-04-12 05:07:58,31.74,53.9,145
4,5,7296,South Joseph,Lake Samuelberg,2024-04-26 16:45:45,2024-04-26 18:31:45,13.54,38.4,39


In [9]:
users.head()

Unnamed: 0,user_id,name,email,phone_number,registration_date,age,gender,location
0,1,Tammy Jones,trevinojacqueline@example.net,530-318-6905x290,2024-07-29 12:00:05,21,Other,New Alanside
1,2,Tyler Mathews,tim47@example.com,(222)541-3246x834,2021-10-04 14:21:09,53,Male,New Virginiabury
2,3,Melissa Robertson,scott64@example.org,309-644-3534x739,2022-10-07 04:46:44,57,Male,Ethanmouth
3,4,Heather Wolf,rushandrew@example.net,998-772-1220x9334,2022-07-10 14:01:22,50,Male,South Theresaland
4,5,Daniel Daniel,robert13@example.org,+1-992-853-1435,2020-05-21 04:37:46,32,Female,New James


In [10]:
vehicles.head()

Unnamed: 0,vehicle_id,make,model,year,capacity
0,1,"Smith, Williams and Huff",positive,2009,3
1,2,"Silva, Miller and Townsend",glass,2003,6
2,3,Mitchell LLC,north,2005,7
3,4,Hill LLC,leave,2016,7
4,5,Smith Group,glass,2011,6


### Handling missing values

In [12]:
drivers.isnull().sum()

driver_id      0
name           0
vehicle_id     0
rating         0
total_rides    0
available      0
dtype: int64

In [13]:
ratings.isnull().sum()

rating_id           0
ride_id             0
user_id             0
rating_value        0
comments        25103
rating_date         0
dtype: int64

In [14]:
rides.isnull().sum()

ride_id            0
user_id            0
start_location     0
end_location       0
ride_start_time    0
ride_end_time      0
distance_km        0
fare_amount        0
driver_id          0
dtype: int64

In [15]:
users.isnull().sum()

user_id              0
name                 0
email                0
phone_number         0
registration_date    0
age                  0
gender               0
location             0
dtype: int64

In [16]:
vehicles.isnull().sum()

vehicle_id    0
make          0
model         0
year          0
capacity      0
dtype: int64

If there was missing values (example): <br/>
df1 = df.dropna(subset=['CREDIT_LIMIT’]) <br/>
df2['MINIMUM_PAYMENTS’] = df['MINIMUM_PAYMENTS'].fillna(0)

### Handling duplicates

In [19]:
drivers.duplicated().sum()

0

In [20]:
drivers['driver_id'].drop_duplicates(inplace = True)

**Efficient way**

In [22]:
dfs = [drivers, ratings, rides, users, vehicles]

In [23]:
duplicate_counts = [df.duplicated().sum() for df in dfs]
duplicate_counts

[0, 0, 0, 0, 0]

In [24]:
duplicate_counts = list(map(lambda df: df.duplicated().sum(), dfs))
duplicate_counts

[0, 0, 0, 0, 0]

`lambda x: fn` <br/>
`map(fn, iterable)` | applies fn to each element in iterable (list or tuple) <br/>
`list(...)` converts the result from map() into a list.

To drop duplicates use `df.drop_duplicates()` or `df['col'].drop_duplicates()`

### Data Transformation | Categorization

In [28]:
drivers.loc[drivers['rating'] >= 4, 'Status'] = "Elite"

In [29]:
drivers.head()

Unnamed: 0,driver_id,name,vehicle_id,rating,total_rides,available,Status
0,1,Virginia Johnson,284,3.44,30,True,
1,2,Daniel Maddox,18,2.59,61,False,
2,3,Patricia Salazar,298,4.93,75,True,Elite
3,4,Cameron Hinton DDS,49,2.59,12,True,
4,5,Jennifer Ayala,138,4.69,71,True,Elite


In [30]:
drivers.loc[drivers['rating'] <= 2, 'Status'] = "Poor"

In [31]:
drivers.head(10)

Unnamed: 0,driver_id,name,vehicle_id,rating,total_rides,available,Status
0,1,Virginia Johnson,284,3.44,30,True,
1,2,Daniel Maddox,18,2.59,61,False,
2,3,Patricia Salazar,298,4.93,75,True,Elite
3,4,Cameron Hinton DDS,49,2.59,12,True,
4,5,Jennifer Ayala,138,4.69,71,True,Elite
5,6,Sandra Torres,45,3.05,164,True,
6,7,Melissa Perry,288,3.43,23,False,
7,8,Kathleen Compton,16,1.69,127,False,Poor
8,9,Jake Brown,176,2.69,117,True,
9,10,Audrey Price,152,2.42,85,False,


**handling missing data | `df['col'] = fillna(val, inplace = True)`**

In [33]:
drivers['Status'].fillna("Average", inplace = True)

In [34]:
drivers.head()

Unnamed: 0,driver_id,name,vehicle_id,rating,total_rides,available,Status
0,1,Virginia Johnson,284,3.44,30,True,
1,2,Daniel Maddox,18,2.59,61,False,
2,3,Patricia Salazar,298,4.93,75,True,Elite
3,4,Cameron Hinton DDS,49,2.59,12,True,
4,5,Jennifer Ayala,138,4.69,71,True,Elite


### Data Transformation | Calculations

#### Calculating the time of rides

In [37]:
rides_time = rides.copy()

In [38]:
#rides_time['Time'] = rides['ride_end_time'] - rides['ride_start_time']
type(rides_time)

pandas.core.frame.DataFrame

In [39]:
rides_time.dtypes

ride_id              int64
user_id              int64
start_location      object
end_location        object
ride_start_time     object
ride_end_time       object
distance_km        float64
fare_amount        float64
driver_id            int64
dtype: object

**Convert multiple columns to date time type**

In [41]:
rides_time[['ride_end_time','ride_start_time']] = rides_time[['ride_end_time','ride_start_time']].apply(pd.to_datetime)

In [42]:
rides_time.dtypes

ride_id                     int64
user_id                     int64
start_location             object
end_location               object
ride_start_time    datetime64[ns]
ride_end_time      datetime64[ns]
distance_km               float64
fare_amount               float64
driver_id                   int64
dtype: object

In [43]:
rides_time['Time'] = rides_time['ride_end_time'] - rides_time['ride_start_time']

In [44]:
rides_time.head()

Unnamed: 0,ride_id,user_id,start_location,end_location,ride_start_time,ride_end_time,distance_km,fare_amount,driver_id,Time
0,1,4067,Larryborough,Norrisborough,2024-04-30 10:31:56,2024-04-30 11:05:56,2.03,3.33,145,0 days 00:34:00
1,2,9160,Williamston,Robertburgh,2024-09-11 12:07:26,2024-09-11 13:34:26,13.66,30.03,244,0 days 01:27:00
2,3,1990,Howeshire,Christopherland,2024-04-30 10:21:25,2024-04-30 10:40:25,5.5,15.67,234,0 days 00:19:00
3,4,1268,North Annton,East Heather,2024-04-12 04:06:58,2024-04-12 05:07:58,31.74,53.9,145,0 days 01:01:00
4,5,7296,South Joseph,Lake Samuelberg,2024-04-26 16:45:45,2024-04-26 18:31:45,13.54,38.4,39,0 days 01:46:00


In [45]:
# rides_time['Time'].sort_values(ascending = False) # checking

#### Analyze each driver

In [47]:
rides_time.groupby('driver_id')['Time'].sum().head()

driver_id
1   8 days 05:23:00
2   8 days 01:43:00
3   7 days 20:05:00
4   7 days 03:24:00
5   7 days 05:34:00
Name: Time, dtype: timedelta64[ns]

In [48]:
rides_time.groupby('driver_id')['Time'].sum().reset_index().head()

Unnamed: 0,driver_id,Time
0,1,8 days 05:23:00
1,2,8 days 01:43:00
2,3,7 days 20:05:00
3,4,7 days 03:24:00
4,5,7 days 05:34:00


In [49]:
rides_time.groupby('driver_id')['fare_amount'].mean().reset_index().sort_values(by = 'fare_amount', ascending = False).head()

Unnamed: 0,driver_id,fare_amount
164,165,65.98994
54,55,65.453357
32,33,63.633315
36,37,63.553099
291,292,63.353756


In [50]:
rides_time.groupby('driver_id')['fare_amount'].agg(['mean', 'sum']).sort_values(by = 'mean', ascending = False).head()

Unnamed: 0_level_0,mean,sum
driver_id,Unnamed: 1_level_1,Unnamed: 2_level_1
165,65.98994,11020.32
55,65.453357,9359.83
33,63.633315,11708.53
37,63.553099,9024.54
292,63.353756,12480.69


In [51]:
rides_time.groupby('driver_id')['fare_amount'].agg(['mean', 'sum']).sort_values(by = 'sum', ascending = False).head()

Unnamed: 0_level_0,mean,sum
driver_id,Unnamed: 1_level_1,Unnamed: 2_level_1
292,63.353756,12480.69
33,63.633315,11708.53
85,61.225737,11632.89
95,60.812526,11554.38
238,61.111058,11549.99


In [52]:
rides_grouped = rides_time.groupby('driver_id').agg(
    total_time = ('Time', 'sum'),
    average_fare = ('fare_amount', 'mean')
).reset_index()

In [53]:
rides_grouped.sort_values(by = 'total_time', ascending = False).head()

Unnamed: 0,driver_id,total_time,average_fare
148,149,8 days 20:48:00,56.08701
14,15,8 days 19:36:00,56.030876
291,292,8 days 19:12:00,63.353756
94,95,8 days 15:46:00,60.812526
170,171,8 days 14:58:00,56.554462


### Categorization Efficient way | with `lambda` and `apply()`

In [55]:
drivers = pd.read_csv(r"drivers.csv")

In [56]:
drivers.head()

Unnamed: 0,driver_id,name,vehicle_id,rating,total_rides,available
0,1,Virginia Johnson,284,3.44,30,True
1,2,Daniel Maddox,18,2.59,61,False
2,3,Patricia Salazar,298,4.93,75,True
3,4,Cameron Hinton DDS,49,2.59,12,True
4,5,Jennifer Ayala,138,4.69,71,True


**fn = lambda**

In [58]:
cat_status = lambda x: "Elite" if x >= 4 else "Poor" if x<=2 else "Average"

**.apply(fn)**

In [60]:
drivers['Status'] = drivers['rating'].apply(cat_status)

In [61]:
drivers.head()

Unnamed: 0,driver_id,name,vehicle_id,rating,total_rides,available,Status
0,1,Virginia Johnson,284,3.44,30,True,Average
1,2,Daniel Maddox,18,2.59,61,False,Average
2,3,Patricia Salazar,298,4.93,75,True,Elite
3,4,Cameron Hinton DDS,49,2.59,12,True,Average
4,5,Jennifer Ayala,138,4.69,71,True,Elite


### Grouping

In [63]:
drivers.groupby(['Status']).size()

Status
Average    139
Elite       79
Poor        82
dtype: int64

In [64]:
drivers.groupby(['Status','available']).size()

Status   available
Average  False        76
         True         63
Elite    False        37
         True         42
Poor     False        42
         True         40
dtype: int64

In [65]:
rides.shape

(50000, 9)

In [66]:
rides.groupby('driver_id').size().sort_values(ascending = False).head()

driver_id
90     201
292    197
171    195
15     194
197    194
dtype: int64

### Filtering based on condition

**Find how many Elite are available**

In [69]:
elites_available = drivers.loc[(drivers['Status'] == "Elite") & (drivers['available'] == True)]
elites_available.head()

Unnamed: 0,driver_id,name,vehicle_id,rating,total_rides,available,Status
2,3,Patricia Salazar,298,4.93,75,True,Elite
4,5,Jennifer Ayala,138,4.69,71,True,Elite
21,22,Amber Lozano,48,4.89,100,True,Elite
44,45,Patricia Smith,8,4.09,84,True,Elite
52,53,Patricia Richardson,89,4.08,83,True,Elite


### Sorting

In [71]:
elites_available['total_rides'].sort_values(ascending = False).head()

90     193
69     183
220    179
251    177
289    173
Name: total_rides, dtype: int64

In [72]:
# elites_available[['rating','total_rides']].sort_values(ascending = False).head()
elites_available.sort_values(by = ['rating','total_rides'], ascending = False).head()

Unnamed: 0,driver_id,name,vehicle_id,rating,total_rides,available,Status
247,248,Joel Bryant,20,5.0,56,True,Elite
158,159,Robert Schwartz,179,4.99,21,True,Elite
99,100,Michael Chan,157,4.98,35,True,Elite
2,3,Patricia Salazar,298,4.93,75,True,Elite
286,287,Joanna West,216,4.92,64,True,Elite


In [73]:
elites_available.sort_values(by = ['total_rides', 'rating'], ascending = False).head()

Unnamed: 0,driver_id,name,vehicle_id,rating,total_rides,available,Status
90,91,Vincent Ford,195,4.82,193,True,Elite
69,70,David Chavez,206,4.1,183,True,Elite
220,221,Kristina Morgan,49,4.28,179,True,Elite
251,252,Todd Brown,93,4.83,177,True,Elite
289,290,Angela Li,23,4.9,173,True,Elite


### Ranks

In [75]:
elites_available['rating'].rank(method = 'dense', ascending = False).sort_values().head(10)

247     1.0
158     2.0
99      3.0
2       4.0
286     5.0
289     6.0
21      7.0
195     8.0
251     9.0
90     10.0
Name: rating, dtype: float64

dense No skip 1,1,2 <br/>
min skips 1,1,3 <br/>
first is row_number or sequential 1,2,3 

#### **ranks with over(order by 'col2') or over(parition by 'col' order by 'col2')**

**partition by = groupby**

In [79]:
rides_grouped_ranked = rides_time.groupby('driver_id').agg(
    total_time = ('Time', 'sum'),
    average_fare = ('fare_amount', 'mean')
).reset_index()

In [80]:
rides_grouped.head()

Unnamed: 0,driver_id,total_time,average_fare
0,1,8 days 05:23:00,56.53676
1,2,8 days 01:43:00,56.057062
2,3,7 days 20:05:00,55.732418
3,4,7 days 03:24:00,55.837561
4,5,7 days 05:34:00,56.990599


In [81]:
# rides_grouped['total_time'].rank(method = 'dense', ascending = False).reset_index(name = 'rank').sort_values(by = 'rank')

**over(order by ) = .rank(ascending = False)**

In [83]:
rides_grouped['rank'] = rides_grouped['total_time'].rank(method = 'dense', ascending = False)

In [84]:
rides_grouped.head()

Unnamed: 0,driver_id,total_time,average_fare,rank
0,1,8 days 05:23:00,56.53676,23.0
1,2,8 days 01:43:00,56.057062,31.0
2,3,7 days 20:05:00,55.732418,52.0
3,4,7 days 03:24:00,55.837561,160.0
4,5,7 days 05:34:00,56.990599,143.0


sort by rank:

In [86]:
rides_grouped.sort_values(by = 'rank').head(10)

Unnamed: 0,driver_id,total_time,average_fare,rank
148,149,8 days 20:48:00,56.08701,1.0
14,15,8 days 19:36:00,56.030876,2.0
291,292,8 days 19:12:00,63.353756,3.0
94,95,8 days 15:46:00,60.812526,4.0
170,171,8 days 14:58:00,56.554462,5.0
246,247,8 days 13:00:00,54.153867,6.0
81,82,8 days 12:01:00,59.700973,7.0
224,225,8 days 11:43:00,53.596223,8.0
282,283,8 days 11:33:00,58.090314,9.0
89,90,8 days 11:33:00,57.264428,9.0


**to view top 10**

In [88]:
rides_grouped[rides_grouped['rank']<=10].sort_values(by ='rank')

Unnamed: 0,driver_id,total_time,average_fare,rank
148,149,8 days 20:48:00,56.08701,1.0
14,15,8 days 19:36:00,56.030876,2.0
291,292,8 days 19:12:00,63.353756,3.0
94,95,8 days 15:46:00,60.812526,4.0
170,171,8 days 14:58:00,56.554462,5.0
246,247,8 days 13:00:00,54.153867,6.0
81,82,8 days 12:01:00,59.700973,7.0
224,225,8 days 11:43:00,53.596223,8.0
89,90,8 days 11:33:00,57.264428,9.0
282,283,8 days 11:33:00,58.090314,9.0


### Joins

In [90]:
drivers.shape

(300, 7)

In [91]:
vehicles.shape

(300, 5)

#### 1. `merge()` (SQL-like Joins)

In [93]:
drivers_vehicles = pd.merge(drivers, vehicles, how = "left", on = 'vehicle_id')

In [94]:
drivers_vehicles.head(10)

Unnamed: 0,driver_id,name,vehicle_id,rating,total_rides,available,Status,make,model,year,capacity
0,1,Virginia Johnson,284,3.44,30,True,Average,"Anderson, Miller and Fowler",road,2019,3
1,2,Daniel Maddox,18,2.59,61,False,Average,"Jones, Rice and Hill",three,2003,3
2,3,Patricia Salazar,298,4.93,75,True,Elite,Hansen Ltd,brother,2014,4
3,4,Cameron Hinton DDS,49,2.59,12,True,Average,Brooks LLC,time,2003,4
4,5,Jennifer Ayala,138,4.69,71,True,Elite,"Navarro, Lucas and Potts",cut,2009,2
5,6,Sandra Torres,45,3.05,164,True,Average,Martinez-Case,organization,2009,7
6,7,Melissa Perry,288,3.43,23,False,Average,Bowers-Stone,brother,2014,4
7,8,Kathleen Compton,16,1.69,127,False,Poor,Pennington-Sanchez,discover,2007,3
8,9,Jake Brown,176,2.69,117,True,Average,"Baxter, Mathews and Pace",ball,2012,7
9,10,Audrey Price,152,2.42,85,False,Average,Patton-Cervantes,past,2019,4


#### 2. `concat()` (Stacking Datasets)

In [96]:
# Stack datasets vertically (like UNION in SQL)
df_concat = pd.concat([drivers, vehicles], axis=0)  # axis=0 → Rows
df_concat.tail()

Unnamed: 0,driver_id,name,vehicle_id,rating,total_rides,available,Status,make,model,year,capacity
295,,,296,,,,,"Thomas, Martin and Williams",above,2015.0,7.0
296,,,297,,,,,Brennan Inc,traditional,2020.0,5.0
297,,,298,,,,,Hansen Ltd,brother,2014.0,4.0
298,,,299,,,,,Garcia-Marsh,drug,2003.0,7.0
299,,,300,,,,,"Pena, Stone and Tapia",director,2008.0,4.0


In [97]:
# Stack datasets vertically (like UNION in SQL)
df_concat = pd.concat([drivers, vehicles], axis=1)  # axis=1 → Columns
df_concat.head(10)

Unnamed: 0,driver_id,name,vehicle_id,rating,total_rides,available,Status,vehicle_id.1,make,model,year,capacity
0,1,Virginia Johnson,284,3.44,30,True,Average,1,"Smith, Williams and Huff",positive,2009,3
1,2,Daniel Maddox,18,2.59,61,False,Average,2,"Silva, Miller and Townsend",glass,2003,6
2,3,Patricia Salazar,298,4.93,75,True,Elite,3,Mitchell LLC,north,2005,7
3,4,Cameron Hinton DDS,49,2.59,12,True,Average,4,Hill LLC,leave,2016,7
4,5,Jennifer Ayala,138,4.69,71,True,Elite,5,Smith Group,glass,2011,6
5,6,Sandra Torres,45,3.05,164,True,Average,6,Gilbert Group,city,2022,2
6,7,Melissa Perry,288,3.43,23,False,Average,7,"Cook, Gregory and Smith",later,2018,4
7,8,Kathleen Compton,16,1.69,127,False,Poor,8,"Lopez, Tucker and Perry",just,2022,4
8,9,Jake Brown,176,2.69,117,True,Average,9,Whitaker-Munoz,specific,2008,6
9,10,Audrey Price,152,2.42,85,False,Average,10,"Anderson, James and Dominguez",contain,2005,6


### Transpose

In [99]:
elites_available.T

Unnamed: 0,2,4,21,44,52,69,72,74,85,86,...,223,226,228,247,251,258,269,286,289,290
driver_id,3,5,22,45,53,70,73,75,86,87,...,224,227,229,248,252,259,270,287,290,291
name,Patricia Salazar,Jennifer Ayala,Amber Lozano,Patricia Smith,Patricia Richardson,David Chavez,Krista Burke,Jessica Curry,Shaun Gardner,Tyler Knapp,...,Joe Henderson,Susan Maxwell,Lance Mcconnell,Joel Bryant,Todd Brown,Chad Hughes,Stephanie Villarreal,Joanna West,Angela Li,Natasha Ramirez
vehicle_id,298,138,48,8,89,206,203,132,3,276,...,221,77,217,20,93,103,114,216,23,241
rating,4.93,4.69,4.89,4.09,4.08,4.1,4.52,4.78,4.64,4.76,...,4.16,4.68,4.22,5.0,4.83,4.27,4.79,4.92,4.9,4.74
total_rides,75,71,100,84,83,183,131,148,44,126,...,123,48,52,56,177,84,127,64,173,137
available,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
Status,Elite,Elite,Elite,Elite,Elite,Elite,Elite,Elite,Elite,Elite,...,Elite,Elite,Elite,Elite,Elite,Elite,Elite,Elite,Elite,Elite


### Finding index

In [101]:
# Find the row index where vehicle_id is 298
index = drivers_vehicles[drivers_vehicles['vehicle_id'] == 298].index

print("Index of value 298 in vehicle_id column:", index)

Index of value 298 in vehicle_id column: Index([2], dtype='int64')


In [102]:
# Find the row index where driver_id is 2
index = rides[rides['driver_id'] == 2].index

print("Index of value 2 in driver_id column:", index)

Index of value 2 in driver_id column: Index([  322,   336,   477,   942,  1085,  1229,  1692,  1804,  2206,  2242,
       ...
       47372, 47537, 47598, 47703, 48061, 48199, 48765, 49002, 49105, 49789],
      dtype='int64', length=177)


**count = (condition.sum())**

In [104]:
# Count the number of occurrences of 298 in driver_id column
count_298 = (rides['driver_id'] == 298).sum()
count_298

151

## **NumPy**

In [106]:
import numpy as np

In [107]:
drivers2 = pd.read_csv(r"drivers.csv")

### Data transformation

In [109]:
drivers2['Elite'] = np.where(drivers2['rating']>=4, "Elite", "Not Elite")

In [110]:
drivers2.head()

Unnamed: 0,driver_id,name,vehicle_id,rating,total_rides,available,Elite
0,1,Virginia Johnson,284,3.44,30,True,Not Elite
1,2,Daniel Maddox,18,2.59,61,False,Not Elite
2,3,Patricia Salazar,298,4.93,75,True,Elite
3,4,Cameron Hinton DDS,49,2.59,12,True,Not Elite
4,5,Jennifer Ayala,138,4.69,71,True,Elite


### Reverse a list

#### Getting a col as a list

In [113]:
col_list = drivers2['vehicle_id'].tolist()

In [114]:
#col_list

In [115]:
len(col_list)

300

**to get unique values**

In [117]:
u_col_list = drivers2['vehicle_id'].unique().tolist()

In [118]:
len(u_col_list)

189

**to view top 5 in a list**

In [120]:
u_col_list[:5]

[284, 18, 298, 49, 138]

**Frequency of unique values in a column**

In [122]:
drivers2['vehicle_id'].value_counts().head(10)

vehicle_id
138    5
37     5
274    5
89     4
132    4
49     4
8      3
91     3
289    3
115    3
Name: count, dtype: int64

- Returns: A Series where the index represents the unique values and the values are the counts.
- By default, it sorts the values in descending order of frequency.


**top 3 values in a list**

In [125]:
drivers2['vehicle_id'].value_counts().head(3)

vehicle_id
138    5
37     5
274    5
Name: count, dtype: int64

better to use rank!

#### REVERSING A LIST

In [128]:
u_col_list[:10]

[284, 18, 298, 49, 138, 45, 288, 16, 176, 152]

In [129]:
u_col_list[:10][::-1]

[152, 176, 16, 288, 45, 138, 49, 298, 18, 284]

### Convert a list to array

In [131]:
# Convert list to numpy array
arr = np.array(u_col_list)

In [132]:
type(arr), type(u_col_list)

(numpy.ndarray, list)

place cursor over a function or code then press Shift + Tab to get info or help

#### reverse an array

In [135]:
arr

array([284,  18, 298,  49, 138,  45, 288,  16, 176, 152, 269,  84,  23,
       169, 287,  80, 142, 293,   9,  48, 290,  37, 226, 171, 286, 289,
       115, 275, 167, 141, 236, 232,  98, 274, 228, 220, 175, 184,   8,
        95, 292, 180, 129,  81, 207, 148,  89, 159, 145, 224, 210, 217,
       283, 265, 105, 201,  67, 262, 206, 110, 203, 102, 132,  92, 257,
       215, 126, 166, 173, 144, 294,  70,   3, 276, 270, 172, 195, 106,
       240,  74, 229, 157,  58, 165, 253, 296, 247, 252,  29, 261, 187,
       264, 219,  33, 134, 272, 251, 198, 208,  39, 237,  79, 256,  52,
       127,   2, 221, 271, 279,  85, 235,  19, 204,   6,  40,  44, 174,
        72, 143, 227,  68,  28,  91, 179, 188, 266,  21, 194,  90,  36,
       242, 238, 190, 300, 151,  82, 140, 100, 119,  59, 123,  66,  12,
       114,  69, 101, 230, 146, 111, 168, 222, 113,  17,  88, 211,  97,
       149, 177,  77, 133, 255, 233, 199, 248,  14,  20,  99,  93, 153,
        51, 108, 103,  42, 186,  63,  10,  30, 260,  38, 121, 25

In [136]:
arr[::-1]

array([282,  22, 109,  73, 241, 161, 216,  26, 254, 121,  38, 260,  30,
        10,  63, 186,  42, 103, 108,  51, 153,  93,  99,  20,  14, 248,
       199, 233, 255, 133,  77, 177, 149,  97, 211,  88,  17, 113, 222,
       168, 111, 146, 230, 101,  69, 114,  12,  66, 123,  59, 119, 100,
       140,  82, 151, 300, 190, 238, 242,  36,  90, 194,  21, 266, 188,
       179,  91,  28,  68, 227, 143,  72, 174,  44,  40,   6, 204,  19,
       235,  85, 279, 271, 221,   2, 127,  52, 256,  79, 237,  39, 208,
       198, 251, 272, 134,  33, 219, 264, 187, 261,  29, 252, 247, 296,
       253, 165,  58, 157, 229,  74, 240, 106, 195, 172, 270, 276,   3,
        70, 294, 144, 173, 166, 126, 215, 257,  92, 132, 102, 203, 110,
       206, 262,  67, 201, 105, 265, 283, 217, 210, 224, 145, 159,  89,
       148, 207,  81, 129, 180, 292,  95,   8, 184, 175, 220, 228, 274,
        98, 232, 236, 141, 167, 275, 115, 289, 286, 171, 226,  37, 290,
        48,   9, 293, 142,  80, 287, 169,  23,  84, 269, 152, 17

### Array Indexing and Slicing

In [138]:
arr[-2] # gives 2nd last

22

In [139]:
arr[3:6] # 3 index inclusive and 6 index exclusive so index 3,4,5

array([ 49, 138,  45])

### Array Reshaping

In [141]:
arr.shape

(189,)

Shape (189,): A **1-dimensional array** with 189 elements.

In [143]:
type(arr)

numpy.ndarray

In [144]:
#reshaped_arr = arr.reshape(1, 3)  # Reshape to 3x3 matrix
#print(reshaped_arr)

For other shapes: Ensure that the product of the new shape's dimensions equals 189. For example:

In [146]:
reshaped_arr = arr.reshape(1, 189)

In [147]:
reshaped_arr.shape

(1, 189)

Shape **(1, 189)** is a **row vector** (a 2D array with 1 row and 189 columns).

In [149]:
reshaped_arr2 = arr.reshape(189,1)

In [150]:
reshaped_arr2.shape

(189, 1)

Shape **(189, 1)** is a **column vector** (a 2D array with 189 rows and 1 column).

### Converting to vectors
.reshape(R,C) <BR/>
arr.reshape(**-1**, 1) | **column vector** | ***FIGURE OUT ROWS (-1), I WANT COLUMN AS 1 SO 1D COLUMN VECTOR (1)***

When using **-1 in .reshape()**, **NumPy will figure out the appropriate size for that dimension** based on the total number of elements in the array and the other specified dimensions.

#### Convert to a column vector

In [155]:
# Convert into a column vector (189, 1)
reshaped_arr = arr.reshape(-1, 1)

In [156]:
reshaped_arr.shape

(189, 1)

#### Convert to a row vector

In [158]:
# Convert into a row vector
reshaped_arr = arr.reshape(1, -1)

In [159]:
reshaped_arr.shape

(1, 189)

### Array Broadcasting | ex: Adding a scalar to each element

In [161]:
reshaped_arr + 10000

array([[10284, 10018, 10298, 10049, 10138, 10045, 10288, 10016, 10176,
        10152, 10269, 10084, 10023, 10169, 10287, 10080, 10142, 10293,
        10009, 10048, 10290, 10037, 10226, 10171, 10286, 10289, 10115,
        10275, 10167, 10141, 10236, 10232, 10098, 10274, 10228, 10220,
        10175, 10184, 10008, 10095, 10292, 10180, 10129, 10081, 10207,
        10148, 10089, 10159, 10145, 10224, 10210, 10217, 10283, 10265,
        10105, 10201, 10067, 10262, 10206, 10110, 10203, 10102, 10132,
        10092, 10257, 10215, 10126, 10166, 10173, 10144, 10294, 10070,
        10003, 10276, 10270, 10172, 10195, 10106, 10240, 10074, 10229,
        10157, 10058, 10165, 10253, 10296, 10247, 10252, 10029, 10261,
        10187, 10264, 10219, 10033, 10134, 10272, 10251, 10198, 10208,
        10039, 10237, 10079, 10256, 10052, 10127, 10002, 10221, 10271,
        10279, 10085, 10235, 10019, 10204, 10006, 10040, 10044, 10174,
        10072, 10143, 10227, 10068, 10028, 10091, 10179, 10188, 10266,
      

### Matrix Multiplication

In [163]:
# Create two 2D arrays (matrices)
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

In [164]:
print(A)

[[1 2]
 [3 4]]


In [165]:
print(B)

[[5 6]
 [7 8]]


In [166]:
# Matrix multiplication using np.dot() or @
result = np.dot(A, B)
print(result)

[[19 22]
 [43 50]]


### Find min and max | values and index

In [168]:
# Find min and max values
min_val = np.min(arr)
max_val = np.max(arr)

In [169]:
# Find their indices
min_idx = np.argmin(arr)
max_idx = np.argmax(arr)

In [170]:
print("Min value:", min_val, "at index", min_idx)
print("Max value:", max_val, "at index", max_idx)

Min value: 2 at index 105
Max value: 300 at index 133


### Sum

In [172]:
# Sum of all elements
sum_val = np.sum(arr)
print("Sum:", sum_val)

Sum: 28946


### Element-wise operator

In [174]:
arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])

# Element-wise addition
sum_arr = arr1 + arr2
print("Element-wise addition:", sum_arr)

# Element-wise multiplication
mul_arr = arr1 * arr2
print("Element-wise multiplication:", mul_arr)

Element-wise addition: [5 7 9]
Element-wise multiplication: [ 4 10 18]


### Find unique elements

In [176]:
# Find unique elements
unique_elements = np.unique(arr)
print("Unique elements:", unique_elements)

Unique elements: [  2   3   6   8   9  10  12  14  16  17  18  19  20  21  22  23  26  28
  29  30  33  36  37  38  39  40  42  44  45  48  49  51  52  58  59  63
  66  67  68  69  70  72  73  74  77  79  80  81  82  84  85  88  89  90
  91  92  93  95  97  98  99 100 101 102 103 105 106 108 109 110 111 113
 114 115 119 121 123 126 127 129 132 133 134 138 140 141 142 143 144 145
 146 148 149 151 152 153 157 159 161 165 166 167 168 169 171 172 173 174
 175 176 177 179 180 184 186 187 188 190 194 195 198 199 201 203 204 206
 207 208 210 211 215 216 217 219 220 221 222 224 226 227 228 229 230 232
 233 235 236 237 238 240 241 242 247 248 251 252 253 254 255 256 257 260
 261 262 264 265 266 269 270 271 272 274 275 276 279 282 283 284 286 287
 288 289 290 292 293 294 296 298 300]


### Sorting 

In [178]:
# Sort array
sorted_arr = np.sort(arr)
print("Sorted array:", sorted_arr)

Sorted array: [  2   3   6   8   9  10  12  14  16  17  18  19  20  21  22  23  26  28
  29  30  33  36  37  38  39  40  42  44  45  48  49  51  52  58  59  63
  66  67  68  69  70  72  73  74  77  79  80  81  82  84  85  88  89  90
  91  92  93  95  97  98  99 100 101 102 103 105 106 108 109 110 111 113
 114 115 119 121 123 126 127 129 132 133 134 138 140 141 142 143 144 145
 146 148 149 151 152 153 157 159 161 165 166 167 168 169 171 172 173 174
 175 176 177 179 180 184 186 187 188 190 194 195 198 199 201 203 204 206
 207 208 210 211 215 216 217 219 220 221 222 224 226 227 228 229 230 232
 233 235 236 237 238 240 241 242 247 248 251 252 253 254 255 256 257 260
 261 262 264 265 266 269 270 271 272 274 275 276 279 282 283 284 286 287
 288 289 290 292 293 294 296 298 300]


### Zeros and Ones

In [180]:
# Create an array of zeros
zeros_arr = np.zeros((3, 3))
print("Zeros array:\n", zeros_arr)

# Create an array of ones
ones_arr = np.ones((2, 4))
print("Ones array:\n", ones_arr)

Zeros array:
 [[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
Ones array:
 [[1. 1. 1. 1.]
 [1. 1. 1. 1.]]


### Finding the location of a value in an array

In [182]:
# Find the index of the value 30
index = np.where(arr == 30)
print("Index of value 30:", index)

Index of value 30: (array([176], dtype=int64),)


In [183]:
# Find the index of the value 30
index = np.where(arr == 1)
print("Index of value 30:", index)

Index of value 30: (array([], dtype=int64),)
