# 3. Importing and Exporting Data

## Exercise 15 - Weird taxi rides

In [130]:
# load in CSV, specify select columns
taxi = pd.read_csv(r'D:\Documents\Data Analysis\Datasets\pandas-workout-data\data\nyc_taxi_2019-01.csv',
                  usecols=['passenger_count', 'trip_distance', 'total_amount', 'payment_type'])
taxi.head()

Unnamed: 0,passenger_count,trip_distance,payment_type,total_amount
0,1,1.5,1,9.95
1,1,2.6,1,16.3
2,3,0.0,1,5.8
3,5,0.0,2,7.55
4,5,0.0,2,55.55


In [131]:
# how many rides > 8 passengers?
taxi.loc[taxi['passenger_count'] > 8, 'passenger_count'].count()

9

In [132]:
# how many rides w/ 0 passengers?
taxi.loc[taxi['passenger_count']==0, 'passenger_count'].count()

117381

In [133]:
#  how many paid in cash that cost over 1,000?
taxi.loc[(taxi['payment_type']==2) & (taxi['total_amount'] > 1000), 'passenger_count'].count()

5

In [134]:
# how many cost negative dollars (potential refunds)
taxi.loc[taxi['total_amount'] < 0, 'passenger_count'].count()

7131

In [135]:
# how many below avg distance, but cost more than avg
taxi.loc[(taxi['trip_distance'] < taxi['trip_distance'].mean()) & (taxi['total_amount'] > taxi['total_amount'].mean()), 'passenger_count'].count()

411255

### Exercise 15b

In [137]:
# repeat exercises using query instead of loc
print(taxi.query('passenger_count>8')['passenger_count'].count())

print(taxi.query('passenger_count==0')['passenger_count'].count())

print(taxi.query('payment_type==2 & total_amount > 1000')['passenger_count'].count())

print(taxi.query('total_amount < 0')['passenger_count'].count())

print(taxi.query('trip_distance < trip_distance.mean() & total_amount > total_amount.mean()')['passenger_count'].count())

9
117381
5
7131
411255


In [138]:
# amount < 0 with payment type 4 or 6
taxi.loc[(taxi['total_amount'] < 0) & (taxi['payment_type'].isin([4,6])), 'passenger_count'].count()

2666

In [139]:
# normalized percents for payment types 1 and 2 only
taxi['payment_type'].value_counts(normalize=True)[[1,2]]

payment_type
1   0.72
2   0.28
Name: proportion, dtype: float64

## Exercise 16 - Pandemic taxis

In [141]:
# read in CSV, specific columns, specify datatypes that don't result in data loss
july19 = pd.read_csv(r'D:\Documents\Data Analysis\Datasets\pandas-workout-data\data\nyc_taxi_2019-07.csv',
                     usecols=['passenger_count', 'total_amount', 'payment_type'],
                     dtype = {'passenger_count': np.float32,
                              'total_amount': np.float32,
                              'payment_type': np.float32})

# add year column
july19['year'] = 2019

july19.head()

Unnamed: 0,passenger_count,payment_type,total_amount,year
0,1.0,1.0,4.94,2019
1,1.0,2.0,20.3,2019
2,1.0,1.0,70.67,2019
3,1.0,1.0,66.36,2019
4,0.0,1.0,15.3,2019


In [142]:
# Read in CSV, specify columns
july20 = pd.read_csv(r'D:\Documents\Data Analysis\Datasets\pandas-workout-data\data\nyc_taxi_2020-07.csv',
                  usecols=['passenger_count', 'total_amount', 'payment_type'])

# add year column
july20['year'] = 2020

july20.head()

Unnamed: 0,passenger_count,payment_type,total_amount,year
0,1.0,2.0,9.3,2020
1,1.0,1.0,27.8,2020
2,1.0,2.0,22.3,2020
3,1.0,1.0,14.16,2020
4,1.0,2.0,7.8,2020


In [143]:
# compare total ride count
# size captures too many values as it is rows * columns, must use len()
print(len(july19))
print(len(july20))

6310419
800412


In [144]:
# compare proportion
print(len(july19)/len(july20))

7.883963508792972


In [145]:
# compare absolute different
len(july19) - len(july20)

5510007

In [146]:
# compare total money collected
print(july19['total_amount'].sum())
print(july20['total_amount'].sum())

# compare difference between years
july19['total_amount'].sum() - july20['total_amount'].sum()

123761816.0
14912844.090000005


108848971.91

In [147]:
# difference in proportion of rides with more than 1 passenger
print(july19.loc[july19['passenger_count'] > 1, 'passenger_count'].count() / july19['passenger_count'].count())
print(july20.loc[july20['passenger_count'] > 1, 'passenger_count'].count() / july20['passenger_count'].count())

0.2833900000955953
0.2061513222563435


In [148]:
# did people use cash less? (payment type 2)
print(july19['payment_type'].value_counts(normalize=True)[2])
print(july20['payment_type'].value_counts(normalize=True)[2])

0.2870595845428793
0.320558865998251


### Exercise 16b

In [150]:
# numeric correlations
joined = pd.concat([july19, july20])
joined.corr()

Unnamed: 0,passenger_count,payment_type,total_amount,year
passenger_count,1.0,0.02,0.01,-0.05
payment_type,0.02,1.0,-0.14,0.03
total_amount,0.01,-0.14,1.0,-0.02
year,-0.05,0.03,-0.02,1.0


In [151]:
# difference in total amount descriptive stats, round to 2 decimals
(joined.loc[joined['year']==2020, 'total_amount'].describe() - joined.loc[joined['year']==2019, 'total_amount'].describe()).round(2)

count   -5,510,007.00
mean            -0.98
std             -0.75
min             53.20
25%             -0.50
50%             -0.60
75%             -0.75
max         -4,672.45
Name: total_amount, dtype: float64

In [152]:
# difference in proportion of 0 passenger trips (likely deliveries)
print(joined.loc[joined['year']==2019, 'passenger_count'].value_counts(normalize=True)[0])
print(joined.loc[joined['year']==2020, 'passenger_count'].value_counts(normalize=True)[0])

0.018622599363335383
0.026446482682882185


## Exercise 17 - Setting column types

In [154]:
# read large CSV, specific columns, chunk 1000 rows at a time
jan20 = pd.read_csv(r'D:\Documents\Data Analysis\Datasets\pandas-workout-data\data\nyc_taxi_2020-01.csv',
                    usecols=['passenger_count', 'total_amount', 'payment_type'],
                    chunksize=1000)

In [155]:
# read chunk to get a general idea of the data
chunk = jan20.get_chunk()
chunk

Unnamed: 0,passenger_count,payment_type,total_amount
0,1,1,11.27
1,1,1,12.30
2,1,1,10.80
3,1,1,8.16
4,1,2,4.80
...,...,...,...
995,2,1,11.00
996,2,2,29.30
997,2,1,13.80
998,1,1,12.35


In [156]:
# efficiently read in full CSV, specific columns, specify initial data types
jan20 = pd.read_csv(r'D:\Documents\Data Analysis\Datasets\pandas-workout-data\data\nyc_taxi_2020-01.csv',
                    usecols=['passenger_count', 'total_amount', 'payment_type'],
                    dtype = {'passenger_count': np.float32,
                             'payment_type': np.float32,
                             'total_amount': np.float32})
jan20

Unnamed: 0,passenger_count,payment_type,total_amount
0,1.00,1.00,11.27
1,1.00,1.00,12.30
2,1.00,1.00,10.80
3,1.00,1.00,8.16
4,1.00,2.00,4.80
...,...,...,...
6405003,,,21.14
6405004,,,62.46
6405005,,,51.90
6405006,,,30.22


In [157]:
# view info
jan20.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6405008 entries, 0 to 6405007
Data columns (total 3 columns):
 #   Column           Dtype  
---  ------           -----  
 0   passenger_count  float32
 1   payment_type     float32
 2   total_amount     float32
dtypes: float32(3)
memory usage: 73.3 MB


In [158]:
# count of NA rows
jan20.isna().sum()

passenger_count    65441
payment_type       65441
total_amount           0
dtype: int64

In [159]:
# remove rows with missing values
jan20 = jan20.dropna().copy()

# new count of NA rows
jan20.isna().sum()

passenger_count    0
payment_type       0
total_amount       0
dtype: int64

In [160]:
# view dtypes and describe
print(jan20.dtypes)
jan20.describe()

passenger_count    float32
payment_type       float32
total_amount       float32
dtype: object


Unnamed: 0,passenger_count,payment_type,total_amount
count,6339567.0,6339567.0,6339567.0
mean,1.52,1.27,18.47
std,1.17,0.48,14.53
min,0.0,1.0,-1242.3
25%,1.0,1.0,11.16
50%,1.0,1.0,14.16
75%,2.0,2.0,19.56
max,9.0,5.0,4268.3


In [161]:
# set to smallest dtype possible while ensuring data is not lost

# assertions are one way to manually to ensure the new data matches the old
assert (jan20['passenger_count'] == jan20['passenger_count'].astype(np.int8)).all()
assert (jan20['payment_type'] == jan20['payment_type'].astype(np.int8)).all()
assert (jan20['total_amount'] == jan20['total_amount'].astype(np.float32)).all()

In [162]:
# instead, I will use to_numeric to automatically downcast to smallest dtype
jan20['passenger_count'] = pd.to_numeric(jan20['passenger_count'], downcast='integer')
jan20['payment_type'] = pd.to_numeric(jan20['payment_type'], downcast='integer')
jan20['total_amount'] = pd.to_numeric(jan20['total_amount'], downcast='float')

In [163]:
# view updated dtypes
jan20.dtypes

passenger_count       int8
payment_type          int8
total_amount       float32
dtype: object

In [164]:
# final updated df
jan20

Unnamed: 0,passenger_count,payment_type,total_amount
0,1,1,11.27
1,1,1,12.30
2,1,1,10.80
3,1,1,8.16
4,1,2,4.80
...,...,...,...
6339562,1,1,17.76
6339563,1,1,20.16
6339564,1,1,19.56
6339565,1,2,12.30


### Exercise 17b

In [166]:
# can we find appropriate dtypes for 4 more columns or more cleaning needs to be done?
morecols = pd.read_csv(r'D:\Documents\Data Analysis\Datasets\pandas-workout-data\data\nyc_taxi_2020-01.csv',
                    usecols=['VendorID', 'trip_distance', 'tip_amount', 'total_amount'],
                    chunksize=1000)

In [167]:
chunk = morecols.get_chunk()
chunk

Unnamed: 0,VendorID,trip_distance,tip_amount,total_amount
0,1,1.20,1.47,11.27
1,1,1.20,1.50,12.30
2,1,0.60,1.00,10.80
3,1,0.80,1.36,8.16
4,2,0.00,0.00,4.80
...,...,...,...,...
995,2,0.62,2.20,11.00
996,2,7.09,0.00,29.30
997,2,2.59,0.00,13.80
998,1,0.80,2.05,12.35


In [168]:
morecols = pd.read_csv(r'D:\Documents\Data Analysis\Datasets\pandas-workout-data\data\nyc_taxi_2020-01.csv',
                    usecols=['VendorID', 'trip_distance', 'tip_amount', 'total_amount'],
                    dtype={'VendorID': np.float32,
                           'trip_distance': np.float32,
                           'tip_amount': np.float32,
                           'total_amount': np.float32})
morecols

Unnamed: 0,VendorID,trip_distance,tip_amount,total_amount
0,1.00,1.20,1.47,11.27
1,1.00,1.20,1.50,12.30
2,1.00,0.60,1.00,10.80
3,1.00,0.80,1.36,8.16
4,2.00,0.00,0.00,4.80
...,...,...,...,...
6405003,,3.24,0.00,21.14
6405004,,22.13,0.00,62.46
6405005,,10.51,0.00,51.90
6405006,,5.49,0.00,30.22


In [169]:
morecols.isna().any()

# VendorID contains NA values, must be of dtype float, other ones we can probably work with

VendorID          True
trip_distance    False
tip_amount       False
total_amount     False
dtype: bool

In [170]:
morecols.describe()

Unnamed: 0,VendorID,trip_distance,tip_amount,total_amount
count,6339567.0,6405008.0,6405008.0,6405008.0
mean,1.67,2.93,2.19,18.66
std,0.48,83.16,2.72,14.65
min,1.0,-30.62,-91.0,-1242.3
25%,1.0,0.96,0.0,11.16
50%,2.0,1.6,1.95,14.3
75%,2.0,2.93,2.86,19.8
max,2.0,210240.06,1100.0,4268.3


In [171]:
# set NA values in VendorID to 3, allows to be converted to int now
morecols.loc[morecols['VendorID'].isna(), 'VendorID'] = 3
morecols['VendorID'].value_counts()

VendorID
2.00    4245128
1.00    2094439
3.00      65441
Name: count, dtype: int64

In [172]:
# compare float16 vs float64 memory usage
jan20[['passenger_count', 'payment_type', 'total_amount']] = jan20.loc[:,['passenger_count', 'payment_type', 'total_amount']].astype(np.float16)
small_float = jan20.memory_usage()

jan20[['passenger_count', 'payment_type', 'total_amount']] = jan20.loc[:,['passenger_count', 'payment_type', 'total_amount']].astype(np.float64)
big_float = jan20.memory_usage()

print(small_float.sum())
print(big_float.sum())

88753938
202866144


In [173]:
# memory saved
big_float.sum() - small_float.sum()

114112206

### Notes
- use .copy() when calling dropna() to ensure changes stick
- always use .loc when updating values, but cannot be used when changing data types

## Exercise 18 - passwd to df

In [176]:
# read in CSV, line separator ':', comment indicator '#', rename columns, set username column as index
passwd = pd.read_csv(r'D:\Documents\Data Analysis\Datasets\pandas-workout-data\data\linux-etc-passwd.txt',
                    sep=':',
                    comment='#',
                    names=['username', 'password', 'userid', 'groupid', 'name', 'homedir', 'shell'],
                    index_col='username')
passwd.head()

Unnamed: 0_level_0,password,userid,groupid,name,homedir,shell
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
root,x,0,0,root,/root,/bin/bash
daemon,x,1,1,daemon,/usr/sbin,/usr/sbin/nologin
bin,x,2,2,bin,/bin,/usr/sbin/nologin
sys,x,3,3,sys,/dev,/usr/sbin/nologin
sync,x,4,65534,sync,/bin,/bin/sync


### Exercise 18b

In [178]:
# ignore password and groupid columns
passwd = pd.read_csv(r'D:\Documents\Data Analysis\Datasets\pandas-workout-data\data\linux-etc-passwd.txt',
                    sep=':',
                    comment='#',
                    names=['username', 'password', 'userid', 'groupid', 'name', 'homedir', 'shell'],
                    usecols=['username', 'userid', 'name', 'homedir', 'shell'],
                    index_col='username')
passwd.head()

Unnamed: 0_level_0,userid,name,homedir,shell
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
root,0,root,/root,/bin/bash
daemon,1,daemon,/usr/sbin,/usr/sbin/nologin
bin,2,bin,/bin,/usr/sbin/nologin
sys,3,sys,/dev,/usr/sbin/nologin
sync,4,sync,/bin,/bin/sync


In [179]:
# show nonspecial usernames (userid >=1000)
passwd.loc[passwd['userid']>=1000].index

Index(['nobody', 'user', 'reuven', 'genadi', 'shira', 'atara', 'shikma',
       'amotz', 'git', 'deploy'],
      dtype='object', name='username')

In [180]:
# what are the verious command interpreters or shells?
passwd['shell'].unique()

array(['/bin/bash', '/usr/sbin/nologin', '/bin/sync', '/bin/false',
       '/bin/sh', '/bin/nologin'], dtype=object)

### Notes
- drop_duplicates can be used to find unique values while keeping in series format
- with read_csv, 'names' requires the renaming of all columns. 'usecols' must thus always be a subset of 'names'. 

In [182]:
# can also use drop_duplicates to keep as series
passwd['shell'].drop_duplicates()

username
root                    /bin/bash
daemon          /usr/sbin/nologin
sync                    /bin/sync
syslog                 /bin/false
debian-spamd              /bin/sh
gitlab-redis         /bin/nologin
Name: shell, dtype: object

## Exercise 19 - Bitcoin values

In [184]:
# retrieve CSV from a webpage
bitcoin = pd.read_csv('https://api.blockchain.info/charts/market-price?format=csv',
                     header=None,
                     names=['date', 'closing_price'],
                     parse_dates=['date'])
bitcoin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           366 non-null    datetime64[ns]
 1   closing_price  366 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 5.8 KB


In [185]:
# closing price for most recent trading day
bitcoin.loc[bitcoin['date'].idxmax(), 'closing_price']

60565.91

In [186]:
# lowest historical price and the date
bitcoin.loc[bitcoin['closing_price'].idxmin(), :]

date             2023-09-27 00:00:00
closing_price              26,212.82
Name: 12, dtype: object

In [187]:
# highest historical price and date
bitcoin.loc[bitcoin['closing_price'].idxmax(), :]

date             2024-03-14 00:00:00
closing_price              73,094.37
Name: 181, dtype: object

### Exercise 19b

In [189]:
# retrieve current value with one line of code (without assigning to a variable)
pd.read_csv('https://api.blockchain.info/charts/market-price?format=csv',
                     header=None,
                     names=['date', 'closing_price'],
                     parse_dates=['date']).tail(1)['closing_price']

365   60,565.91
Name: closing_price, dtype: float64

In [190]:
# retrieve raw HTML
r = requests.get('https://finance.yahoo.com/quote/%5EGSPC/history?p=%5EGSPC',
                headers={'User-Agent': 'Mozilla/5.0'})

# access HTML in memory with StringIO
s = StringIO(r.content.decode())

# read in desired HTML from snp500
snp500 = pd.read_html(s)[0]

# set date as index
snp500 = snp500.set_index('Date')
snp500.head()

Unnamed: 0_level_0,Open,High,Low,Close Close price adjusted for splits.,Adj Close Adjusted close price adjusted for splits and dividend and/or capital gain distributions.,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Sep 13, 2024",5603.34,5636.27,5601.65,5626.02,5626.02,3500790000
"Sep 12, 2024",5557.48,5600.71,5535.5,5595.76,5595.76,3655070000
"Sep 11, 2024",5496.42,5560.41,5406.96,5554.13,5554.13,3839450000
"Sep 10, 2024",5490.51,5497.91,5441.72,5495.52,5495.52,3848180000
"Sep 9, 2024",5442.07,5484.2,5434.49,5471.05,5471.05,3825940000


In [191]:
# locate closing price and volume columns
snp500 = snp500.loc[:, ['Close Close price adjusted for splits.', 'Volume']]

# rename to make more manageable
snp500 = snp500.rename(columns={'Close Close price adjusted for splits.': 'Close', 'Volume': 'Volume'})
snp500.head()

Unnamed: 0_level_0,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
"Sep 13, 2024",5626.02,3500790000
"Sep 12, 2024",5595.76,3655070000
"Sep 11, 2024",5554.13,3839450000
"Sep 10, 2024",5495.52,3848180000
"Sep 9, 2024",5471.05,3825940000


In [192]:
# date and volume of highest/lowest close values

high_low = snp500.loc[[snp500['Close'].idxmin(), snp500['Close'].idxmax()], :]
high_low

Unnamed: 0_level_0,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
"Oct 27, 2023",4117.37,4019500000
"Jul 16, 2024",5667.2,4041760000


In [193]:
# print values to CSV
print(high_low.to_csv())

Date,Close,Volume
"Oct 27, 2023",4117.37,4019500000
"Jul 16, 2024",5667.2,4041760000



## Exercise 20 - Big cities

In [195]:
# read in JSON file
cities = pd.read_json(r'D:\Documents\Data Analysis\Datasets\pandas-workout-data\data\cities.json')
cities.head()

Unnamed: 0,city,growth_from_2000_to_2013,latitude,longitude,population,rank,state
0,New York,4.8%,40.71,-74.01,8405837,1,New York
1,Los Angeles,4.8%,34.05,-118.24,3884307,2,California
2,Chicago,-6.1%,41.88,-87.63,2718782,3,Illinois
3,Houston,11.0%,29.76,-95.37,2195914,4,Texas
4,Philadelphia,2.6%,39.95,-75.17,1553165,5,Pennsylvania


In [196]:
# mean and median populations
cities['population'].describe()[['mean', '50%']]

mean   131,132.44
50%     68,207.00
Name: population, dtype: float64

In [197]:
# how does this change without top 50?
cities.loc[51:, 'population'].describe()[['mean', '50%']]

# mean is much closer to the median now, but both decrease, as would be expected

mean   86,720.02
50%    65,690.00
Name: population, dtype: float64

In [198]:
# northernmost city (max latitude)
cities.loc[cities['latitude'].idxmax()]

city                        Anchorage
growth_from_2000_to_2013        15.4%
latitude                        61.22
longitude                     -149.90
population                     300950
rank                               63
state                          Alaska
Name: 62, dtype: object

In [199]:
# which state has most top cities?
cities['state'].value_counts().head()

state
California       212
Texas             83
Florida           73
Illinois          52
Massachusetts     36
Name: count, dtype: int64

In [200]:
# which state has the least on this list?
cities['state'].value_counts(ascending=True).head(10)

state
Vermont                 1
Alaska                  1
District of Columbia    1
Hawaii                  1
Maine                   1
Wyoming                 2
Delaware                2
South Dakota            2
West Virginia           2
New Hampshire           3
Name: count, dtype: int64

### Exercise 20b

In [202]:
# remove '%' from growth, replace with 0
cities['growth_from_2000_to_2013'] = cities['growth_from_2000_to_2013'].str.replace('%', '0')

# find additional empty strings, converting them to 0
cities.loc[cities['growth_from_2000_to_2013'] == '', 'growth_from_2000_to_2013'] = '0'

# convert to float and find mean/median
cities['growth_from_2000_to_2013'].astype(np.float64).describe()[['mean', '50%']]

mean   22.94
50%     9.65
Name: growth_from_2000_to_2013, dtype: float64

In [203]:
# compare count of negative vs positive growth

# cut into bins and label
growth = pd.cut(cities['growth_from_2000_to_2013'],
                bins=[cities['growth_from_2000_to_2013'].min(), 0, cities['growth_from_2000_to_2013'].max()],
                include_lowest=True,
                labels=['Neg Growth', 'Pos Growth'])
growth.value_counts()

growth_from_2000_to_2013
Pos Growth    852
Neg Growth    148
Name: count, dtype: int64

In [204]:
# find city/cities with latitudes more than 2 std away from mean

upper_lim = cities['latitude'].mean() + 2 * cities['latitude'].std()
lower_lim = cities['latitude'].mean() - 2 * cities['latitude'].std()

cities.loc[(cities['latitude'] < lower_lim) | (cities['latitude'] > upper_lim),'city'].values

array(['Miami', 'Honolulu', 'Anchorage', 'Hialeah', 'Brownsville',
       'Fort Lauderdale', 'Cape Coral', 'Pembroke Pines', 'Hollywood',
       'McAllen', 'Miramar', 'Coral Springs', 'Miami Gardens', 'Everett',
       'Pompano Beach', 'West Palm Beach', 'Davie', 'Miami Beach',
       'Plantation', 'Sunrise', 'Boca Raton', 'Bellingham', 'Mission',
       'Edinburg', 'Deerfield Beach', 'Pharr', 'Boynton Beach',
       'Lauderhill', 'Weston', 'Fort Myers', 'Harlingen', 'Homestead',
       'Delray Beach', 'Marysville', 'Tamarac', 'North Miami',
       'Wellington', 'Coconut Creek', 'Margate', 'Grand Forks', 'Doral',
       'Coral Gables', 'Bonita Springs', 'Minot', 'Cutler Bay',
       'Oakland Park', 'North Miami Beach', 'North Lauderdale',
       'Greenacres', 'Hallandale Beach', 'Aventura', 'Weslaco'],
      dtype=object)