# Datatype Transformations

This notebook covers the following datatype transformations:

1. Viewing the datatype of columns and individual series
2. Viewing unique items in a series
3. Setting specific values to NaN while importing
4. Replacing values
5. Changing datatype

In [1]:
import pandas as pd
import numpy as np

# Load the raw Toyota dataset with pandas' default parsing options.
data = pd.read_csv(
    filepath_or_buffer='../00_Datasets/Toyota.csv',
)

## Analyzing Datatypes

In [2]:
# Show the dtype pandas inferred for every column of `data`.

data.dtypes

Unnamed: 0      int64
Price           int64
Age           float64
KM             object
FuelType       object
HP             object
MetColor      float64
Automatic       int64
CC              int64
Doors          object
Weight          int64
dtype: object

In [3]:
# Count how many columns share each dtype.

data.dtypes.value_counts()

int64      5
object     4
float64    2
dtype: int64

In [9]:
# Keep only the columns whose dtype is int64.

data.select_dtypes(include='int64')

Unnamed: 0.1,Unnamed: 0,Price,Automatic,CC,Weight
0,0,13500,0,2000,1165
1,1,13750,0,2000,1165
2,2,13950,0,2000,1165
3,3,14950,0,2000,1165
4,4,13750,0,2000,1170
...,...,...,...,...,...
1431,1431,7500,0,1300,1025
1432,1432,10845,0,1300,1015
1433,1433,8500,0,1300,1015
1434,1434,7250,0,1300,1015


In [10]:
# Drop the object-dtype columns and show everything else.

data.select_dtypes(exclude='object')

Unnamed: 0.1,Unnamed: 0,Price,Age,MetColor,Automatic,CC,Weight
0,0,13500,23.0,1.0,0,2000,1165
1,1,13750,23.0,1.0,0,2000,1165
2,2,13950,24.0,,0,2000,1165
3,3,14950,26.0,0.0,0,2000,1165
4,4,13750,30.0,0.0,0,2000,1170
...,...,...,...,...,...,...,...
1431,1431,7500,,1.0,0,1300,1025
1432,1432,10845,72.0,0.0,0,1300,1015
1433,1433,8500,,0.0,0,1300,1015
1434,1434,7250,70.0,1.0,0,1300,1015


In [11]:
# A single column (Series) exposes its dtype through `.dtype`.

data['Price'].dtype

dtype('int64')

## View Unique Items in Series

In [32]:
# Distinct values found in the KM column.

data['KM'].unique()

array(['46986', '72937', '41711', ..., '30964', '20544', '17016'],
      dtype=object)

In [33]:
# Same question asked through NumPy: `np.unique` returns the distinct
# values of the KM column in sorted order.

np.unique(data['KM'])

array(['1', '10000', '100123', ..., '99865', '99971', '??'], dtype=object)

In [34]:
# Distinct values found in the FuelType column.

data['FuelType'].unique()

array(['Diesel', nan, 'Petrol', 'CNG'], dtype=object)

In [36]:
# Number of distinct FuelType values; `nunique` leaves NaN out of the count
# by default.

data['FuelType'].nunique()

3

## Importing File with Transformation

In [43]:
# Re-import the file, this time treating the '??' and '????' placeholders as
# NaN and using the first CSV column as the DataFrame index instead of
# loading it as a regular column.

toyota = pd.read_csv(
    '../00_Datasets/Toyota.csv',
    index_col=0,
    na_values=['??', '????'],
)
toyota.head()

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,13500,23.0,46986.0,Diesel,90.0,1.0,0,2000,three,1165
1,13750,23.0,72937.0,Diesel,90.0,1.0,0,2000,3,1165
2,13950,24.0,41711.0,Diesel,90.0,,0,2000,3,1165
3,14950,26.0,48000.0,Diesel,90.0,0.0,0,2000,3,1165
4,13750,30.0,38500.0,Diesel,90.0,0.0,0,2000,3,1170


## Replacing Values

In [44]:
# Distinct values currently stored in the Doors column.

toyota['Doors'].unique()

array(['three', '3', '5', '4', 'four', 'five', '2'], dtype=object)

In [47]:
# Replace the spelled-out door counts with their numeric equivalents.
# The result is assigned back to the column rather than calling
# `toyota.Doors.replace(..., inplace=True)`: an in-place call on a Series
# pulled out of the frame is chained assignment, which pandas warns about and
# which no longer updates `toyota` once Copy-on-Write is enabled. Assigning
# the result also keeps the cell safe to re-run.
toyota["Doors"] = toyota["Doors"].replace({"three": 3, "four": 4, "five": 5})

# Viewing the unique data present in Doors (after replace); digit strings such
# as '3' are still strings at this point, so the column remains object dtype.

toyota["Doors"].unique()

array([3, '3', '5', '4', 4, 5, '2'], dtype=object)

## Changing the Datatype

In [51]:
# Cast the Doors column to int64 now that every entry is numeric or a digit
# string.

toyota['Doors'] = toyota['Doors'].astype('int64')

# Confirm the new dtype of Doors.

toyota['Doors'].dtype

dtype('int64')

In [52]:
# Distinct values in Doors after the cast.

toyota['Doors'].unique()

array([3, 5, 4, 2], dtype=int64)

In [66]:
# Convert the low-cardinality columns to the 'category' dtype:
# 'FuelType' holds a handful of fuel labels, while 'Automatic' and 'MetColor'
# only take the values 1 and 0.

for column in ('FuelType', 'Automatic', 'MetColor'):
    toyota[column] = toyota[column].astype('category')

# Column dtypes after the transformation:
toyota.dtypes

Price           int64
Age           float64
KM            float64
FuelType     category
HP            float64
MetColor     category
Automatic    category
CC              int64
Doors           int64
Weight          int64
dtype: object