# Data Transformation and Cleansing

#### Usual library imports

In [8]:
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm

### Import diamonds dataset

In [9]:
diamonds = sns.load_dataset("diamonds")

In [10]:
mtcars = sm.datasets.get_rdataset('mtcars').data

In [11]:
mtcars.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


### Selecting Rows

In [16]:
diamonds.query('cut == "Good" \
and color == "E" and clarity == "VVS2" \
and price > 10000')

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
24630,1.3,Good,E,VVS2,62.8,59.0,12967,6.95,6.99,4.38
25828,1.41,Good,E,VVS2,59.9,61.0,14853,7.21,7.37,4.37
27177,1.5,Good,E,VVS2,64.3,58.0,17449,7.2,7.13,4.61


In [21]:
diamonds[(diamonds['cut'] == 'Good') 
       & (diamonds['color'] == 'E')
         & (diamonds['clarity'] == 'VVS2')
           & (diamonds['price'] >10000)]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
24630,1.3,Good,E,VVS2,62.8,59.0,12967,6.95,6.99,4.38
25828,1.41,Good,E,VVS2,59.9,61.0,14853,7.21,7.37,4.37
27177,1.5,Good,E,VVS2,64.3,58.0,17449,7.2,7.13,4.61


In [22]:
diamonds[(diamonds['cut'] == 'Good') & (diamonds['clarity'].str.startswith('V'))]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
35,0.23,Good,F,VS1,58.2,59.0,402,4.06,4.08,2.37
36,0.23,Good,E,VS1,64.1,59.0,402,3.83,3.85,2.46
42,0.26,Good,D,VS2,65.2,56.0,403,3.99,4.02,2.61
43,0.26,Good,D,VS1,58.4,63.0,403,4.19,4.24,2.46
...,...,...,...,...,...,...,...,...,...,...
53840,0.71,Good,H,VVS2,60.4,63.0,2738,5.69,5.74,3.45
53886,0.70,Good,D,VS2,58.0,62.0,2749,5.78,5.87,3.38
53895,0.70,Good,F,VS1,57.8,61.0,2751,5.83,5.79,3.36
53913,0.80,Good,G,VS2,64.2,58.0,2753,5.84,5.81,3.74


### Data Types

In [12]:
mtcars.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, Mazda RX4 to Volvo 142E
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mpg     32 non-null     float64
 1   cyl     32 non-null     int64  
 2   disp    32 non-null     float64
 3   hp      32 non-null     int64  
 4   drat    32 non-null     float64
 5   wt      32 non-null     float64
 6   qsec    32 non-null     float64
 7   vs      32 non-null     int64  
 8   am      32 non-null     int64  
 9   gear    32 non-null     int64  
 10  carb    32 non-null     int64  
dtypes: float64(5), int64(6)
memory usage: 3.0+ KB


In [13]:
mtcars.cyl.value_counts()

8    14
4    11
6     7
Name: cyl, dtype: int64

In [14]:
mtcars['cyl'] = mtcars['cyl'].astype('category')

In [15]:
mtcars.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, Mazda RX4 to Volvo 142E
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   mpg     32 non-null     float64 
 1   cyl     32 non-null     category
 2   disp    32 non-null     float64 
 3   hp      32 non-null     int64   
 4   drat    32 non-null     float64 
 5   wt      32 non-null     float64 
 6   qsec    32 non-null     float64 
 7   vs      32 non-null     int64   
 8   am      32 non-null     int64   
 9   gear    32 non-null     int64   
 10  carb    32 non-null     int64   
dtypes: category(1), float64(5), int64(5)
memory usage: 2.9+ KB


### Add Columns

In [16]:
diamonds['norm_carat'] = (diamonds['carat'] - diamonds['carat'].mean() )/diamonds['carat'].std()
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,norm_carat
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,-1.198157
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,-1.24035
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,-1.198157
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,-1.071577
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,-1.029384


### Delete Columns

In [17]:
del diamonds['norm_carat']

In [None]:
diamonds.drop('norm_carat', axis=1, inplace=True)

In [18]:
diamonds.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

### Reordering Columns

In [19]:
diamonds = diamonds[['price', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',  'x', 'y','z']]

In [20]:
diamonds

Unnamed: 0,price,carat,cut,color,clarity,depth,table,x,y,z
0,326,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,326,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,327,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,334,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63
4,335,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,2757,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.50
53936,2757,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61
53937,2757,0.70,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56
53938,2757,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74


### Sorting Values

In [21]:
diamonds.sort_values(['price', 'carat'], ascending=[True, False], inplace = True)

In [22]:
diamonds

Unnamed: 0,price,carat,cut,color,clarity,depth,table,x,y,z
0,326,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,326,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,327,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,334,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63
4,335,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
27745,18803,2.00,Very Good,H,SI1,62.8,57.0,7.95,8.00,5.01
27746,18804,2.07,Ideal,G,SI2,62.5,55.0,8.20,8.13,5.11
27747,18806,1.51,Ideal,G,IF,61.7,55.0,7.37,7.41,4.56
27748,18818,2.00,Very Good,G,SI1,63.5,56.0,7.90,7.97,5.04


### Renaming Columns

In [23]:
diamonds.rename(columns = {'price': 'dollars', 'carat': 'weight'})

Unnamed: 0,dollars,weight,cut,color,clarity,depth,table,x,y,z
0,326,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,326,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,327,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,334,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63
4,335,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
27745,18803,2.00,Very Good,H,SI1,62.8,57.0,7.95,8.00,5.01
27746,18804,2.07,Ideal,G,SI2,62.5,55.0,8.20,8.13,5.11
27747,18806,1.51,Ideal,G,IF,61.7,55.0,7.37,7.41,4.56
27748,18818,2.00,Very Good,G,SI1,63.5,56.0,7.90,7.97,5.04


### Removing Duplicates

In [24]:
df = pd.DataFrame({'state': ['NY', 'NJ', 'NJ', 'NJ', 'CT', 'CT', 'CT'],
                  'variable1': [2, 3, 2, 2, 4, 6, 6],
                  'variable2': [10, 22, 22, 24, 11, 24, 24]})
df

Unnamed: 0,state,variable1,variable2
0,NY,2,10
1,NJ,3,22
2,NJ,2,22
3,NJ,2,24
4,CT,4,11
5,CT,6,24
6,CT,6,24


In [25]:
df.drop_duplicates()

Unnamed: 0,state,variable1,variable2
0,NY,2,10
1,NJ,3,22
2,NJ,2,22
3,NJ,2,24
4,CT,4,11
5,CT,6,24


In [26]:
df.drop_duplicates(['state', 'variable1'])

Unnamed: 0,state,variable1,variable2
0,NY,2,10
1,NJ,3,22
2,NJ,2,22
4,CT,4,11
5,CT,6,24


### Replacing Values

In [27]:
df.replace({'NJ': 'New Jersey', 'NY': 'New York', 'CT': 'Connecticut'})

Unnamed: 0,state,variable1,variable2
0,New York,2,10
1,New Jersey,3,22
2,New Jersey,2,22
3,New Jersey,2,24
4,Connecticut,4,11
5,Connecticut,6,24
6,Connecticut,6,24


### Binning

Consider `variable1`.  Say we want anything 2 or lower to be labeled as Small,
2 to 4 as Medium, and 4 to 6 as Large

In [28]:
df['NewColumn'] = pd.cut(df.variable1, [0, 2, 4, 6], labels = ["Small", "Medium", "Large"])
df

Unnamed: 0,state,variable1,variable2,NewColumn
0,NY,2,10,Small
1,NJ,3,22,Medium
2,NJ,2,22,Small
3,NJ,2,24,Small
4,CT,4,11,Medium
5,CT,6,24,Large
6,CT,6,24,Large


In [29]:
df

Unnamed: 0,state,variable1,variable2,NewColumn
0,NY,2,10,Small
1,NJ,3,22,Medium
2,NJ,2,22,Small
3,NJ,2,24,Small
4,CT,4,11,Medium
5,CT,6,24,Large
6,CT,6,24,Large


### Binning using Pandas cut function

In [30]:
pd.cut(df.variable1, [0, 2, 4, 6])

0    (0, 2]
1    (2, 4]
2    (0, 2]
3    (0, 2]
4    (2, 4]
5    (4, 6]
6    (4, 6]
Name: variable1, dtype: category
Categories (3, interval[int64]): [(0, 2] < (2, 4] < (4, 6]]

### Binning into Quantiles

In [31]:
pd.qcut(df.variable1, 2)

0    (1.999, 3.0]
1    (1.999, 3.0]
2    (1.999, 3.0]
3    (1.999, 3.0]
4      (3.0, 6.0]
5      (3.0, 6.0]
6      (3.0, 6.0]
Name: variable1, dtype: category
Categories (2, interval[float64]): [(1.999, 3.0] < (3.0, 6.0]]

### Select a random sample of data

In [32]:
diamonds.sample(3)

Unnamed: 0,price,carat,cut,color,clarity,depth,table,x,y,z
17666,7120,0.94,Ideal,F,VS2,61.2,57.0,6.29,6.33,3.86
10357,4771,1.11,Ideal,I,SI1,62.3,58.0,6.6,6.65,4.13
43304,1401,0.45,Ideal,E,VVS2,61.9,56.0,4.95,4.91,3.05


### Joining Data with Merge

In [33]:
n = 5
df = pd.DataFrame(
    {'state': list(np.random.choice(["New York", "Florida", "California"], size=(n))), 
     'gender': list(np.random.choice(["Male", "Female"], size=(n), p=[.4, .6])),
     'housing': list(np.random.choice(["Rent", "Own"], size=(n))),     
     'height': list(np.random.randint(140,200,n))
     })

In [34]:
df

Unnamed: 0,state,gender,housing,height
0,New York,Male,Own,192
1,California,Female,Own,158
2,California,Female,Rent,175
3,California,Female,Own,189
4,California,Male,Rent,161


In [35]:
df_rent = pd.DataFrame(
          {'state': ["Connecticut", "Florida", "California"],
           'avg_rent': [3500, 2200, 4500]})

In [36]:
df_rent

Unnamed: 0,state,avg_rent
0,Connecticut,3500
1,Florida,2200
2,California,4500


#### Left Join

In [37]:
df.merge(df_rent, how = 'left' , 
         left_on = 'state', right_on = 'state')

Unnamed: 0,state,gender,housing,height,avg_rent
0,New York,Male,Own,192,
1,California,Female,Own,158,4500.0
2,California,Female,Rent,175,4500.0
3,California,Female,Own,189,4500.0
4,California,Male,Rent,161,4500.0


#### Right Join

In [38]:
df.merge(df_rent, how = 'right' , 
         left_on = 'state', right_on = 'state')

Unnamed: 0,state,gender,housing,height,avg_rent
0,Connecticut,,,,3500
1,Florida,,,,2200
2,California,Female,Own,158.0,4500
3,California,Female,Rent,175.0,4500
4,California,Female,Own,189.0,4500
5,California,Male,Rent,161.0,4500


#### Inner Join

In [39]:
df.merge(df_rent, how = 'inner' , 
         left_on = 'state', right_on = 'state')

Unnamed: 0,state,gender,housing,height,avg_rent
0,California,Female,Own,158,4500
1,California,Female,Rent,175,4500
2,California,Female,Own,189,4500
3,California,Male,Rent,161,4500


#### Outer Join

In [40]:
df.merge(df_rent, how = 'outer' , 
         left_on = 'state', right_on = 'state')

Unnamed: 0,state,gender,housing,height,avg_rent
0,New York,Male,Own,192.0,
1,California,Female,Own,158.0,4500.0
2,California,Female,Rent,175.0,4500.0
3,California,Female,Own,189.0,4500.0
4,California,Male,Rent,161.0,4500.0
5,Connecticut,,,,3500.0
6,Florida,,,,2200.0


### Concatenation

In [41]:
pd.concat([df_rent, df], axis = 1)

Unnamed: 0,state,avg_rent,state.1,gender,housing,height
0,Connecticut,3500.0,New York,Male,Own,192
1,Florida,2200.0,California,Female,Own,158
2,California,4500.0,California,Female,Rent,175
3,,,California,Female,Own,189
4,,,California,Male,Rent,161


In [42]:
pd.concat([df_rent, df], axis = 0)

Unnamed: 0,state,avg_rent,gender,housing,height
0,Connecticut,3500.0,,,
1,Florida,2200.0,,,
2,California,4500.0,,,
0,New York,,Male,Own,192.0
1,California,,Female,Own,158.0
2,California,,Female,Rent,175.0
3,California,,Female,Own,189.0
4,California,,Male,Rent,161.0


## Dealing with Missing Values

In [5]:
np.random.seed(1)
n = 5
df = pd.DataFrame(
    {'state': list(np.random.choice(["New York", "Florida", "California"], size=(n))), 
     'gender': list(np.random.choice(["Male", "Female"], size=(n), p=[.4, .6])),
     'housing': list(np.random.choice(["Rent", "Own"], size=(n))),     
     'height': list(np.random.randint(140,200,n))
     })

In [6]:
for row in range(df.shape[0]):
    for col in range(df.shape[1]):
        if np.random.uniform() < 0.25:
            df.iloc[row,col] = np.nan

In [45]:
df

Unnamed: 0,state,gender,housing,height
0,Florida,Male,Rent,160.0
1,New York,Male,Rent,151.0
2,,Male,Rent,182.0
3,Florida,Male,,168.0
4,Florida,Male,Rent,


### Check for missing values

In [7]:
# Count missing values - by columns

df.isna().sum(axis=0)


state      1
gender     0
housing    1
height     1
dtype: int64

In [49]:
# Count missing values - by rows

df.isna().sum(axis=1)


0    0
1    0
2    1
3    1
4    1
dtype: int64

In [50]:
# Count missing values - by columns, sorted

df.isna().sum(axis=0).sort_values(ascending=False)


state      1
housing    1
height     1
gender     0
dtype: int64

In [51]:
df.isna().sum(axis=1).sort_values(ascending=False)

2    1
3    1
4    1
0    0
1    0
dtype: int64

#### Drop Missing Values

##### Drop rows with missing values

In [65]:
df

Unnamed: 0,state,gender,housing,height
0,Florida,Male,Rent,160
1,New York,Male,Rent,151
2,,Male,Rent,182
3,Florida,Male,,168
4,Florida,Male,Rent,169


In [64]:
df.dropna()

Unnamed: 0,state,gender,housing,height
0,Florida,Male,Rent,160
1,New York,Male,Rent,151
4,Florida,Male,Rent,169


##### Drop columns with missing values

In [66]:
df.dropna(axis = 1)

Unnamed: 0,gender,height
0,Male,160
1,Male,151
2,Male,182
3,Male,168
4,Male,169


In [75]:
df


Unnamed: 0,state,gender,housing,height
0,Florida,Male,Rent,160
1,New York,Male,Rent,151
2,,Male,Rent,182
3,Florida,Male,,168
4,Florida,Male,Rent,169


#### Fill Missing Data

In [77]:
df.fillna('Connecticut')

Unnamed: 0,state,gender,housing,height
0,Florida,Male,Rent,160
1,New York,Male,Rent,151
2,Connecticut,Male,Rent,182
3,Florida,Male,Connecticut,168
4,Florida,Male,Rent,169


In [80]:
df['state'].fillna('Connecticut', inplace = True)
df

Unnamed: 0,state,gender,housing,height
0,Florida,Male,Rent,160
1,New York,Male,Rent,151
2,Connecticut,Male,Rent,182
3,Florida,Male,,168
4,Florida,Male,Rent,169


### Forward and Backward Fill

In [106]:
# Let us make some of the height numbers NaN
df.loc[[0,3], 'height'] = np.nan

In [107]:
df

Unnamed: 0,state,gender,housing,height
0,Florida,Male,Rent,
1,New York,Male,Rent,151.0
2,,Male,Rent,182.0
3,Florida,Male,,
4,Florida,Male,Rent,


In [111]:
df.fillna(method = 'ffill', inplace = True)
df

Unnamed: 0,state,gender,housing,height
0,Florida,Male,Rent,
1,New York,Male,Rent,151.0
2,New York,Male,Rent,182.0
3,Florida,Male,Rent,182.0
4,Florida,Male,Rent,182.0


In [112]:
df.fillna(method = 'bfill')

Unnamed: 0,state,gender,housing,height
0,Florida,Male,Rent,151.0
1,New York,Male,Rent,151.0
2,New York,Male,Rent,182.0
3,Florida,Male,Rent,182.0
4,Florida,Male,Rent,182.0


***
# Imputation using sklearn


In [113]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imp = IterativeImputer(max_iter=10, random_state=0)


In [129]:
import statsmodels.api as sm
df = sm.datasets.get_rdataset('mtcars').data

In [130]:
df

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [131]:
for row in range(df.shape[0]):
    for col in range(df.shape[1]):
        if np.random.uniform() < 0.25:
            df.iloc[row,col] = np.nan

In [134]:
df

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,,160.0,110.0,3.9,2.62,16.46,,,,4.0
Mazda RX4 Wag,21.0,6.0,,110.0,3.9,2.875,,0.0,,4.0,4.0
Datsun 710,22.8,4.0,108.0,,3.85,2.32,18.61,1.0,1.0,4.0,1.0
Hornet 4 Drive,,6.0,258.0,110.0,3.08,3.215,19.44,1.0,0.0,3.0,1.0
Hornet Sportabout,18.7,8.0,360.0,,3.15,3.44,17.02,0.0,,3.0,2.0
Valiant,18.1,,225.0,105.0,2.76,3.46,20.22,,0.0,3.0,1.0
Duster 360,14.3,8.0,360.0,245.0,3.21,3.57,15.84,0.0,0.0,3.0,4.0
Merc 240D,24.4,4.0,146.7,62.0,3.69,3.19,20.0,,0.0,,
Merc 230,22.8,4.0,140.8,95.0,,3.15,22.9,1.0,0.0,4.0,2.0
Merc 280,19.2,6.0,167.6,,3.92,3.44,18.3,,0.0,,4.0


In [136]:
pd.DataFrame(imp.fit_transform(df), columns = df.columns, index = df.index).round(1)



Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,5.2,160.0,110.0,3.9,2.6,16.5,0.7,0.9,4.5,4.0
Mazda RX4 Wag,21.0,6.0,164.7,110.0,3.9,2.9,16.8,0.0,0.7,4.0,4.0
Datsun 710,22.8,4.0,108.0,90.5,3.8,2.3,18.6,1.0,1.0,4.0,1.0
Hornet 4 Drive,20.5,6.0,258.0,110.0,3.1,3.2,19.4,1.0,0.0,3.0,1.0
Hornet Sportabout,18.7,8.0,360.0,190.1,3.2,3.4,17.0,0.0,0.2,3.0,2.0
Valiant,18.1,5.7,225.0,105.0,2.8,3.5,20.2,0.6,0.0,3.0,1.0
Duster 360,14.3,8.0,360.0,245.0,3.2,3.6,15.8,0.0,0.0,3.0,4.0
Merc 240D,24.4,4.0,146.7,62.0,3.7,3.2,20.0,0.9,0.0,4.2,2.8
Merc 230,22.8,4.0,140.8,95.0,3.8,3.2,22.9,1.0,0.0,4.0,2.0
Merc 280,19.2,6.0,167.6,114.8,3.9,3.4,18.3,0.6,0.0,3.9,4.0


In [137]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=2, weights="uniform")
pd.DataFrame(imputer.fit_transform(df), columns = df.columns, index = df.index)

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6.0,160.0,110.0,3.9,2.62,16.46,0.5,0.5,4.0,4.0
Mazda RX4 Wag,21.0,6.0,163.8,110.0,3.9,2.875,17.38,0.0,0.5,4.0,4.0
Datsun 710,22.8,4.0,108.0,111.5,3.85,2.32,18.61,1.0,1.0,4.0,1.0
Hornet 4 Drive,25.7,6.0,258.0,110.0,3.08,3.215,19.44,1.0,0.0,3.0,1.0
Hornet Sportabout,18.7,8.0,360.0,130.0,3.15,3.44,17.02,0.0,0.0,3.0,2.0
Valiant,18.1,5.0,225.0,105.0,2.76,3.46,20.22,0.5,0.0,3.0,1.0
Duster 360,14.3,8.0,360.0,245.0,3.21,3.57,15.84,0.0,0.0,3.0,4.0
Merc 240D,24.4,4.0,146.7,62.0,3.69,3.19,20.0,1.0,0.0,4.0,2.5
Merc 230,22.8,4.0,140.8,95.0,3.835,3.15,22.9,1.0,0.0,4.0,2.0
Merc 280,19.2,6.0,167.6,130.0,3.92,3.44,18.3,0.5,0.0,4.0,4.0
