In [17]:
import pandas as pd

# Exercise 02. Preprocessing

## Task 1. Download and read the CSV file, making ID the index column.

In [18]:
df = pd.read_csv(
    '../data/auto.csv',
    sep=',',
    engine='python',
    index_col='ID'
)
df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,X582HE161RUS,Ford Focus,2.0,2000.0,
4,E34877152RUS,Ford Focus,2.0,6100.0,
...,...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0,
927,M0309X197RUS,Ford Focus,1.0,22300.0,
928,O673E8197RUS,Ford Focus,2.0,600.0,
929,8610T8154RUS,Ford Focus,1.0,2000.0,


## Task 2. Count the number of observations using the method count()

In [19]:
df.count()

CarNumber       931
Make_n_model    931
Refund          914
Fines           869
History          82
dtype: int64

## Task 3. Drop the duplicates, taking into account only the following columns: CarNumber, Make_n_Model, and Fines

In [20]:
df = df.drop_duplicates(subset=['CarNumber', 'Make_n_model', 'Fines'], keep='last')
df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,X582HE161RUS,Ford Focus,2.0,2000.0,
5,92918M178RUS,Ford Focus,1.0,5700.0,
...,...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0,
927,M0309X197RUS,Ford Focus,1.0,22300.0,
928,O673E8197RUS,Ford Focus,2.0,600.0,
929,8610T8154RUS,Ford Focus,1.0,2000.0,


## Task 4. Work with missing values

In [21]:
df.isnull().sum()

CarNumber         0
Make_n_model      0
Refund           12
Fines            60
History         660
dtype: int64

In [22]:
df = df.dropna(axis=1, thresh=len(df)-500)
df.isnull().sum()

CarNumber        0
Make_n_model     0
Refund          12
Fines           60
dtype: int64

In [23]:
df['Refund'] = df['Refund'].fillna(method='ffill')
df.isnull().sum()

  df['Refund'] = df['Refund'].fillna(method='ffill')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Refund'] = df['Refund'].fillna(method='ffill')


CarNumber        0
Make_n_model     0
Refund           0
Fines           60
dtype: int64

In [24]:
avg_fines = df.Fines.mean()
df.Fines = df.Fines.fillna(avg_fines)
df.Fines
df.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Fines = df.Fines.fillna(avg_fines)


CarNumber       0
Make_n_model    0
Refund          0
Fines           0
dtype: int64

## Task 5. Split and parse the make and model.

In [25]:
df['Make'] = df.apply(lambda x: x['Make_n_model'].split()[0], axis=1)
df['Model'] = df.apply(lambda x: x['Make_n_model'].split()[1] if len(x['Make_n_model'].split()) > 1 else pd.NA, axis=1)
df = df.drop('Make_n_model', axis=1)
df

Unnamed: 0_level_0,CarNumber,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,2.0,3200.000000,Ford,Focus
1,E432XX77RUS,1.0,6500.000000,Toyota,Camry
2,7184TT36RUS,1.0,2100.000000,Ford,Focus
3,X582HE161RUS,2.0,2000.000000,Ford,Focus
5,92918M178RUS,1.0,5700.000000,Ford,Focus
...,...,...,...,...,...
926,Y163O8161RUS,2.0,1600.000000,Ford,Focus
927,M0309X197RUS,1.0,22300.000000,Ford,Focus
928,O673E8197RUS,2.0,600.000000,Ford,Focus
929,8610T8154RUS,1.0,2000.000000,Ford,Focus


In [26]:
df.to_json('../data/auto.json', orient='records', indent=2)

In [30]:
df2 = pd.read_json('../data/auto.json', orient='records')
df2['Refund'].mean()

np.float64(1.5172413793103448)