In [89]:
import pandas as pd

## download and read the CSV file and make ID the index column

In [90]:
# Load the raw records, using the file's ID column as the row index.
try:
    data = pd.read_csv("../data/auto.csv", index_col='ID')
except OSError as e:
    # Print the short reason, then re-raise so a "Run All" stops at this cell
    # with a traceback. The kernel is left alive, unlike with quit().
    print(e)
    raise
print(data)

        CarNumber    Make_n_model  Refund    Fines  History
ID                                                         
0    Y163O8161RUS      Ford Focus     2.0   3200.0      NaN
1     E432XX77RUS    Toyota Camry     1.0   6500.0      NaN
2     7184TT36RUS      Ford Focus     1.0   2100.0      NaN
3    X582HE161RUS      Ford Focus     2.0   2000.0      NaN
4    E34877152RUS      Ford Focus     2.0   6100.0      NaN
..            ...             ...     ...      ...      ...
926  Y163O8161RUS      Ford Focus     2.0   1600.0      NaN
927  M0309X197RUS      Ford Focus     1.0  22300.0      NaN
928  O673E8197RUS      Ford Focus     2.0    600.0      NaN
929  8610T8154RUS      Ford Focus     1.0   2000.0      NaN
930  H419XE197RUS  Toyota Corolla     2.0      NaN      2.0

[931 rows x 5 columns]


## count the number of observations using the method count()

In [91]:
# Number of non-null observations per column; a column whose count is below
# the row count contains missing values.
data.count()

CarNumber       931
Make_n_model    931
Refund          914
Fines           869
History          82
dtype: int64

## drop the duplicates, taking into account only the following columns: CarNumber, Make_n_model, Fines

In [92]:
# Rows that agree on CarNumber, Make_n_model and Fines are treated as the same
# observation; only the last occurrence of each is kept.
dedup_keys = ['CarNumber', 'Make_n_model', 'Fines']
data = data.drop_duplicates(subset=dedup_keys, keep='last')
data.count()

CarNumber       725
Make_n_model    725
Refund          713
Fines           665
History          65
dtype: int64

## work with missing values

In [93]:
# Per-column tally of missing cells.
print("values are missing:")
data.isna().sum()

values are missing:


CarNumber         0
Make_n_model      0
Refund           12
Fines            60
History         660
dtype: int64

In [94]:
# A column is kept only if it has at least (row count - 500) non-null cells,
# i.e. no more than 500 missing values.
min_non_null = len(data) - 500
data = data.dropna(axis="columns", thresh=min_non_null)
print("values are missing:")
data.isna().sum()

values are missing:


CarNumber        0
Make_n_model     0
Refund          12
Fines           60
dtype: int64

In [95]:
# Forward-fill missing Refund values from the previous row.
# The result is assigned back to the frame: an in-place fill on a column
# selection does not update `data` under pandas Copy-on-Write, and
# fillna(method=...) is deprecated in favour of ffill().
data['Refund'] = data['Refund'].ffill()

In [96]:
# Re-check the per-column count of missing cells after the previous fill.
print("values are missing:")
data.isna().sum()

values are missing:


CarNumber        0
Make_n_model     0
Refund           0
Fines           60
dtype: int64

In [97]:
# Show the rows whose Fines value is missing.
fines_missing = data['Fines'].isna()
data.loc[fines_missing]

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11,E40577152RUS,Ford Focus,1.0,
14,X786CO96RUS,Ford Focus,1.0,
17,M592CH197RUS,Skoda Octavia,2.0,
20,7830C8197RUS,Ford Focus,2.0,
23,M298CH161RUS,Ford Focus,2.0,
26,E445TC197RUS,Ford Focus,1.0,
28,9371CE154RUS,Skoda Octavia,2.0,
32,83298C154RUS,Ford Focus,2.0,
33,Y7659C197RUS,Ford Focus,2.0,
37,7364C8197RUS,Ford Focus,2.0,


In [98]:
# Impute missing Fines with the mean of the observed (non-null) fines.
fm = data['Fines'].mean()
print(f"Fines mean value:{fm}")
# Assign the filled column back to the frame: an in-place fillna on a column
# selection does not update `data` under pandas Copy-on-Write.
data['Fines'] = data['Fines'].fillna(fm)
print("values are missing:")
data.isnull().sum()

Fines mean value:8594.586466165414
values are missing:


CarNumber       0
Make_n_model    0
Refund          0
Fines           0
dtype: int64

## split and parse the make and model

In [99]:
# Split "Make_n_model" into Make (first word) and Model (the remainder).
# Splitting on the first space only keeps multi-word model names intact; the
# row-wise split on every space produced extra columns and made the two-column
# assignment fail. reindex pads to exactly two columns so a make-only value
# gets a null Model. astype(str) mirrors the str(x) coercion used previously.
make_model = (
    data["Make_n_model"]
    .astype(str)
    .str.split(" ", n=1, expand=True)
    .reindex(columns=[0, 1])
)
data["Make"] = make_model[0]
data["Model"] = make_model[1]
data = data.drop(columns=["Make_n_model"])
# Export the cleaned frame as a JSON array of row objects (the index is not written).
data.to_json(r'../data/auto.json', orient='records')