In [None]:
import pandas as pd
# Load the raw vehicles CSV into the working DataFrame `data`; the path is
# relative to the directory the notebook is run from.
data = pd.read_csv('vehicles/vehicles_messy.csv')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Handle nulls - identify.
# Count the missing values in every column, then display only the columns
# that contain at least one missing value.
null_cols = data.isna().sum()
null_cols.loc[null_cols.gt(0)]

In [None]:
# Handle nulls - drop every column that holds more than 10000 missing values.
# `null_cols` is the per-column null count computed in the previous cell.
drop_cols = null_cols.index[null_cols > 10000].tolist()
data = data.drop(columns=drop_cols)

In [None]:
# Show (rows, columns) of `data` at this point in the notebook.
data.shape

In [None]:
# Handle nulls - fillna().
# Before filling, inspect the rows whose `displ` is missing, restricted to a
# handful of descriptive columns, and show up to the first 60 of them.
null_displ = data.loc[
    data['displ'].isna(),
    ['year', 'make', 'model', 'trany', 'drive', 'fuelType', 'cylinders', 'displ'],
]
null_displ.head(60)

In [None]:
# Replace missing `displ` and `cylinders` values with 0; every other column
# is left untouched.
data = data.fillna({'displ': 0, 'cylinders': 0})

In [None]:
# Check for internal inconsistencies: rows that record zero cylinders
# alongside a non-zero displacement.
test = data.loc[data['cylinders'].eq(0) & data['displ'].ne(0)]
test.loc[:, ['year', 'make', 'model', 'trany', 'drive', 'fuelType', 'cylinders', 'displ']]


In [None]:
# Set `cylinders` to 4 on every row that has zero cylinders but a non-zero
# displacement (the inconsistent rows listed by the previous cell).
# NOTE(review): 4 is a hard-coded replacement value - confirm it suits all affected rows.
data.loc[data['cylinders'].eq(0) & data['displ'].ne(0), 'cylinders'] = 4


In [None]:
# Find low-variance columns.
# A column is flagged when its 90th percentile equals its minimum, i.e. roughly
# 90% or more of its non-missing values sit at the smallest value.
import numpy as np  # no longer used in this cell; kept so `np` stays defined for other cells

low_variance = []

# select_dtypes() is the public way to pick the numeric columns (the original
# relied on the private DataFrame._get_numeric_data()). Bool columns are
# included as well so that flag-like columns are screened too.
numeric_data = data.select_dtypes(include=['number', 'bool'])

for col in numeric_data.columns:
    # Cast to float so bool and integer columns share one code path.
    values = numeric_data[col].astype(float)
    # Series.min()/quantile() skip NaN. The previous builtin min() and
    # np.percentile() were order-dependent or returned NaN as soon as a column
    # had one missing value, so such columns could never be flagged.
    minimum = values.min()
    ninety_perc = values.quantile(0.9)
    # An all-NaN (or empty) column gives NaN on both sides; NaN == NaN is
    # False, so it is not flagged.
    if ninety_perc == minimum:
        low_variance.append(col)

print(low_variance)


In [None]:
# Remove the columns flagged as low-variance by the previous cell.
data = data.drop(columns=low_variance)


In [None]:
# Show (rows, columns) of `data` at this point in the notebook.
data.shape

In [None]:
# Check for outliers.
# Build the summary-statistics table with one row per described column, then
# add the interquartile range (75% quantile minus 25% quantile) as `IQR`.
stats = data.describe().T
stats = stats.assign(IQR=stats['75%'] - stats['25%'])
stats

In [None]:
# Look up the IQR stored in the 'comb08' row of `stats`.
# `stats` has one row per data column and one column per statistic, so a
# scalar lookup needs a (row label, column label) accessor such as .at.
# Plain stats['comb08', 'IQR'] is treated as a single column key and raises
# KeyError.
stats.at['comb08', 'IQR']

In [None]:
# Collect outliers for every column described in `stats`.
# A row is an outlier for a column when its value lies more than 1.5 * IQR
# below the 25% quantile or above the 75% quantile. Each matching row is
# copied and tagged with the offending column name in a new `Outlier` column,
# so a row can appear once per column it is an outlier for.
outlier_frames = []

for col in stats.index:
    iqr = stats.at[col, 'IQR']
    cutoff = iqr * 1.5
    lower = stats.at[col, '25%'] - cutoff
    upper = stats.at[col, '75%'] + cutoff
    # assign() returns a new frame, so `data` itself is never modified.
    results = data[(data[col] < lower) | (data[col] > upper)].assign(Outlier=col)
    # Skip empty matches: they add no rows and only complicate dtype
    # resolution when the pieces are concatenated.
    if not results.empty:
        outlier_frames.append(results)

# Concatenate once at the end. DataFrame.append was removed in pandas 2.0, and
# growing a frame inside a loop re-copies all accumulated rows each iteration.
if outlier_frames:
    outliers = pd.concat(outlier_frames)
else:
    # No outliers at all: keep the same column layout with zero rows.
    outliers = pd.DataFrame(columns=[*data.columns, 'Outlier'])

outliers

In [None]:
# Check datatypes: list the dtype of every column in `data`.
data.dtypes

In [None]:
# Cast `year` to object dtype, then list all column dtypes to confirm the change.
data = data.astype({'year': 'object'})
data.dtypes

In [None]:
# Harmonize values: start by printing the distinct `trany` values currently
# present, to see which spellings need to be unified.
print(set(data['trany']))

In [None]:
# Remove every hyphen from the `trany` labels (plain-text replacement), then
# print the distinct values that remain.
data['trany'] = data['trany'].str.replace('-', '', regex=False)
print(set(data['trany']))

In [None]:
# Harmonize the `trany` labels with plain-text replacements, applied in order:
#   1. shorten 'Automatic' to 'Auto'
#   2. turn 'Auto(' / 'Manual(' into 'Auto ' / 'Manual '
#   3. strip any remaining parentheses
# regex=False is passed explicitly. The earlier patterns were regex-escaped
# ('Auto\(' ...), which matches nothing when str.replace treats the pattern
# literally (the default since pandas 2.0), and '\(' in a normal string is an
# invalid escape sequence.
data['trany'] = data['trany'].str.replace('Automatic', 'Auto', regex=False)
data['trany'] = data['trany'].str.replace('Auto(', 'Auto ', regex=False)
data['trany'] = data['trany'].str.replace('Manual(', 'Manual ', regex=False)
data['trany'] = data['trany'].str.replace('(', '', regex=False)
data['trany'] = data['trany'].str.replace(')', '', regex=False)
print(set(data['trany']))

In [None]:
# Drop duplicates: remove rows that are identical across all columns and
# report how many were removed.
before = data.shape[0]
data = data.drop_duplicates()
after = data.shape[0]
print('Number of duplicate records dropped: ', str(before - after))
data.shape

In [None]:
# Select the relevant columns, then remove the rows that become duplicates
# once only those columns are kept, and report how many were removed.
before = data.shape[0]
select_columns = [
    'make', 'model', 'year', 'displ', 'cylinders',
    'trany', 'drive', 'VClass', 'fuelType', 'barrels08',
    'city08', 'highway08', 'comb08', 'co2TailpipeGpm', 'fuelCost08',
]

data = data.loc[:, select_columns].drop_duplicates()
after = data.shape[0]
print('Number of duplicate records dropped: ', str(before - after))
data.shape