In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Load the vehicles CSV from the Kaggle input directory and preview the first rows.
VEHICLES_CSV = '/kaggle/input/craigslist-carstrucks-data/vehicles.csv'
car_df = pd.read_csv(VEHICLES_CSV)
car_df.head()

In [None]:
# Show the frame's (rows, columns) shape together with its column labels.
car_df.shape, car_df.columns

Let's count how many NaN values each column contains.

In [None]:
# Count missing values per column, then order columns from most to fewest NaNs.
missing_per_column = car_df.isna().sum()
nan_sum_series = missing_per_column.sort_values(ascending=False)

In [None]:
# Turn the per-column NaN counts into a one-column frame named 'count_nans'.
nan_sum_df = nan_sum_series.rename('count_nans').to_frame()

In [None]:
# Express each column's NaN count as a percentage of the total number of rows.
total_rows = len(car_df)
nan_sum_df['prcnt_distrib'] = nan_sum_df['count_nans'].div(total_rows).mul(100)

### What percentage of each column's values is NaN

In [None]:
# Bar chart of the NaN percentage for every column.
nan_sum_df['prcnt_distrib'].plot.bar()

### We will work only with the columns whose NaN percentage (`prcnt_distrib`) is at most 10%

In [None]:
# Keep the labels of the columns whose NaN percentage does not exceed 10.
nan_share_ok = nan_sum_df['prcnt_distrib'].le(10)
chosen_cols = nan_sum_df.loc[nan_share_ok].index

In [None]:
# Display the labels of the columns that passed the NaN-percentage filter.
chosen_cols

In [None]:
# Keep only the columns selected in chosen_cols. The explicit .copy() makes
# valid_car_df an independent frame, so the cleaning steps that follow modify
# it directly instead of triggering SettingWithCopyWarning on a derived subset.
valid_car_df = car_df[chosen_cols].copy()

In [None]:
# Preview the rows at positions 115-119 of the reduced frame.
valid_car_df.iloc[115:120]

In [None]:
# Inspect the dtype of each kept column.
valid_car_df.dtypes

`odometer` and `year` will be converted to integers; from `posting_date` we keep only the date part ('%Y-%m-%d').

NaN values will be replaced: `odometer` with its median, `year` with 0, and `posting_date` with '-'.

In [None]:
# Clean the three columns in one pass. `assign` builds a new frame, so this
# cell does not rely on chained `inplace=True` calls, which operate on a
# temporary Series and leave the frame untouched under pandas Copy-on-Write.
valid_car_df = valid_car_df.assign(
    # Missing odometer readings -> column median, then cast to whole numbers.
    odometer=lambda df: df['odometer'].fillna(df['odometer'].median()).astype('int64'),
    # Missing year -> 0 placeholder, then cast to whole numbers.
    year=lambda df: df['year'].fillna(0).astype('int64'),
    # Missing posting_date -> '-'; otherwise keep only the text before 'T'.
    posting_date=lambda df: df['posting_date'].fillna('-').str.split('T').str[0],
)

In [None]:
# Preview the rows at positions 110-114 of valid_car_df.
valid_car_df.iloc[110:115]

## TOP 5 MOST LISTED MAKES (MANUFACTURERS)

In [None]:
# value_counts() already returns counts in descending order, so the five most
# frequent manufacturers are simply its first five entries; re-sorting was
# redundant and could reshuffle tied counts.
valid_car_df['manufacturer'].value_counts().head(5).plot(kind='bar')

In [None]:
# Replace missing model names with the 'empty' placeholder. Building the frame
# with the filled column (rather than a chained `inplace=True` call on a
# temporary Series) guarantees valid_car_df itself is updated, including under
# pandas Copy-on-Write.
valid_car_df = valid_car_df.assign(model=valid_car_df['model'].fillna('empty'))

In [None]:
# Bar chart of the 20 most frequent full model strings.
model_counts = valid_car_df['model'].value_counts()
model_counts.head(20).plot.bar()

#### As we can see, the main model name should be extracted, because we are interested in the main model rather than the full model description

In [None]:
# Take the first space-separated token of each model string as the "main"
# model. The vectorized .str accessor passes missing values through instead of
# raising, unlike a per-row lambda that calls x.split on a non-string value.
main_model = valid_car_df['model'].str.split(' ').str[0]
main_model.value_counts().head(20).plot(kind='bar')

#### As we see above, keeping only the first part of the model name (the main model) changes the distribution. Looking into the data, however, more work is needed to extract correct model names carefully. The best approach would be to match against a list of known models (e.g. taken from Wikipedia)

### Top 5 States with highest median price for cars:

In [None]:
# Median price per state; plot the five states with the highest median.
state_price_median = valid_car_df.groupby('state')['price'].median()
price_ax = state_price_median.sort_values(ascending=False).head(5).plot.bar()
price_ax.set_ylabel('price_median')

#### Let's look at the maximum price. If it is enormous, we will treat listings priced above 3 million USD as outliers and drop them

In [None]:
# Largest value in the price column.
valid_car_df['price'].max()

In [None]:
# Prices kept for plotting: drop zero-priced listings and anything above the
# outlier cap. A single boolean mask replaces the filter-then-drop-by-index
# chain, which built an extra filtered frame just to collect labels to drop
# and relied on the index labels being unique.
MAX_PLOT_PRICE = 3000000  # USD; prices above this are treated as outliers
listing_price = valid_car_df['price']
price_for_plot = listing_price[(listing_price <= MAX_PLOT_PRICE) & (listing_price != 0)]

In [None]:
def thousands(x, pos):
    """Format a tick value as a dollar label in thousands or millions.

    Args:
        x: Numeric tick value.
        pos: Tick position; accepted for the (x, pos) formatter signature
            and not used.

    Returns:
        '$<value>M' with one decimal when x is at least 1,000,000, otherwise
        '$<value>k' with no decimals and thousands separators.
    """
    in_millions = x >= 1000000
    return f'${x * 1e-6:1.1f}M' if in_millions else f'${x * 1e-3:,.0f}k'

#### Let's look at the price distribution after keeping only listings with a non-zero price <= 3,000,000

In [None]:
# Box plot of the filtered prices with a dollar-formatted y axis.
fig, ax = plt.subplots()
ax.boxplot(price_for_plot)

# Y axis: dollar tick labels and light horizontal grid lines on major ticks.
y_axis = ax.yaxis
y_axis.set_major_formatter(thousands)
y_axis.grid(True, which='major', linestyle='-', color='lightgrey', alpha=0.5)

ax.set_ylabel('price_usd')
ax.set_xlabel('price_column')

### TOP 5 States With the Highest Median Odometer Reading

In [None]:
# Median odometer reading per state; plot the five states with the highest median.
state_odometer_median = valid_car_df.groupby(['state'])['odometer'].median()
odometer_ax = state_odometer_median.sort_values(ascending=False).head(5).plot.bar()
odometer_ax.set_ylabel('odometer_median')

### LET'S SEE HOW THE MEDIAN PRICE CHANGED OVER TIME (APRIL 2021) FOR THE TOP 2 STATES BY PRICE_MEDIAN

In [None]:
# Derive the two states with the highest median price from the data instead of
# hard-coding their codes, so this chart always matches the ranking it refers to.
top_price_states = (
    valid_car_df.groupby('state')['price'].median()
    .sort_values(ascending=False)
    .index[:2]
)

# Date window (inclusive) of postings to plot.
# NOTE(review): the string comparison assumes posting_date already holds
# 'YYYY-MM-DD' text (i.e. the cleaning cell has run) - confirm.
WINDOW_START, WINDOW_END = '2021-04-01', '2021-04-30'
in_window = valid_car_df['posting_date'].between(WINDOW_START, WINDOW_END)
in_top_states = valid_car_df['state'].isin(top_price_states)

# One line per state: median price for each posting date inside the window.
(
    valid_car_df[in_window & in_top_states]
    .groupby(['posting_date', 'state'])['price']
    .median()
    .unstack()
    .plot(figsize=(16, 8))
)