# 1. Data Importing and Analyzing

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# For example, here's several helpful packages to load

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline


import os

# Walk the Kaggle input directory, echo every file path found and load it
# with read_csv; `df` ends up holding whichever file was read last.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        path = os.path.join(folder, name)
        print(path)
        df = pd.read_csv(path)


# Report the size of the loaded frame and preview its first rows
print("Rows and Columns count:", df.shape)
df.head()


In [None]:
# Print the concise per-column summary of the frame
df.info()

In [None]:
# Summary statistics, rendered through the Styler with two decimal places
df.describe().style.format('{:0.2f}')
# style.format() - To avoid viewing data in exponential format

# 2. Data Cleansing

### A. Handling NULL Values

#### Method 1:

Handling while reading data through read_csv():

1. keep_default_na = True (default): strings such as '', '#N/A' and 'N/A' are parsed as NaN. Setting it to False keeps those values as they are, so only the strings listed in na_values are treated as NaN.

2. na_values = ["value1", "value2"]: additional strings or symbols that should be read as NaN.

In [None]:
# Re-read the dataset with pandas' default NaN markers switched off, so only '#', '?' and 'None' are parsed as NaN
df = pd.read_csv('/kaggle/input/usa-cers-dataset/USA_cars_datasets.csv',keep_default_na=False,na_values=['#','?','None'])

#### Method 2:

The most common approach is to check for null values (NaN) with isnull() and then apply one of the options below:

i. dropna(subset=['col1','col2'], how='all') - drops rows based on NaN values under the listed columns: how='any' (the default) drops a row if any of those columns is NaN, how='all' only if all of them are. Set inplace=True to modify the DataFrame directly.

ii. fillna(value or method='ffill') - fills NaN with custom values, or propagates the previous valid value forward

In [None]:
# Count the null entries in each column
df.isnull().sum()
#df.isnull().sum().sum() - To find total Null Values

### B. Drop / Replace / Mapping -- UDF Based Actions

i.   Let's Drop columns that are of no use

ii.  Let's Replace column values

iii. Let's map 'year' column to datetime

In [None]:
# Drop Columns
# Remove the 'Unnamed: 0' column in place, then preview the result
df.drop(columns=['Unnamed: 0'], inplace=True)
df.head()

In [None]:
# Replace column values
# Strip the trailing ' left' from every entry of the 'condition' column
df['condition'] = df['condition'].replace(r' left$', '', regex=True)
df.head(3)

In [None]:
#Mapping Datetime to Year column

# Show the dtype of 'year' before it is mapped to datetimes
print("Before mapping:",df.dtypes['year'])

def lookup(a):
    """
    Convert a Series of years to datetimes via a lookup table.

    Each distinct value in `a` is parsed once with `pd.to_datetime`
    (format '%Y'), and the parsed results are then mapped back onto every
    record. When the same years repeat many times this avoids re-parsing
    each row.

    Parameters
    ----------
    a : pandas.Series
        Year values to convert.

    Returns
    -------
    pandas.Series
        The parsed datetime for each entry of `a`, in the same order.
    """
    parsed_by_year = {}
    for year in a.unique():
        parsed_by_year[year] = pd.to_datetime(year, format='%Y')
    return a.map(parsed_by_year)

# Store the datetime version of 'year' in a new column, leaving 'year' itself untouched
df['modified_year'] = lookup(df['year'])

print("After mapping: ",df.dtypes['modified_year'])


### C. Changing the Datatypes


In [None]:
# Changing dtype of 'title_status' to Categorical (ordinal)

# deep=True makes pandas measure the Python string objects themselves; without
# it an object column only reports the size of its pointers, which hides most
# of the saving gained by converting the column to a categorical.
print("Memory Usage of title_status before: ", df.title_status.memory_usage(deep=True))
print("Dtype before:", df.dtypes['title_status'], ", Unique values", df.title_status.unique())


In [None]:
# Ordered categories for title_status; any value not listed here becomes NaN
lst = ['clean vehicle', 'salvage insurance']
df['title_status'] = df.title_status.astype(pd.CategoricalDtype(lst, ordered=True))

# deep=True also counts the category strings, giving the real footprint
print("Memory Usage of title_status after: ", df.title_status.memory_usage(deep=True))
# This print runs after the conversion, so it is labelled "after"
print("Dtype after:", df.dtypes['title_status'], ", Unique values", df.title_status.unique())

In [None]:
# Preview the first three rows
df.head(3)

### D. Data Standardization and Normalization:

Standardization: It is the process of transforming data into a common format which allows the researcher to make meaningful comparisons.

Example:
kilometer = 1.609 * mileage



Normalization: It is the process of transforming the values of several variables into a similar range. Typical normalizations include scaling the variable so that its average is zero and its variance is one, or scaling the variable so that its values range from 0 to 1.

Example: To demonstrate normalization,let's say we want scale the following features by dividing each value in the column by the maximum value in the column.

Columns: mileage,kilometer_age,price approach

In [None]:
# Standardization

# Express the mileage in kilometres (1 mile = 1.609 km) in a new column
miles = df['mileage']
df['kilometer'] = miles * 1.609

# Look at the first rows of the transformed data
df.head(3)

In [None]:
# Normalization

# Divide each feature by its own maximum so the largest value becomes 1.
# 'mileage' and 'kilometer' are overwritten; price goes to a new column.
for feature in ('mileage', 'kilometer'):
    df[feature] = df[feature] / df[feature].max()
df['price_mod'] = df['price'] / df['price'].max()
df[['mileage', 'kilometer', 'price_mod']].head()

### E. Data Binning

It is the process of transforming a continuous numerical variable into discrete categorical 'bins', for grouped analysis.

Example: in our dataset year is a real value ranging from 1973 to 2020, and it has 30 unique values.

What if we care about the car price difference between periods, for example before and after the year 2000?
We will use the pandas method cut() to segment the year column into bins; the cells below split it into five equal-width ranges between 1973 and 2020.

In [None]:
# The ten most frequent model years
df.year.value_counts().head(10)

## Shows a significant increase in production after 2010, so let's group roughly 9 years together through binning

In [None]:
# Six evenly spaced integer edges from the earliest to the latest year
# delimit five bins
first_year = min(df['year'])
last_year = max(df['year'])
val = np.linspace(first_year, last_year, num=6, dtype='int')
print(val)

# One label per bin, in ascending order
group_names = [
    '1973-1982',
    '1982-1991',
    '1991-2001',
    '2001-2010',
    '2010-2020',
]

In [None]:
# Assign each year to its labelled bin; include_lowest=True keeps the smallest edge value inside the first bin
df['year_binned'] = pd.cut(df['year'],bins=val,labels=group_names,include_lowest=True)
df.head(3)

### F. Detecting the Outliers

Most of the times for Exploratory Data Analysis (EDA), outlier detection is an important segment, as, outlier for particular features may distort the true picture, so we need to disregard them. Specifically, outliers can play havoc when we want to apply machine learning algorithm for prediction. At the same time outliers can even help us for anomaly detection.

Seaborn Box Plot: Box plot is a standard way of visualizing distribution of data based on median, quartiles and outliers.


In [None]:
# Keep every column whose dtype is not 'object' and report the shape before and after
print("shape before :", df.shape)
df_numerical = df.select_dtypes(exclude=['object'])
print("shape after excluding object columns: ", df_numerical.shape)

In [None]:
# Preview of the non-object columns
df_numerical.head(3)
#Keeping only the required numerical columns

df_numerical=df_numerical[['price_mod','kilometer']]

In [None]:
# Box plot of the columns held in df_numerical
df_numerical.plot(kind='box',figsize=(6,6))

'''
It is understandable as price as gone up with year and hence we could find outliers.
Also, kilometer too has huge outliers as better mileage cars are produced with time.

'''

In [None]:
# Let's compare price and year

# One box of normalized price (price_mod) per year bin
df1=df[['price_mod','year_binned']]
df1.boxplot(column='price_mod',by='year_binned',figsize=(6,6))

''' We will retain the outliers in this scenario as its in acceptable range. (Could also remove them based on convenience) '''

### Data Skewness:

Why do we care if the data is skewed? 

If the response variable is skewed, the model will be trained on a much larger number of moderately priced cars, and will be less likely to successfully predict the price for the most expensive cars.
The concept is the same as training a model on imbalanced categorical classes. If the values of a certain independent variable (feature) are skewed, depending on the model, skewness may violate model assumptions (e.g. logistic regression) or may impair the interpretation of feature importance.

In [None]:
# distplot is deprecated in seaborn; histplot with a KDE overlay on a density
# scale draws the equivalent histogram-plus-curve figure.
sns.histplot(df['price'], kde=True, stat='density')
print("Skewness: %f" % df['price'].skew())

Positive Skewness can be corrected by:

i. Square Root

ii. Logarithmic

iii. Reciprocal

# 3. Data Exploration and Visualization

### PLOTTING THE DISTRIBUTION OF  CAR PRODUCTION OVER THE YEARS

In [None]:
# We see 95.8% occupancy in production during 2010-2020
print("Decade wise occupancy\n", df.year_binned.value_counts(normalize=True))

print("\n")

# Share of cars per model year for the last 5 years (2016 onwards), computed
# once and reused for both the printout and the chart
resdf = df[df.year > 2015].year.value_counts(normalize=True)
print("Last 5 years\n", resdf)

# value_counts orders by frequency, so sort by year for a chronological axis.
# The values are proportions (normalize=True), hence the "Share" labels.
resdf.sort_index().plot.bar(rot=0, figsize=(8, 4), color='c', label="Share")
plt.title("Last 5years Production Rate", fontsize=15)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Share of cars', fontsize=12)
plt.tight_layout()
plt.legend()
plt.show()


#### Inference - 

i. 2019 has seen twice the production sales of cars when compared to 2018

ii. 2018 and 2017 almost had same rate of production of cars

### PLOTTING THE BREAKDOWN OF THE CARS SOLD BY BRAND

In [None]:
# Number of rows per brand; the index lists the brand names, most frequent first
pt=df['brand'].value_counts()
pt.index

In [None]:
# Create the axes with plt.subplots so tight_layout can manage them; axes
# added through fig.add_axes are reported by matplotlib as not compatible
# with tight_layout, which made the later call ineffective.
fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(x='brand', data=df, ax=ax)

# Rotate the brand names so neighbouring labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()

In [None]:
#Finding sales of Top 5 brands

print(df.brand.value_counts(normalize=True).head(5))
# Name the index and the counts explicitly. The column labels produced by
# value_counts().reset_index() differ between pandas versions, so renaming
# 'index'/'brand' afterwards can leave two columns both called 'count'.
dfpt = df.brand.value_counts().rename_axis('brand').reset_index(name='count').head(5)

In [None]:
# Horizontal bars: brands run along the y-axis and their counts along the
# x-axis, so the axis labels follow that orientation.
dfpt.plot.barh(x='brand', y='count')
plt.xlabel('Count')
plt.ylabel('Brand')
plt.tight_layout()
plt.show()


#### Inference - 
i. Ford has been the leading Brand with more than 1200+ sales

ii. Nissan and Chevrolet had equal occupancy in sales but below Dodge 

In [None]:
# Ford has the highest production sales, so let's look into its top 'model' for the last decade 2010-2020

# The rows are already restricted to one brand and one year bin, so grouping
# by 'model' alone yields the same counts as grouping by all three keys.
ford_recent = df[(df.brand == 'ford') & (df.year_binned == '2010-2020')]
resdf = (
    ford_recent.groupby('model')
    .size()
    .to_frame('count')
    .reset_index()
    .sort_values('count', ascending=False)
)

# Keep only the models with a count above 50; every model at or below 50 is
# folded into a single 'Others' entry.
resdf.loc[resdf['count'] <= 50, 'model'] = 'Others'

# Sum just the 'count' column per model. Aggregating the whole frame would
# also try to sum the text and categorical columns.
resdf = resdf.groupby('model', as_index=False)['count'].sum()
resdf.head(3)

In [None]:
# Sketching Bar and Pie Charts

# Two stacked panels: a bar chart on top and a pie chart underneath
fig = plt.figure(figsize=(8, 10))
axes = fig.subplots(nrows=2)
bar_ax, pie_ax = axes
fig.subplots_adjust(hspace=0.7)

# Top panel: count per model
model_names = resdf['model']
model_counts = resdf['count']
bar_ax.bar(model_names, model_counts)
bar_ax.tick_params('x', labelsize=12)
bar_ax.set_title('Ford-Model', fontsize=18)
bar_ax.set_xlabel('Model Variants', fontsize=14, labelpad=14)
bar_ax.set_ylabel('Count of Model', fontsize=14)

# Bottom panel: the same counts as percentage shares
pie_ax.pie(model_counts, labels=model_names, autopct='%.1f', textprops={'fontsize': 14})
pie_ax.set_title('Percentage Distribution of Models', fontsize=18)

### Let's compare top Brands

In [None]:
# Rows per (year, brand) from 2016 onwards; modified_year is carried along
# so the same counts can be indexed by datetime as well
after_2015 = df[df.year > 2015]
grouped_sizes = after_2015.groupby(['year', 'brand', 'modified_year']).size()
df1 = grouped_sizes.to_frame('Count').reset_index()

# Brands with more than 50 entries in a year
df2 = df1[df1['Count'] > 50]


# df3 for Bar Graph: one row per year, one column per brand
df3 = pd.pivot_table(df2, index=['year'], columns=['brand'], values='Count', fill_value=0)

# df4 for Line Graph: the same table indexed by the datetime year
df4 = pd.pivot_table(df2, index=['modified_year'], columns=['brand'], values='Count', fill_value=0)

In [None]:
# Grouped bar chart: one group per year, one bar per brand
df3.plot.bar(rot=0,figsize=(8,4))
plt.xlabel('Year',fontsize=14)
plt.ylabel('Count',fontsize=14)

In [None]:
# Line chart of the same counts against the datetime year index
df4.plot.line(rot=0,figsize=(8,4))
plt.xlabel('Year',fontsize=14)
plt.ylabel('Count',fontsize=14)

#### Inference - 

i. Ford has the leading sales numbers compared to its competitors

ii. In 2018, Ford and Nissan almost had same number of models rolling out.

### Checking Avg. Price based on State-Wise distribution  - USA

In [None]:
# Work on a copy so the extra column does not end up in df
df1=df.copy()
# Attach to every row the mean price of its model year
df1['average_year_price']=df1.groupby('year')['price'].transform('mean')
df1.head(3)

In [None]:
# Attach to every row the mean price of its state
df1['average_state_price'] = df1.groupby('state')['price'].transform('mean')
# Drop the Canadian rows; the compared value has a leading space
# NOTE(review): presumably the raw 'country' values carry that leading space - confirm against the data
df1 = df1[df1['country'] != ' canada']
df1['average_state_price'].head(3)

In [None]:
# Create the axes with plt.subplots so tight_layout can manage them; axes
# added through fig.add_axes are reported by matplotlib as not compatible
# with tight_layout.
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(x=df1['state'], y=df1['average_state_price'], ax=ax)

# Rotate the labels first so tight_layout reserves room for the rotated text
plt.setp(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()

### Checking Avg. Price based on Vehicle Color - USA

In [None]:
# Start again from a fresh copy of df
df1=df.copy()
# Attach to every row the mean price of its colour
df1['average_color_price'] = df1.groupby('color')['price'].transform('mean')
# Drop the Canadian rows; the compared value has a leading space
df1 = df1[df1['country'] != ' canada']
df1.head(3)

In [None]:
# Create the axes with plt.subplots so tight_layout can manage them; axes
# added through fig.add_axes are reported by matplotlib as not compatible
# with tight_layout.
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(x=df1['color'], y=df1['average_color_price'], ax=ax)

# Rotate the labels first so tight_layout reserves room for the rotated text
plt.setp(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()

### Price dependancy on Mileage

In [None]:
 df.plot.scatter(x="mileage", y="price")

#### Inference -

i. Most cars ranging upto 20000$ have a better mileage between 0.1 to 0.3

ii. Cars ranging 20000$ and above have mileage less than 0.2

Note: Mileage column had been normalized ranging from (0 to 1)

### Analyzing the Heat Map

In [None]:
# Correlate only the numeric columns and compute the matrix once; on recent
# pandas DataFrame.corr() raises when text columns are present.
corr = df.select_dtypes(include='number').corr()

# Boolean mask for the upper triangle (diagonal included). It is built from
# the shape rather than the values, so a correlation of exactly 0 is hidden too.
matrix = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, annot=True, mask=matrix)

#### Inference - 

i. Year and Price are relatively correlated, meaning change in Year will impact the Price.

ii. Mileage and Year have poor correlation factor.

### Note: How to choose the right Visualization method for Analysis?

1. It's important to know the correlation between the columns that you are dealing with. This can be read from the heat map above, which shows how closely two columns are related.

2.Depending on their Column Type and also based on how the data is scattered, one can decide on the right visualization chart.