# Getting Set Up


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
# from scipy.stats import pearsonr
import scipy.stats as stats 
from sklearn import preprocessing

In [None]:
import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the car-price data from the Kaggle input directory.
# Other locations that have been used for this file:
#   /Users/jimcody/Documents/CarPrices.csv   (local copy)
#   ../input/home-data-for-ml-course/train.csv   (example of a relative Kaggle path)
df = pd.read_csv(filepath_or_buffer='/kaggle/input/estimate-car-price/CarPrices.csv')

In [None]:
# df = pd.read_csv('/kaggle/input/carpricesjc/CarPrices.csv')

In [None]:
df.head(n=5)    # Preview the first five rows

In [None]:
df.tail(n=5)    # Preview the last five rows

In [None]:
df.shape     # (number of rows, number of columns) in the dataset

In [None]:
# df.drop(df[df.price > 25000].index, inplace=True)

In [None]:
df.info()    # Column dtypes and non-null counts as pandas sees them.
             # If the non-null count shown for a column is lower than the number of rows
             # reported by df.shape, that column has missing data.

In [None]:
df.describe()     # Basic summary statistics (count, mean, std, min, quartiles, max) for the quantitative columns

# A little data cleaning

1. Change a column's type
2. Check for missing values
3. Check for duplicate data
4. Remove data that will not add value (e.g., car id)

### Changing the column type

In [None]:
# The data dictionary describes symboling as a categorical variable, so store it as
# text instead of as a number, then re-check the dtypes.

df['symboling'] = df.symboling.astype('str')
df.info()

### Missing Data

In [None]:
# Number of missing values in each column.

df.isna().sum()

In [None]:
# Share of missing values per column, expressed as a percentage.

df_null = df.isnull().mean().round(4).mul(100)

# Show the five columns with the highest percentage of missing values.
df_null.sort_values(ascending=False).head(5)

In [None]:
# Visual map of missing values: one cell per (row, column), no colour bar.

sns.heatmap(data=df.isna(), cbar=False)

### Remove Rows (including duplicates)

In [None]:
# List rows that repeat an earlier row exactly.

df[df.duplicated(keep='first')]

In [None]:
# Remove duplicated rows from df in place, retaining the first occurrence of each.

df.drop_duplicates(inplace=True, keep='first')

# keep selects which copy survives: 'first' (the pandas default), 'last', or False to drop every copy.

In [None]:
# Re-run the duplicate check after dropping duplicates.
df[df.duplicated(keep='first')]

In [None]:
df.shape  # (rows, columns) after the duplicate rows were dropped

In [None]:
# df.drop(df.index[205])   # Drop a specific row index  This is not working properly

### Removing a single column

In [None]:
# Drop the car_ID column and keep the result as the working DataFrame.
df = df.drop(columns=['car_ID'])
df.head(n=5)

### Removing multiple columns

In [None]:
# Columns removed from the working DataFrame. Each name is listed once, and a list
# (rather than a set) keeps the order deterministic.
drop_columns = ['symboling', 'wheelbase', 'compressionratio', 'stroke', 'boreratio']

# drop() returns a new DataFrame; because the result is assigned back to df, these
# columns are no longer available in df from this point on.
df = df.drop(columns=drop_columns)
df.head()

# Exploratory Data Analysis

# Examine the Dependent Variable - Price

In [None]:
# Average price per CarName value, highest first.
dfx = df.groupby(['CarName'])['price'].mean().sort_values(ascending=False).to_frame()

# pandas opens its own figure for .plot.bar(), so the size has to be passed here;
# a separate plt.figure(...) call would only leave an empty figure behind.
dfx.plot.bar(figsize=(30, 10))
plt.title('Car Company Name vs Average Price')
plt.show()

In [None]:
# Summary statistics for the dependent variable, price.
df.price.describe()

# Rounded alternative: round(df['price'].describe(), 2)

In [None]:
# Plot a histogram of price.
# sns.distplot is deprecated and absent from current seaborn; histplot with
# stat='density' gives the same density-scaled histogram with a KDE overlay.
sns.histplot(df['price'], kde=True, stat='density', kde_kws={'cut': 3}).set_title('Histogram of Price')

In [None]:
# Basic distribution (histogram) plot with explicit control over the number of bins.
# sns.distplot is deprecated and absent from current seaborn; histplot with
# stat='density' gives the same density-scaled histogram with a KDE overlay.

plt.figure(figsize=(8,8))
plt.title('Car Price Distribution Plot')
sns.histplot(df['price'], bins=50, kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# Single box plot of price.

sns.boxplot(data=df, y='price', palette='rainbow')
# Variant that changes the orientation:
# sns.boxplot(y="price", data=df,palette='rainbow',orient='h')

In [None]:
# Price box plot with explicit styling, a title and axis labels.

outliers = ['price']
plt.rcParams.update({'figure.figsize': [5, 5]})   # applies to later figures as well
sns.boxplot(
    data=df.loc[:, outliers],
    orient='v',
    palette='Set3',
    whis=1.5,          # whiskers extend 1.5 x IQR beyond the quartiles
    saturation=1,
    width=0.7,
)
plt.title('Price Boxplot', fontsize=14, fontweight='bold')
plt.ylabel('Price Range', fontweight='bold')
plt.xlabel('Continuous Variable', fontweight='bold')
df.shape

In [None]:
# One price box per carbody category.

sns.boxplot(data=df, x='carbody', y='price', palette='rainbow')

In [None]:
# Violinplot stratified by carbody: x='carbody' draws one violin per body style
# (without it a single violin for all cars is drawn).

sns.violinplot(x='carbody', y='price', data=df, palette='rainbow')

# Case Study Histograms

In [None]:
# Plot a histogram of curbweight.
# sns.distplot is deprecated and absent from current seaborn; histplot with
# stat='density' gives the same density-scaled histogram with a KDE overlay.
sns.histplot(df['curbweight'], kde=True, stat='density', kde_kws={'cut': 3}).set_title('Histogram of Curb Weight')

In [None]:
# Plot a histogram of citympg.
# sns.distplot is deprecated and absent from current seaborn; histplot with
# stat='density' gives the same density-scaled histogram with a KDE overlay.
sns.histplot(df['citympg'], kde=True, stat='density', kde_kws={'cut': 3}).set_title('Histogram of City MPG')

In [None]:
# Plot a histogram of carheight.
# sns.distplot is deprecated and absent from current seaborn; histplot with
# stat='density' gives the same density-scaled histogram with a KDE overlay.
sns.histplot(df['carheight'], kde=True, stat='density', kde_kws={'cut': 3}).set_title('Histogram of Car Height')

# Case Study Boxplots

In [None]:
# Box plot of curb weight with a title and y-axis label.
outliers = ['curbweight']
plt.rcParams.update({'figure.figsize': [5, 5]})
sns.boxplot(
    data=df.loc[:, outliers],
    orient='v',
    palette='Set3',
    whis=1.5,
    saturation=1,
    width=0.7,
)
plt.title('Curb Weight Boxplot', fontsize=14, fontweight='bold')
plt.ylabel('Curb Weight Range', fontweight='bold')
# Optional extras, currently disabled:
# plt.xlabel("Continuous Variable", fontweight = 'bold')
# df.shape

In [None]:
# Box plot of city MPG with a title and y-axis label.
outliers = ['citympg']
plt.rcParams.update({'figure.figsize': [5, 5]})
sns.boxplot(
    data=df.loc[:, outliers],
    orient='v',
    palette='Set3',
    whis=1.5,
    saturation=1,
    width=0.7,
)
plt.title('City MPG Boxplot', fontsize=14, fontweight='bold')
plt.ylabel('City MPG Range', fontweight='bold')

In [None]:
# Box plot of car height with a title and y-axis label.
outliers = ['carheight']
plt.rcParams.update({'figure.figsize': [5, 5]})
sns.boxplot(
    data=df.loc[:, outliers],
    orient='v',
    palette='Set3',
    whis=1.5,
    saturation=1,
    width=0.7,
)
plt.title('Car Height Boxplot', fontsize=14, fontweight='bold')
plt.ylabel('Car Height Range', fontweight='bold')

# Case Study IV/DV Relationships

In [None]:
# Grid of pairwise plots across the columns of df.
sns.pairplot(data=df)

In [None]:
# Calculate correlations between the numeric columns only. df still contains text
# columns (e.g. carbody), and DataFrame.corr() in pandas 2.x raises on non-numeric
# data instead of silently skipping it.
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(12,8))
plt.title('Quantitative Variables Correlation')

# Heatmap annotated with the correlation coefficients
sns.heatmap(corr, cmap='plasma', annot=True)

In [None]:
# Price against curb weight, with a fitted regression line.
sns.jointplot(data=df, x='curbweight', y='price', kind='reg')

In [None]:
# Price against city MPG, with a fitted regression line.
sns.jointplot(data=df, x='citympg', y='price', kind='reg')

In [None]:
# Price against car height, with a fitted regression line.
sns.jointplot(data=df, x='carheight', y='price', kind='reg')

# Case Study IV/IV Relationships

In [None]:
# City MPG against curb weight (two independent variables), with a regression line.
sns.jointplot(data=df, x='curbweight', y='citympg', kind='reg')

In [None]:
# Car height against curb weight (two independent variables), with a regression line.
sns.jointplot(data=df, x='curbweight', y='carheight', kind='reg')

In [None]:
# Car height against city MPG (two independent variables), with a regression line.
sns.jointplot(data=df, x='citympg', y='carheight', kind='reg')

# Case Study Simple Linear Regressions

In [None]:
# Simple linear regression: price explained by citympg.
model = sm.OLS.from_formula(formula='price ~ citympg', data=df)
result = model.fit()
result.summary()

In [None]:
# Simple linear regression: price explained by curbweight.
model = sm.OLS.from_formula(formula='price ~ curbweight', data=df)
result = model.fit()
result.summary()

In [None]:
# Simple linear regression: price explained by carheight.
model = sm.OLS.from_formula(formula='price ~ carheight', data=df)
result = model.fit()
result.summary()

# Case Study Multiple Regressions

In [None]:
# Multiple regression: price explained by carheight and curbweight.
model = sm.OLS.from_formula(formula='price ~ carheight+curbweight', data=df)
result = model.fit()
result.summary()

In [None]:
# Multiple regression: price explained by carheight and citympg.
model = sm.OLS.from_formula(formula='price ~ carheight+citympg', data=df)
result = model.fit()
result.summary()

In [None]:
# Multiple regression: price explained by citympg and curbweight.
model = sm.OLS.from_formula(formula='price ~ citympg+curbweight', data=df)
result = model.fit()
result.summary()

In [None]:
# Multiple regression: price explained by carheight, curbweight and citympg together.
model = sm.OLS.from_formula(formula='price ~ carheight+curbweight+citympg', data=df)
result = model.fit()
result.summary()

# Add Categorical Data to the Regression

In [None]:
# Preview the first five rows before adding a categorical predictor.
df.head(n=5)

In [None]:
# Regression of price on curbweight plus the carbody column.
model = sm.OLS.from_formula(formula='price ~ curbweight + carbody', data=df)
result = model.fit()
result.summary()

# Some additional data manipulation

## Creating variables to hold the categorical data and the quantitative data

In [None]:
# Split df into its object-dtype (categorical) columns and all remaining
# (numerical) columns, keeping both the sub-frames and their column labels.

df_cat = df.select_dtypes(include=['object'])
cat_col = df_cat.columns
df_num = df.select_dtypes(exclude=['object'])
num_col = df_num.columns

### Fixing car names

In [None]:
# Extracting Car Company from the CarName as per direction in Problem.
# The company is the text before the first space. split(..., expand=True) would return
# a multi-column DataFrame, which recent pandas refuses to assign to the single
# CarName column, so take the first token of each split with .str[0] instead.

df['CarName'] = df['CarName'].str.split(' ', n=1).str[0]

In [None]:
# Distinct car company names currently present in CarName.

df.CarName.unique()

**Typos in car company names (misspelling → correct name)**
- maxda → mazda
- Nissan → nissan
- porcshce → porsche
- toyouta → toyota
- vokswagen, vw → volkswagen

In [None]:
# Renaming the typo errors in Car Company names (misspelling -> canonical name).
# Canonical names are all lowercase, so the capitalised 'Nissan' is folded into
# 'nissan' rather than the other way round.

df['CarName'] = df['CarName'].replace({
    'maxda': 'mazda',
    'Nissan': 'nissan',
    'porcshce': 'porsche',
    'toyouta': 'toyota',
    'vokswagen': 'volkswagen',
    'vw': 'volkswagen',
})

In [None]:
# Distinct car company names after the replacements.
df.CarName.unique()

In [None]:
# Binning the Car Companies based on avg prices of each car Company.

df['price'] = df['price'].astype('int')   # note: truncates any fractional part of price

# Average price per company (indexed by CarName).
t = df.groupby(['CarName'])['price'].mean()

# Look the company average up for every row. map() keeps df's own index, so the
# categories line up with the right rows even when the index has gaps (e.g. after
# drop_duplicates); a merge() would renumber the rows 0..n-1 and misalign them.
company_avg_price = df['CarName'].map(t)

# right=False makes the intervals [0, 10000), [10000, 20000), [20000, 40000);
# a company average of 40000 or more falls outside every bin and becomes NaN.
bins = [0,10000,20000,40000]
label =['Budget_Friendly','Medium_Range','TopNotch_Cars']
df['Cars_Category'] = pd.cut(company_avg_price, bins, right=False, labels=label)
df.head()