# Data Analysis using mtcars dataset

## Feature Info

- mpg: Miles per Gallon (연비)
- cyl: Number of cylinders (엔진 실린더 개수)
- disp: Displacement (엔진 배기량)
- hp: Gross horsepower (최대 출력 마력)
- drat: Rear axle ratio (후륜 비율)
- wt: Weight (자동차의 무게)
- qsec: 1/4 mile time (가속 시간)
- vs: Engine shape (0 = V-shaped, 1 = straight) (V/S 엔진 유형)
- am: Transmission (0 = automatic, 1 = manual)
- gear: Number of forward gears (전진 기어 수)
- carb: Number of carburetors (기화기 수)

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Relative path of the mtcars CSV file that the notebook loads.
DATASET_PATH = "w1m1_mtcars.csv"

In [None]:
# Read the CSV at DATASET_PATH into the DataFrame used by every later cell.
df = pd.read_csv(DATASET_PATH)

## head, tail, ...

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# Preview the last rows of the DataFrame.
df.tail()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column labels as they were read from the CSV.
df.columns

In [None]:
# Rename the "Unnamed: 0" column to "car_model"; inplace=True mutates df itself.
# NOTE(review): presumably this is the header-less column of car names — confirm against the CSV.
df.rename(columns={"Unnamed: 0": "car_model"}, inplace=True)

In [None]:
# Show the column labels again to check the rename.
df.columns

In [None]:
# df.info() shows the column names, data types, number of entries, and so on.
df.info()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# df.describe() shows descriptive statistics for the numeric columns.
df.describe()

## Gear, Transmission

In [None]:
# Report how many distinct values the gear and transmission (am) columns hold.
gear_levels = df["gear"].unique()
transmission_levels = df["am"].unique()

print("Number of unique values in gear columns:", len(gear_levels))
print("Number of unique values in transmission columns:", len(transmission_levels))

In [None]:
# Visualize the gear x transmission counts with pd.crosstab
# (rows: gear, columns: am), drawn as a grouped bar chart.
aggregation = pd.crosstab(df["gear"], df["am"])
# Replace the raw 0/1 transmission codes so the legend is readable.
aggregation = aggregation.rename(columns={0: "Automatic", 1: "Manual"})

fig, ax = plt.subplots(figsize=(10, 6))
aggregation.plot(kind="bar", ax=ax)
# Use the Axes API throughout instead of mixing it with pyplot state calls.
ax.set_title("# of Cars by Gear x Transmission")
ax.set_xlabel("Gear x Transmission")
ax.set_ylabel("# of Cars")
# End the cell with plt.show() so no Text(...) repr is echoed as cell output.
plt.show()

In [None]:
# Same chart without pd.crosstab: count rows per (gear, am) pair,
# then build one "(gear-transmission)" label per bar.
aggregation = (
    df.groupby(["gear", "am"])
    .size()
    .reset_index(name="count")
)
aggregation["am"] = aggregation["am"].replace({0: "Automatic", 1: "Manual"})

gear_text = aggregation["gear"].astype(str)
aggregation["aggr_name"] = "(" + gear_text + "-" + aggregation["am"] + ")"

plt.bar(x=aggregation["aggr_name"], height=aggregation["count"])
plt.title("# of Cars by Gear x Transmission")
plt.xlabel("Gear x Transmission")
plt.ylabel("# of Cars")

In [None]:
# A prettier version of the chart: one color per bar, a legend, and count labels.
colors = ["skyblue", "salmon", "lightgreen", "violet"]
bar_names = aggregation["aggr_name"]
bar_heights = aggregation["count"]

plt.bar(
    bar_names,
    bar_heights,
    color=colors,
    alpha=0.4,
    label=bar_names,  # one legend entry per bar
)
plt.title("# of Cars by Gear x Transmission")
plt.xlabel("Gear x Transmission")
plt.ylabel("# of Cars")

# Write each bar's count just above its top edge.
for position, n_cars in enumerate(bar_heights):
    plt.text(position, n_cars + 0.1, n_cars, ha="center")

plt.legend()
plt.tight_layout()
plt.show()

## Histograms of Numeric Columns

In [None]:
# Draw one histogram per numeric column on a three-column grid.
# The grid is sized from the number of numeric columns, so it never overflows.
numeric_cols = df.select_dtypes(include="number").columns
n_cols = 3
# Ceiling division; keep at least one row so plt.subplots always gets a valid grid.
n_rows = max(1, (len(numeric_cols) + n_cols - 1) // n_cols)

# Height scales with the row count (4 rows -> the original 8x8 figure).
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(8, 2 * n_rows),
    squeeze=False,  # always a 2-D array, even for a single row
)
flat_axes = axes.flatten()

for ax, col in zip(flat_axes, numeric_cols):
    ax.hist(df[col])
    ax.set_title(col)
    ax.grid()

# Remove grid cells that did not receive a histogram.
for unused_ax in flat_axes[len(numeric_cols):]:
    fig.delaxes(unused_ax)

fig.suptitle("Histograms of Numeric Columns")
plt.tight_layout()
plt.show()

## Cylinder vs HorsePower

In [None]:
# Scatter plot of horsepower (y) against cylinder count (x).
plt.scatter(df["cyl"], df["hp"])
plt.title("Cylinder vs Horsepower")
# Label both axes so the plot is readable without relying on the title's word order.
plt.xlabel("Number of cylinders (cyl)")
plt.ylabel("Gross horsepower (hp)")
plt.show()

## MPG vs HorsePower

In [None]:
# Scatter plot of horsepower (y) against fuel economy (x).
plt.scatter(df["mpg"], df["hp"])
plt.title("MPG vs Horsepower")
# Label both axes so the plot is readable without relying on the title's word order.
plt.xlabel("Miles per gallon (mpg)")
plt.ylabel("Gross horsepower (hp)")
plt.show()

## Visualize correlation

In [None]:
# Correlation heatmap over every column except the car-model label column.
sub_df = df.drop(columns=["car_model"])
corr_matrix = sub_df.corr()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")

## Advanced Study

In [None]:
def extract_corr_between_two_cols(df: pd.DataFrame, col1: str, col2: str) -> None:
    """Show a regression plot of two columns, titled with their correlation.

    Args:
        df: DataFrame containing both columns.
        col1: Name of the column plotted on the x-axis.
        col2: Name of the column plotted on the y-axis.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    figure, axis = plt.subplots(figsize=(6, 6))
    sns.regplot(data=df, x=col1, y=col2, ax=axis)

    # Series.corr is called with its default settings.
    coefficient = df[col1].corr(df[col2])

    axis.set_title(f"Corr:{coefficient}")
    plt.tight_layout()
    plt.show()

In [None]:
# Regression plot and correlation for mpg (x-axis) vs hp (y-axis).
extract_corr_between_two_cols(df, "mpg", "hp")