
# 🧠 Desafio Final – Bootcamp Engenheiro(a) de Machine Learning
Este notebook apresenta a resolução completa do desafio final do módulo de **Engenharia de Machine Learning**, utilizando o dataset `cars.csv`.  
O pipeline segue os **7 passos fundamentais do Engenheiro de Machine Learning**, contemplando desde a análise exploratória até a modelagem supervisionada.


## 🥇 Passo 1 – Compreensão do problema e dos dados

In [None]:

import pandas as pd
import numpy as np

# Load the dataset (adjust the path to match your environment).
# NOTE(review): the path looks like a Google Drive mount inside Colab —
# confirm it exists in your runtime before running this cell.
df = pd.read_csv('/content/drive/MyDrive/cars.csv')

# First look at the data: shape, column names and the first rows.
# NOTE(review): `display` is not imported here; presumably it is the helper
# injected by the notebook (IPython) environment.
print("Formato do dataset:", df.shape)
print("\nColunas dispon√≠veis:\n", df.columns.tolist())
display(df.head())
# Per-column summary from DataFrame.info(), then descriptive statistics.
print("\nInforma√ß√µes gerais:")
df.info()
display(df.describe())


## 🧩 Passo 2 – Coleta e exploração inicial do dataset

In [None]:

# Data-quality overview: missing values per column and fully duplicated rows.
missing_per_column = df.isnull().sum()
print("Valores ausentes por coluna:")
print(missing_per_column)

n_duplicated_rows = df.duplicated().sum()
print("\nLinhas duplicadas:", n_duplicated_rows)

# Cardinality of every column, computed in a single pass over the frame.
print("\nValores √∫nicos por coluna:")
for col, n_unique in df.nunique().items():
    print(f"{col}: {n_unique} valores √∫nicos")

# Split the columns into numeric and non-numeric groups by dtype.
num_cols = list(df.select_dtypes(include="number").columns)
cat_cols = list(df.select_dtypes(exclude="number").columns)

print("\nVari√°veis num√©ricas:", num_cols)
print("Vari√°veis categ√≥ricas:", cat_cols)


## 🧹 Passo 3 – Pré-processamento (limpeza com `pd.to_numeric` e normalização com `StandardScaler`)

In [None]:

from sklearn.preprocessing import StandardScaler

# Clean the columns that may arrive as text: drop thousands separators and
# surrounding blanks, then convert to numbers. Anything that still cannot be
# parsed becomes NaN (errors='coerce') and is imputed with the column mean.
for col in ['cubicinches', 'weightlbs']:
    cleaned = df[col].astype(str).str.replace(',', '', regex=False).str.strip()
    df[col] = pd.to_numeric(cleaned, errors='coerce')
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the in-place call acts on a temporary Series, which triggers a
    # FutureWarning in pandas 2.x and leaves `df` untouched under
    # Copy-on-Write, so the NaNs would otherwise survive into the scaler.
    df[col] = df[col].fillna(df[col].mean())

# Derive the efficiency class from mpg: (0, 20] -> Low, (20, 30] -> Medium,
# above 30 -> High. The last edge is open-ended so the bins stay strictly
# increasing even if no car exceeds 30 mpg (using mpg.max() as the edge would
# make pd.cut raise ValueError in that case).
df['efficiency'] = pd.cut(
    df['mpg'],
    bins=[0, 20, 30, float('inf')],
    labels=['Low', 'Medium', 'High'],
)

# Standardize the numeric predictors into a copy, keeping `df` in its
# original units.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split done in the modelling step — confirm this is acceptable
# for the exercise, since test rows influence the scaling statistics.
scaler = StandardScaler()
num_features = ['cylinders', 'cubicinches', 'hp', 'weightlbs', 'time-to-60']

df_scaled = df.copy()
df_scaled[num_features] = scaler.fit_transform(df[num_features])

print("Tipos de dados ap√≥s convers√£o:\n", df_scaled.dtypes)
display(df_scaled.head())


## 🔍 Passo 4 – Análise exploratória e correlações

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation matrix between mpg and the numeric predictors.
corr_columns = ['mpg', 'cylinders', 'cubicinches', 'hp', 'weightlbs', 'time-to-60']
corr = df_scaled[corr_columns].corr()

# Rank every variable by its correlation with mpg, strongest positive first.
mpg_ranking = corr['mpg'].sort_values(ascending=False)
print("Correla√ß√£o com a vari√°vel mpg:\n")
print(mpg_ranking)

# Annotated heatmap; the diverging palette is centred on zero so the sign of
# each correlation is readable from the colour.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='RdBu', center=0, ax=ax)
ax.set_title('Matriz de Correla√ß√£o ‚Äì Vari√°veis Num√©ricas')
plt.show()


## 🧭 Passo 5 – Redução de dimensionalidade com PCA

In [None]:

from sklearn.decomposition import PCA

# Project the standardized predictors onto their principal components.
pca_features = ['cylinders', 'cubicinches', 'hp', 'weightlbs', 'time-to-60']
X = df_scaled[pca_features]
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X)

# Share of variance captured by each component, plus the running total.
explained_var = pca.explained_variance_ratio_
cumulative_var = explained_var.cumsum()
print("Vari√¢ncia explicada por componente:\n", explained_var)
print("\nVari√¢ncia acumulada:", cumulative_var)

# Cumulative explained variance against the number of components kept.
component_counts = range(1, len(explained_var) + 1)
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(component_counts, cumulative_var, marker='o')
ax.set_xlabel('N√∫mero de Componentes Principais')
ax.set_ylabel('Vari√¢ncia Acumulada Explicada')
ax.set_title('PCA ‚Äì Vari√¢ncia Explicada Acumulada')
ax.grid(True)
plt.show()


## 🧮 Passo 6 – Clusterização com K-Means

In [None]:

from sklearn.cluster import KMeans

# Cluster the vehicles using only the first two principal components.
X_pca2 = X_pca[:, :2]
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10).fit(X_pca2)
clusters = kmeans.labels_

# Keep the cluster assignment next to the scaled features.
df_scaled['cluster'] = clusters

# Scatter of the two components, coloured by assigned cluster.
pc1, pc2 = X_pca2[:, 0], X_pca2[:, 1]
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(pc1, pc2, c=clusters, cmap='viridis', s=60)
ax.set_title('Clusters de Ve√≠culos (K-Means + PCA)')
ax.set_xlabel('Componente Principal 1')
ax.set_ylabel('Componente Principal 2')
ax.grid(True)
plt.show()

# Number of vehicles that landed in each cluster.
cluster_sizes = df_scaled['cluster'].value_counts()
print("Distribui√ß√£o de ve√≠culos por cluster:")
print(cluster_sizes)


## 🧠 Passo 7 – Modelagem supervisionada e avaliação

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict the efficiency class from the standardized numeric predictors.
X = df_scaled[['cylinders', 'cubicinches', 'hp', 'weightlbs', 'time-to-60']]
y = df_scaled['efficiency']

# 70/30 hold-out split with a fixed seed so the results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Model 1: decision tree.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)

# Model 2: logistic regression (max_iter raised to 500).
log_reg = LogisticRegression(random_state=42, max_iter=500)
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

print("=== √Årvore de Decis√£o ===")
print("Acur√°cia:", accuracy_score(y_test, y_pred_tree))
print(classification_report(y_test, y_pred_tree))

print("\n=== Regress√£o Log√≠stica ===")
print("Acur√°cia:", accuracy_score(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))

import seaborn as sns

# Side-by-side confusion matrices. The class order is taken from the fitted
# model and passed explicitly to confusion_matrix, so the rows/columns are
# guaranteed to match the tick labels; without labels the heatmaps only show
# 0..n-1 indices and cannot be read.
class_labels = list(tree.classes_)
panels = [
    ('√Årvore de Decis√£o', y_pred_tree, 'Blues'),
    ('Regress√£o Log√≠stica', y_pred_log, 'Greens'),
]
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (title, y_pred, cmap) in zip(axes, panels):
    matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap=cmap,
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_title(title)
    # confusion_matrix puts true classes on rows and predictions on columns.
    ax.set_xlabel('Classe prevista')
    ax.set_ylabel('Classe real')
plt.tight_layout()
plt.show()
