## Workflow of Scikit-Learn

#0. Get the data ready
#1. Pick a model (to suit the problem)
#2. Fit the model to the training data and make a prediction
#3. Evaluate the model
#4. Improve through experimentation
#5. Save the model and reload your trained model

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## car_sales data: Scikit-Learn workflow

## deal with missing values

In [None]:
# Load the car-sales CSV (the variant that contains missing values) and preview it.
car_sales_missing = pd.read_csv(
    "/kaggle/input/scikitconf/car-sales-extended-missing-data.csv"
)
car_sales_missing

In [None]:
# Count the missing (NaN) entries in each column.
car_sales_missing.isna().sum(axis=0)

# 2 ways to handle missing data
#1. drop them
#2. Imputation: filling the missing values with a substitute value (e.g. the column mean or a constant).

In [None]:
# Separate the target column ("Price") from the feature columns.
y = car_sales_missing["Price"]
X = car_sales_missing.drop(columns="Price")

In [None]:
# Convert the categorical columns to numbers with one-hot encoding.
# NOTE(review): X still contains missing values at this point — confirm whether
# this cell is meant to demonstrate encoding before the data has been cleaned.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_values = ["Make", "Colour", "Doors"]
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), categorical_values)],
    remainder="passthrough",  # keep the non-categorical columns unchanged
)
transformed_X = ct.fit_transform(X)

## Fill the missing data with pandas

In [None]:
# Fill every feature column with its own replacement value in ONE frame-level call.
# The previous form, `car_sales_missing[col].fillna(..., inplace=True)`, is chained
# assignment on a selected column: recent pandas 2.x releases emit a FutureWarning
# for it, and under Copy-on-Write (the default from pandas 3.0) it no longer
# updates `car_sales_missing` at all.
# "Price" is deliberately not listed, so missing targets are left untouched here.
car_sales_missing.fillna(
    value={
        "Make": "missing",
        "Colour": "missing",
        # the mean is computed before any filling happens (NaNs are skipped)
        "Odometer (KM)": car_sales_missing["Odometer (KM)"].mean(),
        "Doors": 4,
    },
    inplace=True,
)

In [None]:
# Re-count the missing entries per column now that the feature columns were filled.
car_sales_missing.isna().sum(axis=0)

In [None]:
# Drop every row that still has a missing value in any column (modifies the frame in place).
car_sales_missing.dropna(axis=0, how="any", inplace=True)

In [None]:
# Per-column count of missing entries after the incomplete rows were dropped.
car_sales_missing.isna().sum(axis=0)

In [None]:
# Rebuild features / target from the cleaned frame: "Price" is the label,
# every other column is a feature.
y = car_sales_missing["Price"]
X = car_sales_missing.drop(columns="Price")

In [None]:
# One-hot encode the categorical columns of the cleaned features.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_values = ["Make", "Colour", "Doors"]
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), categorical_values)],
    remainder="passthrough",  # columns not listed above are passed through unchanged
)
transformed_X = ct.fit_transform(X)
transformed_X

## Fill the missing data with Scikit-Learn

In [None]:
# Re-read the CSV, replacing the frame that was modified in the pandas section,
# then count the missing entries per column.
car_sales_missing = pd.read_csv(
    "/kaggle/input/scikitconf/car-sales-extended-missing-data.csv"
)
car_sales_missing.isna().sum(axis=0)

In [None]:
# Remove the rows whose target label ("Price") is missing; only that column is
# checked, so missing feature values remain and show up in the count below.

car_sales_missing.dropna(axis=0, subset=["Price"], inplace=True)
car_sales_missing.isna().sum(axis=0)

In [None]:
# Split into features (X) and target (y)
y = car_sales_missing["Price"]
X = car_sales_missing.drop(columns="Price")

In [None]:
# Hold out 20% of the rows as a test set.
# The global NumPy seed is fixed first so the shuffle is the same on every run.
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

Note: We split the data into train and test sets *before* filling missing values, so that the fill values are learned from the training set only and then applied to the test set.

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# One imputer per kind of column:
#   - categorical text columns -> the constant string "missing"
#   - door count               -> the constant 4
#   - numerical column         -> the column mean

num_imputer = SimpleImputer(strategy="mean")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")

In [None]:
# Column groups, one per imputation strategy

num_features = ["Odometer (KM)"]
door_feature = ["Doors"]
cat_features = ["Make", "Colour"]

In [None]:
# Route each column group to its imputer. The output columns follow the order
# in which the transformers are listed: Make, Colour, Doors, Odometer (KM).
imputer = ColumnTransformer(
    transformers=[
        ("cat_imputer", cat_imputer, cat_features),
        ("door_imputer", door_imputer, door_feature),
        ("num_imputer", num_imputer, num_features),
    ]
)

# Fit on the training split only, then apply the fitted imputers to the test split.
filled_X_train = imputer.fit_transform(X_train)
filled_X_test = imputer.transform(X_test)
filled_X_train

In [None]:
# Convert the imputed arrays back into DataFrames.
# The imputing ColumnTransformer emits its columns in the order its transformers
# were listed (categorical, doors, numerical), so the labels are built from those
# same feature lists rather than being typed out by hand for each frame. This keeps
# the column names correct if the feature lists are ever edited.

car_sales_filled_train = pd.DataFrame(
    filled_X_train,
    columns=cat_features + door_feature + num_features,
)

car_sales_filled_test = pd.DataFrame(
    filled_X_test,
    columns=cat_features + door_feature + num_features,
)

In [None]:
# One-hot encode the categorical features of the imputed train / test frames.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_values = ["Make", "Colour", "Doors"]

# The encoder learns its categories from the training split only. With
# handle_unknown="ignore", a category that occurs only in the test split is
# encoded as all zeros instead of raising an error in `ct.transform`.
ct = ColumnTransformer(
    transformers=[
        ("encoder", OneHotEncoder(handle_unknown="ignore"), categorical_values),
    ],
    remainder="passthrough",  # the remaining column(s) are passed through unchanged
)
transformed_X_train = ct.fit_transform(car_sales_filled_train)
transformed_X_test = ct.transform(car_sales_filled_test)
transformed_X_train.toarray()

In [None]:
# Train a random-forest regressor on the encoded training data, then score it
# on the encoded test data.

from sklearn.ensemble import RandomForestRegressor

# Seed the global NumPy RNG so the forest is built the same way on every run.
np.random.seed(42)

model = RandomForestRegressor()
model.fit(transformed_X_train, y_train)

model.score(transformed_X_test, y_test)

## Choosing the Right estimators for your Problem

Scikit-Learn uses the term *estimator* for a machine learning algorithm or model.
* Classification - predicting whether a sample is one thing or another
* Regression - predicting a number