# Model In a Single Pipeline

In [20]:
# data preprocessing and feature engineering
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
#####################################################
# Load the raw phone listings from the CSV file in the working directory.
dataset = pd.read_csv("phones.csv")

# Shorten the verbose column headers so they are easier to reference below.
dataset = dataset.rename(
    columns={
        "Storage (GB)": "Storage",
        "RAM (GB)": "RAM",
        "Screen Size (in)": "Screen_size_inches",
        "Internet Network": "Network",
        "Price (USD)": "Price",
        "Performance Score": "Perf_Score",
    }
)

# Rows without a price cannot serve as training targets, so drop them.
dataset = dataset.dropna(subset=["Price"])

# The phone name is not used as a feature.
dataset = dataset.drop(columns="Phone Name")

# Reduce the network label to a number by stripping "G", "LTE" and spaces
# (a label such as "4G LTE" would become "4"; the actual labels in the CSV
# are not visible here).
# Cast to float rather than int: a missing label stays NaN through the
# string operations, and int conversion raises on NaN, which would make the
# Network imputer further down unreachable. With float, NaN is preserved for
# that imputer while a label that is still non-numeric raises as before.
dataset["Network"] = (
    dataset["Network"]
    .str.replace("G", "", regex=False)
    .str.replace("LTE", "", regex=False)
    .str.replace(" ", "", regex=False)
    .astype(float)
)
######################################################
# Separate the label (the price to predict) from the feature columns.
Y = dataset["Price"]
X = dataset.drop(columns=["Price"])
###############################################################################
# Missing values are filled inside the pipeline by per-column transformers.
# Categorical columns: replace missing entries with the constant string
# "missing", then one-hot encode. Categories that were not seen while fitting
# are ignored at transform time instead of raising an error.
cat_features = ["Manufacturer", "OS"]
categ_tranformer = Pipeline(
    [
        ("imputer", SimpleImputer(fill_value="missing", strategy="constant")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Network column: every missing entry is replaced with the constant value 4.
Network = ["Network"]
netw_tranformer = Pipeline(
    [("imputer", SimpleImputer(fill_value=4, strategy="constant"))]
)

# Numeric columns: every missing entry is replaced with the mean of its column.
num_features = ["RAM"]
num_tranformer = Pipeline([("imputer", SimpleImputer(strategy="mean"))])

                
###################################################################
# Combine the per-column transformers into a single preprocessing step that
# fills missing values and converts the features to numbers. Each entry is
# (step name, transformer, list of columns the transformer is applied to).
# remainder="drop" is the ColumnTransformer default, spelled out here: any
# column of X that is not listed in one of the entries is discarded.
preprocessing = ColumnTransformer(
    [
        ("categorical", categ_tranformer, cat_features),
        ("network", netw_tranformer, Network),
        ("numerical", num_tranformer, num_features),
    ],
    remainder="drop",
)
##################################################################
# One fixed seed shared by the train/test split and the random forest, so
# that re-running the cell yields the same partition, the same fitted model
# and therefore the same score. Without it every run reports a different value.
RANDOM_SEED = 42

# Full model: the preprocessing step followed by a random-forest regressor.
# Wrapping both in one Pipeline means fit() and score() apply the same
# preprocessing to whatever data they are given.
model = Pipeline(steps=[
    ("preprocessing", preprocessing),
    ("model", RandomForestRegressor(random_state=RANDOM_SEED)),
])

# Split the dataset: hold out 22% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.22, random_state=RANDOM_SEED
)

# Train on the training part, then evaluate on the held-out part.
# For a regressor, score() returns the coefficient of determination (R^2).
# It is left as the final bare expression so the notebook displays the value.
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.7181601599661249