# Training and testing an Scikit learn model

## 1. loading and preparing the dataset(test and train dataset)

In [42]:
# import necesssary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [43]:
phone_ds = pd.read_csv("./../phones.csv")
#add an extra column
phone_ds["Generation"] = np.array([5,4,3,5,6,7,4,3,6,5])
phone_ds

Unnamed: 0,Phone Name,Manufacturer,Storage,Memory,Price (USD),Internet Network,Generation
0,Galaxy S22 Ultra,Samsung,512GB,12GB,1099,5G,5
1,iPhone 14 Pro,Apple,256GB,6GB,999,5G,4
2,Pixel 7,Google,128GB,8GB,599,5G,3
3,OnePlus 10T,OnePlus,256GB,16GB,649,5G,5
4,Redmi Note 11,Xiaomi,128GB,4GB,249,4G LTE,6
5,Moto G Power,Motorola,64GB,4GB,199,4G LTE,7
6,Nokia G400,Nokia,128GB,6GB,299,5G,4
7,P50 Pro,Huawei,256GB,8GB,899,4G LTE,3
8,Oppo Find X5,Oppo,256GB,12GB,799,5G,6
9,Vivo X80,Vivo,512GB,12GB,749,5G,5


In [45]:
phone_ds['Storage'] = phone_ds['Storage'].str.replace("GB","",regex=False).astype(int) # they are done once

In [46]:
phone_ds['Memory'] = phone_ds['Memory'].str.replace("GB","",regex=False).astype(int) # this is also done

In [44]:
# also convert the network type to integers
phone_ds['Internet Network'] = phone_ds['Internet Network'].str.replace("G","",regex=False)
phone_ds['Internet Network'] = phone_ds['Internet Network'].str.replace("LTE","",regex=False).astype(int)

In [47]:
phone_ds.dtypes

Phone Name          object
Manufacturer        object
Storage              int64
Memory               int64
Price (USD)          int64
Internet Network     int64
Generation           int64
dtype: object

### sklearn models donot train on datasets having String or Object Data types, if you try to train the upper dataset you will get error
### so what we will do here is to encode the "strings" and "object" data types, using oneHotEncoding technique of SKLearn library

In [48]:
# LET ME PREPARE THE X AND Y TEST TRAIN DATASETS
X = phone_ds.drop("Price (USD)",axis=1)
Y = phone_ds["Price (USD)"]

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
# FIT THE MODEL
from sklearn.ensemble import RandomForestRegressor
"""does not work until you encode and then transform the column having string values into float"""
# RFR = RandomForestRegressor() d
# RFR.fit(x_train,y_train)
# RFR.score(x_test,y_test)


'does not work until you encode and then transform the column having string values into float'

In [51]:
# import the required libraries/packages I will use for encoding and transformation
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# step1 : select the features you want to encode
encode_features = ["Phone Name","Manufacturer"]

# step 2: make an instance of the oneHotEncoder and transformer class respectively
OneHen = OneHotEncoder()

# creating the transformer instance with arguments as follows
# 1. one_hot_encode as the transformer name for future use
# 2. the name of the encoder
# 3. name of the features to be encoded and
# and one other arg/parameter is the remainder, which take cares of the remaining features of the dataset

transformer = ColumnTransformer([("one_hot_encode",OneHen,encode_features)],
                               remainder = "passthrough")

# performing the transform operation on the X data component of the dataset

transformed_X = transformer.fit_transform(X)

# turn into a dataframe
pd.DataFrame(transformed_X)


Unnamed: 0,0
0,<Compressed Sparse Row sparse matrix of dtype ...
1,<Compressed Sparse Row sparse matrix of dtype ...
2,<Compressed Sparse Row sparse matrix of dtype ...
3,<Compressed Sparse Row sparse matrix of dtype ...
4,<Compressed Sparse Row sparse matrix of dtype ...
5,<Compressed Sparse Row sparse matrix of dtype ...
6,<Compressed Sparse Row sparse matrix of dtype ...
7,<Compressed Sparse Row sparse matrix of dtype ...
8,<Compressed Sparse Row sparse matrix of dtype ...
9,<Compressed Sparse Row sparse matrix of dtype ...


## second method of transformation

In [53]:
_transform = pd.get_dummies(phone_ds[["Phone Name","Manufacturer"]])
_transform

Unnamed: 0,Phone Name_Galaxy S22 Ultra,Phone Name_Moto G Power,Phone Name_Nokia G400,Phone Name_OnePlus 10T,Phone Name_Oppo Find X5,Phone Name_P50 Pro,Phone Name_Pixel 7,Phone Name_Redmi Note 11,Phone Name_Vivo X80,Phone Name_iPhone 14 Pro,Manufacturer_Apple,Manufacturer_Google,Manufacturer_Huawei,Manufacturer_Motorola,Manufacturer_Nokia,Manufacturer_OnePlus,Manufacturer_Oppo,Manufacturer_Samsung,Manufacturer_Vivo,Manufacturer_Xiaomi
0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
4,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True
5,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
6,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
7,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False
8,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
9,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False


## lets fit the model again

In [91]:
x_train,x_test,y_train,y_test = train_test_split(transformed_X,Y,test_size=0.3)
RFR = RandomForestRegressor()
RFR.fit(x_train,y_train)
RFR.score(x_test,y_test)


0.622570238095238