Importing the Libraries

In [21]:
import numpy as np
import pandas as pd


Importing the Dataset

In [22]:
# Load the CSV into a DataFrame, then split it column-wise:
# every column except the last forms the feature matrix X,
# and the last column forms the target vector Y.
dataset = pd.read_csv('Best Selling Mobile Phones.csv')
X, Y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [23]:
# Show the feature matrix exactly as loaded, before any preprocessing.
print(X)

[[1 ' iPhone 12 and iPhone 12 Mini ' 'Apple' 38.0]
 [2 ' Galaxy S20, S20+, S20 Ultra ' 'Samsung' 28.0]
 [3 ' iPhone SE 2nd generation ' 'Apple' 24.2]
 [4 ' Galaxy A21s ' 'Samsung' 19.4]
 [5 ' iPhone 12 Pro Max ' 'Apple' 16.8]
 [6 ' Galaxy A11 ' 'Samsung' 15.3]
 [7 ' Redmi Note 9 Pro ' 'Xiami' 15.0]]


In [24]:
# Show the target vector (the last column of the CSV).
print(Y)

['Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes']


Taking care of missing data

In [25]:
from sklearn.impute import SimpleImputer

# Replace missing numeric entries with the mean of their column.
# X has two numeric columns: 0 (the running rank 1..7) and 3 (the float-valued
# column, presumably a sales figure). Columns 1 and 2 hold strings and cannot
# be mean-imputed. Imputing only column 0 would leave a NaN in column 3
# untouched, so both numeric columns are handled here.
numeric_indices = [0, 3]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, numeric_indices])
# transform() returns floats, so the integer column 0 is stored back as floats.
X[:, numeric_indices] = imputer.transform(X[:, numeric_indices])

In [26]:
# Show X again after the imputation step to check the result.
print(X)

[[1.0 ' iPhone 12 and iPhone 12 Mini ' 'Apple' 38.0]
 [2.0 ' Galaxy S20, S20+, S20 Ultra ' 'Samsung' 28.0]
 [3.0 ' iPhone SE 2nd generation ' 'Apple' 24.2]
 [4.0 ' Galaxy A21s ' 'Samsung' 19.4]
 [5.0 ' iPhone 12 Pro Max ' 'Apple' 16.8]
 [6.0 ' Galaxy A11 ' 'Samsung' 15.3]
 [7.0 ' Redmi Note 9 Pro ' 'Xiami' 15.0]]


Encoding categorical data

In [27]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the two string columns (1 and 2) and pass the remaining
# columns through unchanged. In the result the one-hot columns come first,
# followed by the passthrough columns.
categorical_indices = [1, 2]
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_indices)],
    remainder='passthrough',
    # Always return a dense array. With the default threshold (0.3) the
    # transformer switches to a SciPy sparse matrix once the stacked output
    # gets sparse enough, and np.array() on a sparse matrix produces a 0-d
    # object array instead of a 2-D feature matrix.
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [28]:
# Show X after encoding: one-hot columns first, then the passthrough columns.
print(X)

[[0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 38.0]
 [0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 2.0 28.0]
 [0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 3.0 24.2]
 [0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 4.0 19.4]
 [0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 5.0 16.8]
 [1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 6.0 15.3]
 [0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 7.0 15.0]]


Splitting the dataset into the Training set and Test set

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [30]:
# Show the training portion of the feature matrix (before scaling).
print(X_train)

[[0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 2.0 28.0]
 [0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 38.0]
 [0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 5.0 16.8]
 [0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 4.0 19.4]
 [1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 6.0 15.3]]


In [31]:
# Show the held-out test portion of the feature matrix (before scaling).
print(X_test)

[[0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 7.0 15.0]
 [0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 3.0 24.2]]


In [32]:
# Show the target values that belong to the training rows.
print(Y_train)

['Yes' 'Yes' 'No' 'Yes' 'Yes']


In [33]:
# Show the target values that belong to the test rows.
print(Y_test)

['Yes' 'No']


Feature Scaling

In [34]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean and standard deviation from the training rows
# only, then apply that same transformation to both splits, so no statistics
# are computed from the test rows.
# NOTE(review): every column is standardized, including the one-hot
# columns - confirm that this is intended.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [35]:
# Show the training features after standardization.
print(X_train)

[[-0.5        -0.5         2.          0.         -0.5        -0.5
   0.         -0.81649658  0.81649658  0.         -0.86266219  0.53074305]
 [-0.5        -0.5        -0.5         0.         -0.5         2.
   0.          1.22474487 -1.22474487  0.         -1.40182605  1.71017204]
 [-0.5        -0.5        -0.5         0.          2.         -0.5
   0.          1.22474487 -1.22474487  0.          0.75482941 -0.79021743]
 [-0.5         2.         -0.5         0.         -0.5        -0.5
   0.         -0.81649658  0.81649658  0.          0.21566555 -0.48356589]
 [ 2.         -0.5        -0.5         0.         -0.5        -0.5
   0.         -0.81649658  0.81649658  0.          1.29399328 -0.96713177]]


In [36]:
# Show the test features, transformed with the scaler fitted on the training rows.
print(X_test)

[[-0.5        -0.5        -0.5         1.         -0.5        -0.5
   0.         -0.81649658 -1.22474487  1.          1.83315714 -1.00251464]
 [-0.5        -0.5        -0.5         0.         -0.5        -0.5
   1.          1.22474487 -1.22474487  0.         -0.32349832  0.08256003]]
