# Data Preprocessing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing Dataset

In [2]:
data_set = pd.read_csv('data/zomato_reviews.csv')
X = data_set.iloc[:, :-1].values
y = data_set.iloc[:, -1].values

In [3]:
print(X)

[[   0    5]
 [   1    5]
 [   2    4]
 ...
 [5476    5]
 [5477    1]
 [5478    1]]


In [4]:
print(y)

['nice'
 'best biryani , so supportive staff of outlet , personalize my order on call as I say. full Paisa vasool '
 'delivery boy was very decent and supportive.👌👍' ...
 'took for an hour to prepare 3 khawsa, which in real life gets prepared in 5 mins, because its just a morning snack soup. '
 'very very late, littrally did time pass and items not proper'
 'Taste was stale and they give only 5 pieces in 50 Rs. Please do not cheat the online customers.']





## Handling Missing Data

In [5]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')

imputer.fit(X[:, 0:1])
X[: , 0:1] = imputer.transform(X[:, 0:1])

In [6]:
print(X)

[[   0    5]
 [   1    5]
 [   2    4]
 ...
 [5476    5]
 [5477    1]
 [5478    1]]


In [7]:
print(X)

[[   0    5]
 [   1    5]
 [   2    4]
 ...
 [5476    5]
 [5477    1]
 [5478    1]]


## Encoding Categorical Data

### Encoding independent variables

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [9]:
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(), [1])], remainder= 'passthrough')
X = np.array(ct.fit_transform(X))

In [10]:
print(X)

[[0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 1.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 2.000e+00]
 ...
 [0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 5.476e+03]
 [1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 5.477e+03]
 [1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 5.478e+03]]


### Encoding dependent variables

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [12]:
print(y)

[2802 1919 2084 ... 3783 4010 1294]


## Splitting data into Test set & Training Set


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [14]:
print(X_train)

[[0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 2.572e+03]
 [0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 2.078e+03]
 [0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 1.034e+03]
 ...
 [0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 5.226e+03]
 [0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 5.390e+03]
 [1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 8.600e+02]]


In [15]:
print(X_test)

[[1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 3.166e+03]
 [0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 3.400e+03]
 [0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 8.000e+00]
 ...
 [0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 1.438e+03]
 [0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 5.261e+03]
 [0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 4.599e+03]]


In [16]:
print(y_train)

[2434  532 2091 ... 4006  439 2802]


In [17]:
print(y_test)

[ 314 2147 3536 ... 4099 2283   50]


## Feature Scaling

In [18]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, 6:] = sc.fit_transform(X_train[:, 6:])
X_test[:, 6:] = sc.fit_transform(X_test[:, 6:])

ValueError: Found array with 0 feature(s) (shape=(4383, 0)) while a minimum of 1 is required by StandardScaler.

In [63]:
print(X_train)

[[1.0 0.0 0.0 86.0 'yes' 'yes' -0.9941858494300262]
 [0.0 1.0 0.0 35.0 'no' 'yes' 0.9574354145081888]
 [1.0 0.0 0.0 75.0 'no' 'yes' -0.9012515035282065]
 [1.0 0.0 0.0 42.0 'no' 'no' 0.9574354145081888]
 [1.0 0.0 0.0 74.0 'yes' 'yes' -0.6224484658227473]
 [1.0 0.0 0.0 62.0 'yes' 'yes' -1.2729888871354855]
 [0.0 1.0 0.0 24.0 'no' 'no' 0.9574354145081888]
 [0.0 1.0 0.0 48.0 'no' 'yes' 1.0503697604100086]
 [1.0 0.0 0.0 58.0 'no' 'no' -1.0871201953318461]
 [0.0 1.0 0.0 45.130434782608695 'yes' 'no' 0.7715667227045493]
 [0.0 1.0 0.0 15.0 'no' 'no' 0.9574354145081888]
 [0.0 0.0 1.0 70.0 'no' 'yes' 0.5856980309009098]
 [1.0 0.0 0.0 52.0 'yes' 'yes' -0.1577767363136484]
 [1.0 0.0 0.0 60.0 'yes' 'yes' -1.2729888871354855]
 [1.0 0.0 0.0 54.0 'yes' 'yes' -1.0871201953318461]
 [0.0 0.0 1.0 29.0 'no' 'no' 1.0503697604100086]
 [1.0 0.0 0.0 78.0 'yes' 'yes' -0.9941858494300262]
 [1.0 0.0 0.0 36.0 'yes' 'no' 0.7715667227045493]
 [0.0 1.0 0.0 30.0 'yes' 'no' 0.3998293390972702]
 [1.0 0.0 0.0 50.0 'yes' 