# Data Preprocessing

### Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Reading the dataset

In [None]:
# Load the raw data from marketing.csv (resolved relative to the working directory) into a DataFrame
dataset = pd.read_csv('marketing.csv')
# A bare name on the last line of a notebook cell displays its value
dataset

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the dataset's columns
dataset.describe()

In [None]:
# Separate Independent variables (X) and dependent variable (y)

# X: the first three columns (positions 0, 1 and 2) as an array
X = dataset.iloc[:, 0:3].values
# y: the fourth column (position 3) as an array
y = dataset.iloc[:, 3].values

# Show both arrays, separated by blank lines
print(X,'\n\n', y)

### Missing values

In [None]:
# Fill each missing value with the mean of its column
# Import Class --> Create Object --> Fit Object to Data --> Transform Data

from sklearn.impute import SimpleImputer

# Imputer that treats NaN as "missing" and substitutes the column mean
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# Learn the column means of columns 1 onward and fill the gaps in one step
# (column 0 is skipped here; it is one-hot encoded later as a categorical)
X[:, 1:] = imputer.fit_transform(X[:, 1:])

print(X)

### Encoding categorical variables

In [None]:
# One-hot encode the categorical independent variable
# Import Class --> Create Object --> Fit Object to Data --> Transform Data

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Create the transformer: column 0 is replaced by dummy variables (the first
# category is dropped so the dummies are not redundant); every other column
# is passed through unchanged
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(drop='first'), [0]),
    ],
    remainder='passthrough',
)

# Fit the transformer to X and replace X with the encoded array
X = np.array(ct.fit_transform(X))

print(X)

In [None]:
# Encode the dependent variable as integer class labels
# Import Class --> Create Object --> Fit Object to Data --> Transform Data

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(y)               # learn the distinct classes present in y
y = le.transform(y)     # replace each class with its integer code

print(y)

### Feature scaling

In [None]:
# Standardise the independent variables so they share a common scale
# Import Class --> Create Object --> Fit Object to Data --> Transform Data

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# NOTE(review): the scaler is fitted on every row, before the train/test split
# below, so test-set statistics influence the scaling. Consider splitting first
# and fitting the scaler on the training rows only.
X = sc.fit(X).transform(X)

print(X)

In [None]:
# Wrap the scaled array in a DataFrame so its summary statistics can be displayed
df = pd.DataFrame(X)
# After standardisation each column's mean should be ~0 and its std ~1
df.describe()

### Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Inspect the training features produced by the split
print(X_train)

# Classification

### Training the Model

In [None]:
# We use logistic regression (a classifier, despite the name).
# Can also use other classifiers such as k-nearest neighbours, support vector,
# decision tree or random forest classification.

# Import Class --> Create Object --> Fit Object to Data --> Predict

from sklearn.linear_model import LogisticRegression   # import class
classifier = LogisticRegression()                     # create object
classifier.fit(X_train, y_train)                      # fit object

### Testing the Model

In [None]:
# Predict the class of every observation in the test set
y_pred = classifier.predict(X_test)

# Signed difference between prediction and truth: 0 means the prediction was correct
err = y_pred - y_test

# Put predictions, true labels and errors side by side for comparison
df = pd.DataFrame(
    {
        'y_pred': y_pred,
        'y_test': y_test,
        'error': err,
    }
)

#pd.set_option('display.max_rows', 80)    # uncomment to display every row
print(df)

In [None]:
# Counting the errors
# err = y_pred - y_test, so +1 marks a false positive (predicted 1, actual 0)
# and -1 marks a false negative (predicted 0, actual 1).
false_positives = list(err).count(1)
false_negatives = list(err).count(-1)
print('False positives: ', false_positives)
print('False negatives: ', false_negatives)

# accuracy = (total - number of errors) x 100 / total
# Both error types are subtracted, and the total is taken from the data
# instead of being hard-coded, so the figure stays right if the test size changes.
total = len(err)
print('Accuracy = ', (total - false_positives - false_negatives)*100/total, '%')

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns are the predicted classes
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

## Graphing

In [None]:
import seaborn as sns
# Draw the confusion matrix as an annotated heatmap.
# fmt='d' prints every cell as a plain integer; the default format ('.2g')
# switches to scientific notation (e.g. 1e+02) once a count reaches 100.
sns.heatmap(cm, annot=True, fmt='d', center = 100)

### Forecasting (Optional)

In [None]:
# Predict if a 28 year old male with 50,000 salary would make the purchase
# What happens if you change the salary to 200,000?

# The row must use the same column layout as the encoded X the scaler was fitted on.
# NOTE(review): presumably [gender dummy, age, salary] with 1 standing for male --
# confirm against the one-hot encoder's output.
Z = sc.transform([[1, 28, 50000]])       # scale data by .transform() method but NOT .fit_transform()

# Classify the scaled observation and show the scaled inputs next to the predicted class
prediction = classifier.predict(Z)
print(Z, prediction)