## Import Libraries

In [1]:
## Import Libraries
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Load data

In [12]:
## Read "UniversalBank.csv" using pandas
cust = pd.read_csv("CustomerData.csv")

In [13]:
## Print the first 7 rows
cust.head(7)

Unnamed: 0,CustomerID,City,NoOfChildren,MinAgeOfChild,MaxAgeOfChild,Tenure,FrquncyOfPurchase,NoOfUnitsPurchased,FrequencyOFPlay,NoOfGamesPlayed,NoOfGamesBought,FavoriteChannelOfTransaction,FavoriteGame,Revenue
0,1001.0,1.0,2.0,3.0,8,210,11,11,2344.0,108.0,10.0,Uniform,Uniform,107.51
1,1002.0,1.0,2.0,3.0,6,442,20,20,245.0,22.0,7.0,Favorite,Uniform,382.4
2,1003.0,1.0,4.0,3.0,5,424,18,18,1059.0,130.0,18.0,Favorite,Uniform,135.01
3,1004.0,1.0,1.0,6.0,6,261,11,9,365.0,34.0,11.0,Favorite,Uniform,125.0
4,1005.0,1.0,3.0,6.0,9,422,44,31,1066.0,102.0,44.0,Uniform,Uniform,335.05
5,1006.0,1.0,2.0,3.0,4,378,16,16,228.0,12.0,16.0,Favorite,Favorite,150.0
6,1007.0,1.0,3.0,8.0,12,369,25,15,75.0,2.0,25.0,Favorite,Favorite,127.5


In [14]:
## Check the datatype of each variable
cust.dtypes

CustomerID                      float64
City                            float64
NoOfChildren                    float64
MinAgeOfChild                   float64
MaxAgeOfChild                     int64
Tenure                            int64
FrquncyOfPurchase                 int64
NoOfUnitsPurchased                int64
FrequencyOFPlay                 float64
NoOfGamesPlayed                 float64
NoOfGamesBought                 float64
FavoriteChannelOfTransaction     object
FavoriteGame                     object
Revenue                         float64
dtype: object

## Pre-Processing

In [15]:
## Drop columns which are not significant
cust.drop(["CustomerID"],axis=1,inplace=True)

In [16]:
cust.head()

Unnamed: 0,City,NoOfChildren,MinAgeOfChild,MaxAgeOfChild,Tenure,FrquncyOfPurchase,NoOfUnitsPurchased,FrequencyOFPlay,NoOfGamesPlayed,NoOfGamesBought,FavoriteChannelOfTransaction,FavoriteGame,Revenue
0,1.0,2.0,3.0,8,210,11,11,2344.0,108.0,10.0,Uniform,Uniform,107.51
1,1.0,2.0,3.0,6,442,20,20,245.0,22.0,7.0,Favorite,Uniform,382.4
2,1.0,4.0,3.0,5,424,18,18,1059.0,130.0,18.0,Favorite,Uniform,135.01
3,1.0,1.0,6.0,6,261,11,9,365.0,34.0,11.0,Favorite,Uniform,125.0
4,1.0,3.0,6.0,9,422,44,31,1066.0,102.0,44.0,Uniform,Uniform,335.05


In [17]:
## Convert Categorical Columns to Dummies
cat_cols = ["City","FavoriteChannelOfTransaction","FavoriteGame"]
cust = pd.get_dummies(cust,columns=cat_cols,drop_first=True,)

In [45]:
## Split the data into X and y
X = cust.copy().drop("Revenue",axis=1)
y = cust["Revenue"]

In [46]:
## Split the data into X_train, X_test, y_train, y_test with test_size = 0.20 using sklearn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [47]:
## Print the shape of X_train, X_test, y_train, y_test
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2567, 13)
(642, 13)
(2567,)
(642,)


In [48]:
## check for null values
X_train.isnull().sum()

NoOfChildren                            1
MinAgeOfChild                           2
MaxAgeOfChild                           0
Tenure                                  0
FrquncyOfPurchase                       0
NoOfUnitsPurchased                      0
FrequencyOFPlay                         2
NoOfGamesPlayed                         4
NoOfGamesBought                         2
City_2.0                                0
FavoriteChannelOfTransaction_Uniform    0
FavoriteGame_NONE                       0
FavoriteGame_Uniform                    0
dtype: int64

In [49]:
X_test.isnull().sum()

NoOfChildren                            0
MinAgeOfChild                           0
MaxAgeOfChild                           0
Tenure                                  0
FrquncyOfPurchase                       0
NoOfUnitsPurchased                      0
FrequencyOFPlay                         0
NoOfGamesPlayed                         0
NoOfGamesBought                         1
City_2.0                                0
FavoriteChannelOfTransaction_Uniform    0
FavoriteGame_NONE                       0
FavoriteGame_Uniform                    0
dtype: int64

In [52]:
from sklearn.preprocessing import Imputer
imputer = Imputer()
imputer.fit(X=X_train)
X_train.iloc[:,:] = imputer.transform(X=X_train.iloc[:,:])
X_test.iloc[:,:] = imputer.transform(X=X_test.iloc[:,:])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [53]:
## Scale the numeric attributes
scaler = StandardScaler()
scaler.fit(X_train.iloc[:,:9])

X_train.iloc[:,:9] = scaler.transform(X_train.iloc[:,:9])
X_test.iloc[:,:9] = scaler.transform(X_test.iloc[:,:9])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Model Building

In [54]:
## Build a SVM Regressor
from sklearn.svm import SVR

## Create an SVR object and print it to see the default arguments
svr = SVR()
svr

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [55]:
## Create a new SVR object with name "svr_c10_rbf" with C = 10 and kernel = "rbf"

svr_c10_rbf = SVR(C=10,kernel='rbf')
svr_c10_rbf

SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [56]:
## Fit the model svr_c10_rbf on the train data (X_train,y_train)
svr_c10_rbf.fit(X = X_train,y = y_train)



SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [57]:
## Predict on test data and store it in the variable y_pred
y_pred = svr_c10_rbf.predict(X_test)

In [59]:
## Evalutaion
from sklearn.metrics import mean_squared_error

In [60]:
## Print mse using y_test and y_pred
mean_squared_error(y_test,y_pred)

2733.6916054899816

## Parameter Tuning

In [61]:
## Use Grid Search for parameter tuning

from sklearn.model_selection import GridSearchCV

svr_grid = SVR()
 

param_grid = {

'C': [0.001, 0.01, 0.1, 1, 10],
'gamma': [0.001, 0.01, 0.1, 1], 
'kernel':['linear', 'poly', 'rbf', 'sigmoid']}

 
svr_cv_grid = GridSearchCV(estimator = svr_grid, param_grid = param_grid, cv = 10)

In [None]:
## Fit the grid search model
svr_cv_grid.fit(X = X_train, y = y_train)

In [25]:
## Print best score and parameters
print(svr_cv_grid.best_score_,svr_cv_grid.best_params_)

0.98525 {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
