In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory, then print one per line.
input_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the comma-separated direct-marketing dataset into the DataFrame used by every later cell.
file = pd.read_csv(
    "../input/direct-marketing/DirectMarketing.csv",
    sep=',',
)

In [None]:
# Only the last expression of a cell is displayed, so the shape has to be printed
# explicitly; the first rows are then shown through the rich DataFrame display.
print(file.shape)
file.head()

In [None]:
# With the data loaded, count the missing values in each column.
file.isnull().sum()

In [None]:
# The author's run reported 303 NA's in column History. List how often each
# value of that column occurs (value_counts leaves NaN out by default).
history_column = file['History']
history_column.value_counts()

In [None]:
# Same breakdown as relative frequencies instead of raw counts.
file.loc[:, 'History'].value_counts(normalize=True)

In [None]:
# Show every row that has at least one missing value, to look for anomalies
# and for a sensible way of replacing the NA.
rows_with_na = file.isna().any(axis='columns')
file.loc[rows_with_na]

In [None]:
# Fill the NA in column 'History' with the help of column 'Age': within each Age
# category, find the History value that occurs most often and use it to replace
# the missing entries of that category.

def _fill_with_group_mode(values):
    """Return `values` with NA replaced by the most frequent value of the group.

    A group that contains no non-missing value has no mode; it is returned
    unchanged instead of raising an IndexError.
    """
    modes = values.mode()
    if modes.empty:
        return values
    return values.fillna(modes.iloc[0])

# transform() returns a result aligned to the original row index, so it can be
# assigned straight back to the column. groupby(...).apply() prepends the group
# key to the index on pandas 2.x, which breaks that assignment.
file['History'] = file.groupby('Age', sort=False)['History'].transform(_fill_with_group_mode)

In [None]:
#Univariate Analysis
import matplotlib.pyplot as plt

# (column name, panel title) for each categorical column, in display order.
panels = [
    ('Age', 'Age'),
    ('Gender', 'Gender'),
    ('Married', 'Married'),
    ('OwnHome', 'Own Home'),
    ('Location', 'Location'),
    ('History', 'History'),
]

# One 2x3 grid sized once on the figure, rather than passing figsize through
# the first subplot's plot call.
fig, axes = plt.subplots(2, 3, figsize=(20, 10))
for ax, (column, title) in zip(axes.flat, panels):
    # Relative frequency of each category as a bar chart.
    file[column].value_counts(normalize=True).plot.bar(ax=ax, title=title)
fig.tight_layout()  # keep rotated tick labels from overlapping the row below
plt.show()

In [None]:
import seaborn as sns

# Salary: distribution on the left, box plot on the right.
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay on a
# density scale is the closest replacement.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
sns.histplot(file['Salary'], kde=True, stat='density', ax=ax_hist)
file['Salary'].plot.box(ax=ax_box)
plt.show()


In [None]:
# Children: distribution on the left, box plot on the right.
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay on a
# density scale is the closest replacement.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
sns.histplot(file['Children'], kde=True, stat='density', ax=ax_hist)
file['Children'].plot.box(ax=ax_box)
plt.show()


In [None]:
# Catalogs: distribution on the left, box plot on the right.
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay on a
# density scale is the closest replacement.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
sns.histplot(file['Catalogs'], kde=True, stat='density', ax=ax_hist)
file['Catalogs'].plot.box(ax=ax_box)
plt.show()


In [None]:
# AmountSpent: distribution on the left, box plot on the right.
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay on a
# density scale is the closest replacement.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(16, 5))
sns.histplot(file['AmountSpent'], kde=True, stat='density', ax=ax_hist)
file['AmountSpent'].plot.box(ax=ax_box)
plt.show()


In [None]:
#Bivariate Analysis
#Age

# Total amount spent for every Age / History combination.
# The aggregation is named by string: passing np.sum raises a FutureWarning on
# recent pandas releases.
pd.pivot_table(file, index = ['Age','History'], values = 'AmountSpent', aggfunc='sum')

In [None]:
# Average amount spent and number of rows for every Age / History combination.
# 'mean' is given by name because passing np.mean raises a FutureWarning on recent pandas.
pd.pivot_table(file, index = ['Age','History'], values = 'AmountSpent', aggfunc=['mean',len])

In [None]:
#Gender

#Author's observations: there is not much difference in the average amount spent by gender.
#However, on further analysis, the number of customers with Low History is higher among
#females than males, and High History is higher among males than females.

# Total amount spent for every Gender / History combination ('sum' by name: np.sum
# raises a FutureWarning on recent pandas).
pd.pivot_table(file, index = ['Gender', 'History'], values = 'AmountSpent', aggfunc='sum')

In [None]:
# Average amount spent and number of rows for every Gender / History combination.
# 'mean' is given by name because passing np.mean raises a FutureWarning on recent pandas.
pd.pivot_table(file, index = ['Gender', 'History'], values = 'AmountSpent', aggfunc=['mean',len])

In [None]:
#Own home
#Author's observations:
#1.Customers who own a home are more numerous in category High and their average spending is high too.
#2.Average spending is at the same level for the Low and Medium categories.

# Total amount spent for every OwnHome / History combination ('sum' by name: np.sum
# raises a FutureWarning on recent pandas).
pd.pivot_table(file, index = ['OwnHome', 'History'], values = 'AmountSpent', aggfunc='sum')

In [None]:
# Average amount spent and number of rows for every OwnHome / History combination.
# 'mean' is given by name because passing np.mean raises a FutureWarning on recent pandas.
pd.pivot_table(file, index = ['OwnHome', 'History'], values = 'AmountSpent', aggfunc=['mean',len])

In [None]:
#Married
#Author's observations:
#1.Married customers are more numerous in category High.
#2.Single customers are more numerous in category Low.

# Total amount spent for every Married / History combination ('sum' by name: np.sum
# raises a FutureWarning on recent pandas).
pd.pivot_table(file, index = ['Married', 'History'], values = 'AmountSpent', aggfunc='sum')

In [None]:
# Average amount spent and number of rows for every Married / History combination.
# 'mean' is given by name because passing np.mean raises a FutureWarning on recent pandas.
pd.pivot_table(file, index = ['Married', 'History'], values = 'AmountSpent', aggfunc=['mean',len])

In [None]:
#Location
#Author's observations:
#1.Customers in close-by locations are more numerous.
#2.Average spending is higher for customers in far locations.

# Total amount spent for every Location / History combination ('sum' by name: np.sum
# raises a FutureWarning on recent pandas).
pd.pivot_table(file, index = ['Location', 'History'], values = 'AmountSpent', aggfunc='sum')

In [None]:
# Average amount spent and number of rows for every Location / History combination.
# 'mean' is given by name because passing np.mean raises a FutureWarning on recent pandas.
pd.pivot_table(file, index = ['Location', 'History'], values = 'AmountSpent', aggfunc=['mean',len])

In [None]:
#Preprocessing data

# Replace every categorical column with integer codes produced by LabelEncoder.
# NOTE(review): LabelEncoder numbers classes in sorted label order, so ordered
# categories may not keep their natural ranking - confirm this is acceptable.

from sklearn.preprocessing import LabelEncoder

cat_cols = ['Age','Gender', 'OwnHome','Married','Location','History']

encoder = LabelEncoder()
for column_name in cat_cols:
    # fit_transform refits the encoder, so each column gets its own mapping.
    file[column_name] = encoder.fit_transform(file[column_name])

In [None]:
# Grid of pairwise relationships between the columns of `file`.
sns.pairplot(data=file)

In [None]:
# Correlation of AmountSpent with the other fields, as noted by the author
# from the heatmap below:
#
#   1. Salary   =  0.7
#   2. Catalogs =  0.5
#   3. History  = -0.5
#   4. Married  = -0.5
#   5. Own Home = -0.4
#   6. Age      = -0.4

# Pairwise correlation matrix of all columns.
t = file.corr()

# Annotated heatmap, values rounded to one decimal.
sns.heatmap(data=t, annot=True, fmt=".1f", cmap='coolwarm')



In [None]:
#Split the data into train and test

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Features are every column except the target 'AmountSpent'.
features = file.drop('AmountSpent', axis=1)
y = file['AmountSpent']

# Split BEFORE scaling so the held-out rows never influence the scaler's mean
# and variance. The fixed seed keeps the split, and every RMSE computed from
# it, reproducible across runs.
train_features, test_features, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=111
)

# Learn the scaling from the training rows only, then apply it to all rows.
scaler = StandardScaler().fit(train_features)
scale = scaler.transform(features)
X = pd.DataFrame(scale, columns=features.columns, index=features.index)

# Pick the scaled rows that belong to each side of the split.
X_train = X.loc[train_features.index]
X_test = X.loc[test_features.index]

def model_details(model,name,X_train = X_train,y_train = y_train,X_test = X_test,y_test = y_test):
    """Fit `model` and report its test-set RMSE as a one-row table.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is (re)fitted in place on the training data.
    name : label written into the 'Model' column.
    X_train, y_train : training features and target.
    X_test, y_test : held-out features and target used for the RMSE.
        The four data defaults are bound to the module-level split that
        exists when this function is defined.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns 'Model' and 'RMSE'.
    """
    model.fit(X_train, y_train)
    # RMSE is the square root of the mean squared error on the test rows.
    rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
    return pd.DataFrame({'Model': [name], 'RMSE': [rmse]})

In [None]:
### Model Building

from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline; fit() returns the estimator itself, so the
# model can be created and trained in one expression.
model_LR = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows.
model_LR_predict = model_LR.predict(X_test)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial regression: expand the features, then fit a linear model.
poly_reg = PolynomialFeatures(degree = 2)

X_train_poly_2 = poly_reg.fit_transform(X_train)
# Reuse the transformer fitted on the training data; a transformer must never
# be re-fitted on the test set.
X_test_poly_2 = poly_reg.transform(X_test)
model_PR_2 = LinearRegression()
model_PR_2.fit(X_train_poly_2,y_train)

# Predictions for the held-out rows.
model_PR_2_predict = model_PR_2.predict(X_test_poly_2)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-3 polynomial regression: expand the features, then fit a linear model.
# Note that this rebinds `poly_reg` to the degree-3 transformer.
poly_reg = PolynomialFeatures(degree = 3)

X_train_poly_3 = poly_reg.fit_transform(X_train)
# Reuse the transformer fitted on the training data; a transformer must never
# be re-fitted on the test set.
X_test_poly_3 = poly_reg.transform(X_test)
model_PR_3 = LinearRegression()
model_PR_3.fit(X_train_poly_3,y_train)

# Predictions for the held-out rows.
model_PR_3_predict = model_PR_3.predict(X_test_poly_3)

In [None]:
from sklearn.svm import SVR

# Support-vector regression with an RBF kernel; fit() returns the estimator
# itself, so the model can be created and trained in one expression.
model_SVR = SVR(kernel = 'rbf').fit(X_train, y_train)

# Predictions for the held-out rows.
model_SVR_predict = model_SVR.predict(X_test)

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor

# Fixed seed so the fitted tree, its feature importances and its RMSE are the
# same on every run (same seed as the random forest cell).
model_DT = DecisionTreeRegressor(random_state = 111)
model_DT.fit(X_train, y_train)

# Predictions for the held-out rows.
model_DT_predict = model_DT.predict(X_test)

# Importance the fitted tree assigns to each feature column.
importance = model_DT.feature_importances_
feat_imp = pd.DataFrame({'Columns': X.columns,
                         'Imp' : importance})
feat_imp

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 100 trees with a fixed seed; fit() returns the estimator itself,
# so the model can be created and trained in one expression.
model_RF = RandomForestRegressor(n_estimators = 100, random_state = 111).fit(X_train, y_train)

# Predictions for the held-out rows.
model_RF_predict = model_RF.predict(X_test)

In [None]:
# Score every model with model_details, which refits it and returns a one-row
# table holding its name and test RMSE. The polynomial models are given their
# expanded feature matrices; the others use the function's default split.
model_1 = model_details(model_LR, 'Linear Regression')
model_2 = model_details(model_PR_2, 'Polynomial Regression (2)',X_train = X_train_poly_2, X_test = X_test_poly_2)
model_3 = model_details(model_PR_3, 'Polynomial Regression (3)',X_train = X_train_poly_3, X_test = X_test_poly_3)
model_4 = model_details(model_SVR, 'SVR')
model_5 = model_details(model_DT, 'Decision Tree')
model_6 = model_details(model_RF, 'Random Forest')

# Each one-row table carries index 0; ignore_index renumbers the rows 0..5 so
# the comparison table has unique row labels.
final_table = pd.concat(
    [model_1, model_2, model_3, model_4, model_5, model_6],
    axis=0,
    ignore_index=True,
)
final_table