**Marketing & Sales Data Exploration and Prediction**

In [None]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn import linear_model

In [None]:
# Read the advertising/sales CSV into a DataFrame and preview the first rows
csv_path = '../input/dummy-advertising-and-sales-data/Dummy Data HSS.csv'
data = pd.read_csv(csv_path)
data.head()

Data Exploration

In [None]:
# Summary statistics for the columns of the loaded dataset
data.describe()

In [None]:
# Column names, non-null counts and dtypes of the loaded dataset
data.info()

1. Sum Budget of Each Channel

In [None]:
# Pick out the three advertising-channel columns and total each of them.
# NOTE: the name `type` shadows Python's builtin of the same name; it is kept
# because the bar-chart cell that follows reads this variable.
type = data.loc[:, ["TV", "Radio", "Social Media"]]
type.sum()

In [None]:
# Bar chart of the total budget spent on each advertising channel.
# `type` is the channel sub-frame built in the previous cell.
channel_totals = type.sum()
positions = list(range(len(channel_totals)))

# Explicit figure/axes with a title and axis labels so the chart can be read
# on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.bar(positions, channel_totals)
ax.set_xticks(positions)
ax.set_xticklabels(channel_totals.index)
ax.set_xlabel("Channel")
ax.set_ylabel("Total budget")
ax.set_title("Total budget per channel")
plt.show()

2. Correlation Between Each Channel and Sales

In [None]:
# Pairwise correlation between the numeric columns (channels and Sales).
# The string-valued Influencer column is excluded explicitly: pandas >= 2.0
# no longer drops non-numeric columns in DataFrame.corr() and raises instead.
corr = data.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True)
plt.show()

3. Dealing with Categorical Variable

In [None]:
# Compare the distribution of Sales across the Influencer categories
data.boxplot(column='Sales', by='Influencer', figsize=(5, 6))

In [None]:
# Frequency distribution: number of rows per Influencer category
data['Influencer'].value_counts()

Encoding Categorical Data

In [None]:
# One-hot encode the Influencer column. With no explicit prefix, get_dummies
# prefixes the new columns with the source column name (Influencer_<category>).
data_onehot = pd.get_dummies(data, columns=['Influencer'])
data_onehot.head()

Sales Prediction

1. Data Cleaning

In [None]:
# Number of missing values in each column of the encoded frame
data_onehot.isna().sum()

In [None]:
# Remove every row whose Sales target is missing
sales_is_missing = data_onehot['Sales'].isna()
data_onehot = data_onehot.drop(index=data_onehot.index[sales_is_missing])

In [None]:
# Confirm that no missing Sales values remain
data_onehot['Sales'].isna().sum()

In [None]:
# Fill missing channel budgets with each column's median
col = ['TV', 'Radio', 'Social Media']

# Assign the filled columns back to the frame instead of calling
# fillna(inplace=True) on data_onehot[i]: that form operates on a temporary
# column selection, emits a chained-assignment warning in pandas 2.x and
# leaves data_onehot unchanged when Copy-on-Write is enabled.
# median() returns one value per column, and fillna matches them by column name.
data_onehot[col] = data_onehot[col].fillna(data_onehot[col].median())

In [None]:
# Re-check the per-column count of missing values after cleaning
data_onehot.isna().sum()

2. Data Transformation

In [None]:
# Check the distribution of the TV budget.
# histplot(kde=True, stat="density") is the replacement for the deprecated
# sns.distplot (histogram normalised to a density, with a KDE curve).
numeric_val = data_onehot["TV"]
sns.histplot(numeric_val, kde=True, stat="density")

In [None]:
# Distribution of the Radio budget; histplot replaces the deprecated distplot
sns.histplot(data_onehot["Radio"], kde=True, stat="density")

In [None]:
# Distribution of the Social Media budget; histplot replaces the deprecated distplot
sns.histplot(data_onehot["Social Media"], kde=True, stat="density")

In [None]:
# Square-root transform the Social Media budget, then plot the result.
# NOTE: the column is overwritten in place, so running this cell a second time
# applies the square root again; re-run the earlier cells first if needed.
data_onehot['Social Media'] = np.sqrt(data_onehot['Social Media'])
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot.
sns.histplot(data_onehot["Social Media"], kde=True, stat="density")

3. Make Prediction Model

Start building a prediction model by dividing the dataset into training and testing data. We will use Linear and Ridge Regression algorithms for sales prediction.

In [None]:
# Build the feature matrix and target vector, then split into train and test sets.
feature_cols = ['TV', 'Radio', 'Social Media',
                'Influencer_Macro', 'Influencer_Micro', 'Influencer_Mega', 'Influencer_Nano']

# Force a float matrix: recent pandas versions emit boolean dummy columns, and
# mixing them with the float budget columns would otherwise give an object array.
X = np.asarray(data_onehot[feature_cols], dtype=float)
Y = np.asarray(data_onehot['Sales'])

# A fixed random_state makes the shuffle, and so every score reported below,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, shuffle=True, random_state=42
)

In [None]:
# Ordinary least-squares baseline: fit on the training split, evaluate on the test split
lineReg = LinearRegression().fit(X_train, y_train)
lr_predict = lineReg.predict(X_test)

# Compute the metrics first, then report them
lr_score = lineReg.score(X_test, y_test)
lr_mse = mean_squared_error(y_test, lr_predict)

print('Score: ', lr_score)
print('Weights: ', lineReg.coef_)
print("Mean Squarred Error:", lr_mse)

In [None]:
# Ridge regression with alpha = 0.5, fitted on the same training split
reg = linear_model.Ridge(alpha=.5)
reg.fit(X_train, y_train)

# Predict with the ridge model itself. The earlier version predicted with
# lineReg here, so the printed MSE was the linear model's, not the ridge model's.
ridge_predict = reg.predict(X_test)

print('Score: ', reg.score(X_test, y_test))
print('Weights: ', reg.coef_)
print("Mean Squarred Error:", mean_squared_error(y_test, ridge_predict))


4. Test Prediction

In [None]:
# Predict sales for one hypothetical campaign using the linear model.
# The model was trained on the square root of the Social Media budget, so the
# same transform is applied to the sample before predicting.
# NOTE(review): this assumes 80 / 20 / 35 are raw (untransformed) budgets - confirm.
tv_budget, radio_budget, social_media_budget = 80, 20, 35

# Feature order matches training: TV, Radio, Social Media,
# Influencer_Macro, Influencer_Micro, Influencer_Mega, Influencer_Nano
sample = [[tv_budget, radio_budget, np.sqrt(social_media_budget), 1, 0, 0, 0]]
lineReg.predict(sample)