In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the advertising and sales CSV (the path is relative to the notebook's working directory)
df = pd.read_csv('../input/dummy-advertising-and-sales-data/Dummy Data HSS.csv')

In [None]:
# Preview the raw frame
df

In [None]:
# Column dtypes and non-null counts of the raw frame
df.info()

In [None]:
# Work on a copy so the raw frame `df` stays untouched by the cleaning steps
xdf = df.copy()

## Exploratory Data Analysis

In [None]:
# Add a 1-based row identifier derived from the current index
xdf['Id'] = xdf.index + 1

## Check Missing Values

In [None]:
# Number of missing values in each column
xdf.isnull().sum()

In [None]:
# Per-column missing-value summary: absolute count, share of rows (%) and dtype

null_counts = xdf.isnull().sum()
row_count = xdf.shape[0]

total = null_counts.sort_values(ascending = False)
percent = ((null_counts / row_count) * 100).sort_values(ascending = False)
percent = percent.round(3)
types = xdf.dtypes.loc[percent.index]

missing_data = pd.concat(
    [total, percent, types],
    axis = 1,
    keys = ["Total", "Percent", "Type"],
)
missing_data.head(5)

### Imputing Missing Values

'Sales' is the target variable, and we don't want imputed (made-up) values in the target, so rows with a missing 'Sales' value are dropped. 'TV', 'Social Media' and 'Radio' are numerical attributes, so their missing values are imputed with the column median.


In [None]:
# Remove every row whose target ('Sales') is missing, then renumber the index

missing_rows = xdf.loc[xdf['Sales'].isna()].index
xdf = xdf.drop(index = missing_rows).reset_index(drop = True)

In [None]:
# Impute the remaining numeric gaps with each column's median.
# A direct assignment is used instead of `xdf[col].fillna(..., inplace = True)`:
# calling an in-place method on a column selection is chained assignment, which
# pandas warns about and which leaves `xdf` unchanged under copy-on-write.

cols = ['TV','Radio','Social Media']

# fillna with a Series of medians fills each column with its own median
xdf[cols] = xdf[cols].fillna(xdf[cols].median())

In [None]:
# Re-check the per-column missing-value counts after dropping and imputing
xdf.isnull().sum()

## Univariate Analysis (Target Attribute)

In [None]:
# Summary statistics of the target; note this reads the raw frame `df`, not the cleaned `xdf`
df['Sales'].describe()

In [None]:
import seaborn as sns
sns.set_style('darkgrid')
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Histogram with a KDE overlay for the target ('Sales'), drawn from the raw frame

sns.displot(
    data = df,
    x = 'Sales',
    kde = True,
    height = 6,
    aspect = 2,
);

## Univariate Analysis (Independent Attributes)

In [None]:
# Keep only the non-object-dtype columns of the raw frame; later loops iterate over its column names
numerical_df = df.select_dtypes(exclude = 'object')

In [None]:
# One histogram + KDE figure per column of numerical_df (values are read from df)
for column_name in numerical_df.columns:
    sns.displot(data = df, x = column_name, kde = True, height = 6, aspect = 2);

- Social Media is left skewed.

In [None]:
# Scatter of each numerical_df column against the row 'Id', one figure per column

for column_name in numerical_df.columns:
    plt.figure(figsize = (8, 6))
    sns.scatterplot(data = xdf, x = column_name, y = 'Id');

In [None]:
# Box plot of each numerical_df column (cleaned frame), one figure per column

for column_name in numerical_df.columns:
    plt.figure(figsize = (8, 6))
    sns.boxplot(data = xdf, x = column_name);

In [None]:
# Reshape the distribution of 'Social Media' with a square-root transformation.
# NOTE(review): sqrt compresses large values, i.e. it tames a long right tail;
# the notes here describe the column as left skewed - confirm the skew direction.

def sqrt_transform(data):
    """Return the square root of `data` as computed by `np.sqrt`.

    `data` may be a scalar or anything `np.sqrt` accepts (array, Series);
    the result has the corresponding scalar/array-like form.
    """
    root = np.sqrt(data)
    return root



In [None]:
# Replace 'Social Media' with its square root (the helper is applied to the whole column at once)
social_media_sqrt = sqrt_transform(xdf['Social Media'])
xdf['Social Media'] = social_media_sqrt

In [None]:
# Re-plot 'Social Media' after the transformation to inspect its new shape

sns.displot(
    data = xdf,
    x = 'Social Media',
    kde = True,
);

# Multivariate Analysis

Let's check all the numerical variables in terms of 'Sales'

In [None]:
# Scatter of 'Sales' against each numerical_df column, one figure per column

for column_name in numerical_df.columns:
    plt.figure(figsize = (8, 6))
    sns.scatterplot(data = xdf, x = column_name, y = 'Sales');

All of them are highly positively correlated with 'Sales'.

### Analysis of the Categorical Attribute

In [None]:
# Frequency of each 'Influencer' category
xdf['Influencer'].value_counts()

In [None]:
# Distribution of 'Sales' within each 'Influencer' category

plt.figure(figsize = (8, 6))
sns.violinplot(
    data = xdf,
    x = 'Influencer',
    y = 'Sales',
);

Each influencer category has almost equal significance on 'Sales'.

## Dataset Preparation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# Separate the predictors from the target.
# 'Id' is dropped together with 'Sales': it is only the row counter created
# earlier from the index (`xdf.index + 1`), so keeping it would feed an
# arbitrary identifier to the models as if it were a feature.
X = xdf.drop(columns = ['Sales', 'Id'])
y = xdf['Sales']

### Encoding

In [None]:
# Dummy-encode X using pd.get_dummies' default arguments
X = pd.get_dummies(X)

### Scaling Dataset

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column and rebuild a DataFrame with the same column names.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics influence the scaling - consider fitting on the training split only.
scaler = StandardScaler()
feature_names = X.columns

X1 = scaler.fit_transform(X)
X = pd.DataFrame(X1, columns = feature_names)

In [None]:
# Peek at the first rows of the scaled feature frame
X.head()

### Splitting Dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)


# Modeling and Evaluation Metrics

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import math

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least squares on the training split.
# `normalize` is intentionally not passed: the argument was deprecated in
# scikit-learn 1.0 and removed in 1.2 (passing it raises TypeError there), and
# the features were already standardized with StandardScaler before the split.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
# Predict 'Sales' for the held-out test features
lr_predict = lr.predict(X_test)

In [None]:
# Evaluate the linear model on the test split: R2, MSE, RMSE and MAE
yp = lr.predict(X_test)  # same predictions as lr_predict; kept because later cells may read `yp`

lr_r2 = r2_score(y_test, lr_predict)
lr_mse = mean_squared_error(y_test, lr_predict)
lr_rmse = math.sqrt(lr_mse)
lr_mae = mean_absolute_error(y_test, lr_predict)

print("R2 Score:", lr_r2)
print("Mean Squarred Error:", lr_mse)
print("RMSE:", lr_rmse)
print("Mean Absolute Error : " + str(lr_mae))

### XGBoost Regressor

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees: 1000 estimators with a 0.05 learning rate
xgb_params = {'n_estimators': 1000, 'learning_rate': 0.05}
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

# Predictions for the held-out test features
predict = xgb.predict(X_test)

In [None]:
# Evaluate the XGBoost model on the test split: R2, MSE, RMSE and MAE
xgb_r2 = r2_score(y_test, predict)
xgb_mse = mean_squared_error(y_test, predict)
xgb_rmse = math.sqrt(xgb_mse)
xgb_mae = mean_absolute_error(y_test, predict)

print("R2 Score:", xgb_r2)
print("Mean Squarred Error:", xgb_mse)
print("RMSE:", xgb_rmse)
print("Mean Absolute Error : " + str(xgb_mae))