<a href="https://colab.research.google.com/github/seremmartin64-ops/ML/blob/main/Sales_Prediction_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Machine Learning Regression
# Regression: Used when predicting a continuous (numerical) outcome
# E.g. Sales, Temperature, House Prices, Stock Prices.

# Steps follow:
# 1. Read Sales Data From Kaggle
# 2. Data Cleaning
# 3. Split Data into Inputs(X) and Output(Y)
# 4. Splitting Data into Training Sets(70%) and Testing Sets(30%)
# 5. Importing ML Algorithms(DecisionTree, KNeighbour, SVM) and Training
# 6. Evaluation to Determine Accuracy
# 7. Prediction

In [None]:
# STEP ONE: READING THE DATASET
import kagglehub
import pandas as pd
import os

# Fetch the dataset through kagglehub; the returned value is the local
# directory that holds the downloaded files.
path = kagglehub.dataset_download("tawfikelmetwally/advertising-dataset")

# Show where the files are and which files the directory contains.
print(path, os.listdir(path), sep="\n")

# Read the CSV into a DataFrame and display it as the cell output.
file_path = os.path.join(path, "Advertising.csv")
data = pd.read_csv(file_path)
data

Using Colab cache for faster access to the 'advertising-dataset' dataset.
/kaggle/input/advertising-dataset
['Advertising.csv']


Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9
...,...,...,...,...,...
195,196,38.2,3.7,13.8,7.6
196,197,94.2,4.9,8.1,9.7
197,198,177.0,9.3,6.4,12.8
198,199,283.6,42.0,66.2,25.5


In [None]:
# Step2: Data Cleaning
# It ensures that we are working with a proper dataset
# "Garbage in, Garbage Out"

# The dataset above has an unnecessary column called "Unnamed: 0",
# so it is dropped.
# ETL (Extract, Transform, Load) - typical cleaning tasks:
# 1. Remove Missing Records
# 2. Remove Duplicates
# 3. Remove Unwanted Columns
# 4. Remove Outliers
# 5. Transform Data Types (Encoding)
# Only task 3 is applied in this cell.

# Reassign instead of using inplace=True, and ignore an already-missing
# column, so that re-running this cell does not raise KeyError once
# "Unnamed: 0" has been removed.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,4.9,8.1,9.7
197,177.0,9.3,6.4,12.8
198,283.6,42.0,66.2,25.5


In [None]:
# Step3: Split Data into Inputs (Features) and Output (Outcome)
# Supervised Machine Learning: the data has inputs and outputs
# Features - X
# Outcome  - Y

# Select columns by name rather than by position. Positional slices would
# silently pick the wrong columns (the "Unnamed: 0" index column as a
# feature and "Newspaper" as the target) if the cleaning cell had not run.
FEATURE_COLUMNS = ["TV", "Radio", "Newspaper"]
TARGET_COLUMN = "Sales"

# Kept so that any other cell reading `array` still finds it.
array = data.values

# Features -> X (one row per record, one column per advertising channel)
X = data[FEATURE_COLUMNS].to_numpy()

# Outcome -> Y (sales value for each record)
Y = data[TARGET_COLUMN].to_numpy()


In [None]:
# Step4: Hold out part of the records for testing
# test_size=0.3 keeps 30% of the rows for testing and leaves 70% for
# training (140 / 60 rows, assuming the 200 records noted originally).
# random_state is fixed so the same rows land in each set on every run.

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Step5: How do we train ML models?
# What is Cross Validation?
# Scientific algorithms (methods) have been developed to train models from data
# 1. DecisionTree
# 2. KNeighbours
# 3. LinearRegression
# 4. Support Vector Machines
# 5. Random Forest
# 6. GradientBoosting

# ML: Scientific Algorithms: Limitation (Huge Dataset)
# DL: Neural Networks (Handle Huge Datasets)

# Import the Algorithms
# Decision Trees
from sklearn.tree import DecisionTreeRegressor

# Fix random_state: scikit-learn permutes the features at every split, so
# without a seed the fitted tree (and the predictions made from it) can
# differ from one run of the notebook to the next.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, Y_train)

In [None]:
# Step6: Evaluation
# Process to determine the accuracy of a Machine Learning model
# Regression metrics used here: r2_score and mean absolute error

from sklearn.metrics import mean_absolute_error, r2_score

# Score the model on the held-out test split only, i.e. on records it did
# not see during training.
Y_pred = model.predict(X_test)

# R2: 1.0 is a perfect fit; values near 0 (or negative) mean the model is
# no better than always predicting the mean of Y_test.
print("R2 score:", r2_score(Y_test, Y_pred))

# MAE: average absolute gap between predicted and actual sales, expressed
# in the same units as the Sales column.
print("Mean absolute error:", mean_absolute_error(Y_test, Y_pred))


In [None]:
# Step7: Actual Predictions: apply the trained model to predict sales
# Advertising spend per channel, in the same order as the feature columns
# used for training (TV, Radio, Newspaper).
tv_spend, radio_spend, newspaper_spend = 50, 50, 40

# predict() expects a 2-D input: one inner list per record to score.
adverts_expenditure = [[tv_spend, radio_spend, newspaper_spend]]
future_sales = model.predict(adverts_expenditure)
print(future_sales)

[10.4]


In [None]:
# Pairwise Pearson correlation between every pair of columns; the row or
# column for Sales shows how strongly each channel moves with sales.
data.corr(method="pearson")

Unnamed: 0,TV,Radio,Newspaper,Sales
TV,1.0,0.054809,0.056648,0.782224
Radio,0.054809,1.0,0.354104,0.576223
Newspaper,0.056648,0.354104,1.0,0.228299
Sales,0.782224,0.576223,0.228299,1.0
