Predict the number of unit sales given an amount of money spent on radio advertising

In [97]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

df = pd.read_csv('Advertising.csv')
x_arr = df["radio"].to_numpy()
y_arr = df["sales"].to_numpy()
x_arr = x_arr.reshape(-1,1)

#print('X', x_arr)
#print('Y', y_arr)

# Linear Regression model
Lin_regression = LinearRegression().fit(x_arr,y_arr)        
print('Score (Coefficient of determination of the prediction): ',Lin_regression.score(x_arr,y_arr))

estimated_sales = Lin_regression.predict([[23]])

print('Coefficient (w): ', Lin_regression.coef_)
print('Intercept (b): ', Lin_regression.intercept_)
print("Predicted Sales:", estimated_sales[0])

Score (Coefficient of determination of the prediction):  0.33203245544529536
Coefficien (w):  [0.20249578]
Intercept (b):  9.311638095158285
Predicted Sales: 13.969041113184396


In the example:
• alpha: 0.001
• epoch: 15000 loss: 18.092398515918592
• w, b: 0.20254581129464883 9.310003218435126

Where w is the slope, in this case the coefficient.

Predict the presence of cancer based on the processed medical image data in the breast cancer dataset

In [137]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
import numpy as np

results = []

cancer = load_breast_cancer()                                   # loading the breast cancer data
#print(cancer.feature_names)

for seed in range(100000, 100010):

    #print('Seed value: ', seed)
    rng = np.random.default_rng(seed)                               # random number generator
    idx_feat = (np.floor(30*rng.uniform(size=4))).astype(int)       # randomly select 4 features
    X = cancer["data"][:,idx_feat]                                  # load 2D array of selected features
    y = cancer["target"]                                            # load classification labels (0 = malignant tumour; 1 = benign tumor)
    Log_regression = LogisticRegression().fit(X, y)                 # Logistic Regression model

    # Model accuracy
    score = Log_regression.score(X, y)
    #print('Feature names: ',X)
    #print('Score: ', score)

    results.append((seed, cancer.feature_names[idx_feat], score))

#print(results)                              # Print to debug code
results.sort(key=lambda score: score[2])    # Sort by score, the score index is 2

# Extract the two best and two worst combinations
two_best = results[-2:]  # Last two entries have highest scores
two_best.sort(reverse=True)
two_worst = results[:2]  # First two entries have lowest scores

print('Two Best:')
for result in two_best:
    seed, features, score = result
    print('Seed: ', seed)
    print('Features: ', features)
    print('Score: ', score) 

print()
print('Two Worst:')
for result in  two_worst:
    seed, features, score = result
    print('Seed: ', seed)
    print('Features: ', features)
    print('Score: ', score) 

Two Best:
Seed:  100003
Features:  ['mean radius' 'worst perimeter' 'concave points error'
 'compactness error']
Score:  0.9402460456942003
Seed:  100002
Features:  ['worst symmetry' 'worst radius' 'symmetry error' 'perimeter error']
Score:  0.9209138840070299

Two Worst:
Seed:  100009
Features:  ['smoothness error' 'concave points error' 'mean compactness'
 'mean smoothness']
Score:  0.6942003514938488
Seed:  100008
Features:  ['worst fractal dimension' 'compactness error' 'radius error'
 'mean fractal dimension']
Score:  0.8031634446397188


Adding more features (8 for example)

In [122]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
import numpy as np

results = []

for seed in range(100000, 100010):

    #print('Seed value: ', seed)
    rng = np.random.default_rng(seed)                               # random number generator
    idx_feat = (np.floor(30*rng.uniform(size=8))).astype(int)      # randomly select 4 features
    cancer = load_breast_cancer()                                   # loading the breast cancer data
    X = cancer["data"][:,idx_feat]                                  # load 
    y = cancer["target"]                                            # load classification labels (0 = malignant tumour: 1 = benign tumor)
    Log_regression = LogisticRegression(max_iter=10000, solver='lbfgs').fit(X, y)   # Logistic Regression model (limited to 10000 iterations for conversion)

    # Model accuracy
    score = Log_regression.score(X, y)
    #print('Feature names: ',X)
    #print('Score: ', score)

    results.append((seed, cancer["feature_names"][idx_feat], score))

#print(results)                              # Print to debug code
results.sort(key=lambda score: score[2])    # Sort by score, the score index is 2

#print(results)
# Extract the two best and two worst combinations
two_best = results[-2:]  # Last two entries have highest scores
two_best.sort(reverse=True)
two_worst = results[:2]  # First two entries have lowest scores

for result in two_best + two_worst:
    seed, features, score = result
    print('Seed: ', seed)
    print('Features: ', features)
    print('Score: ', score) 

Seed:  100006
Features:  ['symmetry error' 'concave points error' 'mean concavity'
 'worst perimeter' 'perimeter error' 'worst area' 'compactness error'
 'mean concavity']
Score:  0.9244288224956063
Seed:  100003
Features:  ['mean radius' 'worst perimeter' 'concave points error'
 'compactness error' 'fractal dimension error' 'concavity error'
 'worst area' 'symmetry error']
Score:  0.9402460456942003
Seed:  100009
Features:  ['smoothness error' 'concave points error' 'mean compactness'
 'mean smoothness' 'mean fractal dimension' 'mean symmetry'
 'mean fractal dimension' 'worst symmetry']
Score:  0.7346221441124781
Seed:  100001
Features:  ['worst compactness' 'worst concavity' 'mean texture' 'mean compactness'
 'mean concavity' 'mean fractal dimension' 'compactness error'
 'texture error']
Score:  0.8453427065026362


Predict diabetes progression

In [145]:
from sklearn.datasets import load_diabetes
from sklearn.tree import DecisionTreeRegressor
import numpy as np

results = []

diabetes = load_diabetes()                                      # loading the diabetes data
#print(diabetes.DESCR)
#print(diabetes.feature_names)

for seed in range(100000, 100010):

    #print('Seed value: ', seed)
    rng = np.random.default_rng(seed)                               # random number generator
    idx_feat = (np.floor(10*rng.uniform(size=4))).astype(int)       # randomly select 4 features, in this dataset there are 10 features in total
    #print(idx_feat)
    X = diabetes["data"][:,idx_feat]                                # load selected features data 
    y = diabetes["target"]                                          # load classification labels (0 = malignant tumour: 1 = benign tumor)
    DTR_algorithm = DecisionTreeRegressor().fit(X, y)               # Logistic Regression model

    # Model accuracy
    score = DTR_algorithm.score(X, y)
    #print('Feature names: ',X)
    #print('Score: ', score)
    
    f_names = []
    for i in idx_feat:
        f_names.append(diabetes.feature_names[i])

    results.append((seed, f_names, score))

#print(results)                              # Print to debug code
results.sort(key=lambda score: score[2])    # Sort by score, the score index is 2

# Extract the two best and two worst combinations
two_best = results[-2:]  # Last two entries have highest scores
two_best.sort(reverse=True)
two_worst = results[:2]  # First two entries have lowest scores


print('Two Best:')
for result in two_best:
    seed, features, score = result
    print('Seed: ', seed)
    print('Features: ', features)
    print('Score: ', score) 

print()
print('Two Worst:')
for result in  two_worst:
    seed, features, score = result
    print('Seed: ', seed)
    print('Features: ', features)
    print('Score: ', score)

Two Best:
Seed:  100008
Features:  ['s6', 's2', 'bp', 'bp']
Score:  1.0
Seed:  100007
Features:  ['bmi', 's6', 'age', 'bmi']
Score:  1.0

Two Worst:
Seed:  100004
Features:  ['age', 's3', 'age', 's3']
Score:  0.9224163008676227
Seed:  100001
Features:  ['s5', 's5', 'age', 'sex']
Score:  0.9920437133142372


SVM

In [142]:
from sklearn.datasets import load_breast_cancer
from sklearn.svm import LinearSVC
import numpy as np

results = []

cancer = load_breast_cancer()                                       # loading the breast cancer data
#print(cancer.feature_names)

for seed in range(100000, 100010):

    #print('Seed value: ', seed)
    rng = np.random.default_rng(seed)                               # random number generator
    idx_feat = (np.floor(30*rng.uniform(size=4))).astype(int)       # randomly select 4 features
    X = cancer["data"][:,idx_feat]                                  # load 2D array of selected features
    y = cancer["target"]                                            # load classification labels (0 = malignant tumour: 1 = benign tumor)
    l_SVC = LinearSVC(max_iter=10000, dual=False, random_state=seed).fit(X, y)                 # Logistic Regression model

    # Model accuracy
    score = l_SVC.score(X, y)

    results.append((seed, cancer.feature_names[idx_feat], score))

#print(results)                              # Print to debug code
results.sort(key=lambda score: score[2])    # Sort by score, the score index is 2

# Extract the two best and two worst combinations
two_best = results[-2:]  # Last two entries have highest scores
two_best.sort(reverse=False)
two_worst = results[:2]  # First two entries have lowest scores

print('Two Best:')
for result in two_best:
    seed, features, score = result
    print('Seed: ', seed)
    print('Features: ', features)
    print('Score: ', score) 

print()
print('Two Worst:')
for result in  two_worst:
    seed, features, score = result
    print('Seed: ', seed)
    print('Features: ', features)
    print('Score: ', score)

Two Best:
Seed:  100002
Features:  ['worst symmetry' 'worst radius' 'symmetry error' 'perimeter error']
Score:  0.9420035149384886
Seed:  100003
Features:  ['mean radius' 'worst perimeter' 'concave points error'
 'compactness error']
Score:  0.9402460456942003

Two Worst:
Seed:  100009
Features:  ['smoothness error' 'concave points error' 'mean compactness'
 'mean smoothness']
Score:  0.7750439367311072
Seed:  100008
Features:  ['worst fractal dimension' 'compactness error' 'radius error'
 'mean fractal dimension']
Score:  0.8084358523725835


Nearest Neighbors

In [153]:
from sklearn.datasets import load_diabetes
from sklearn.neighbors import KNeighborsRegressor
import numpy as np

results = []

diabetes = load_diabetes()                                      # loading the diabetes data
#print(diabetes.feature_names)

for seed in range(100000, 100010):

    rng = np.random.default_rng(seed)                               # random number generator
    idx_feat = (np.floor(10*rng.uniform(size=4))).astype(int)       # randomly select 4 features, in this dataset there are 10 features in total
    X = diabetes["data"][:,idx_feat]                                # load selected features data
    y = diabetes["target"]                                          # load classification labels (0 = malignant tumour: 1 = benign tumor)
    kNN_algorithm = KNeighborsRegressor().fit(X, y)                  # KNeighbors Regression model

    # Model accuracy
    score = kNN_algorithm.score(X, y)
    #print('Feature names: ',X)
    #print('Score: ', score)
    
    f_names = []
    for i in idx_feat:
        f_names.append(diabetes.feature_names[i])

    results.append((seed, f_names, score))

#print(results)                              # Print to debug code
results.sort(key=lambda score: score[2])    # Sort by score, the score index is 2

# Extract the two best and two worst combinations
two_best = results[-2:]  # Last two entries have highest scores
two_best.sort(reverse=True)
two_worst = results[:2]  # First two entries have lowest scores

print('Two Best:')
for result in two_best:
    seed, features, score = result
    print('Seed: ', seed)
    print('Features: ', features)
    print('Score: ', score) 

print()
print('Two Worst:')
for result in  two_worst:
    seed, features, score = result
    print('Seed: ', seed)
    print('Features: ', features)
    print('Score: ', score)

Two Best:
Seed:  100006
Features:  ['s3', 's2', 'bmi', 's4']
Score:  0.5467014178150216
Seed:  100000
Features:  ['sex', 's1', 's6', 's5']
Score:  0.5056316180205515

Two Worst:
Seed:  100009
Features:  ['s1', 's2', 'sex', 'sex']
Score:  0.2890811013860536
Seed:  100004
Features:  ['age', 's3', 'age', 's3']
Score:  0.3114362314974928
