# Assignment: Marketing - Classifier

In [188]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [189]:
pd.options.mode.chained_assignment = None
# default='warn'

## Loading and Splitting the Data

In [190]:
# Load data
data = pd.read_csv("/content/Raisin_Dataset.csv")

# Define split ratio
train_size = 0.8

# Randomly shuffle data
data = data.sample(frac=1)

# Split data
split_index = int(len(data) * train_size)
train_data = data[:split_index]
test_data = data[split_index:]

# Save data to separate CSV files
train_data.to_csv("training_data.csv", index=False)
test_data.to_csv("testing_data.csv", index=False)

print("Data split completed! Training and testing data saved.")


Data split completed! Training and testing data saved.


## Training

In [191]:
# Download training data
df_training = pd.read_csv("/content/training_data.csv")
df_training

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,67883,383.189018,233.356673,0.793181,70360,0.697810,1030.869,0
1,52693,283.504239,242.113954,0.520265,54860,0.737749,895.745,0
2,83693,437.734856,245.424263,0.828040,87230,0.702860,1177.858,1
3,141559,594.114431,306.310002,0.856845,146674,0.764490,1530.598,1
4,84677,401.668142,269.668447,0.741121,86777,0.751029,1118.999,0
...,...,...,...,...,...,...,...,...
715,25387,225.629541,144.618672,0.767577,26139,0.678144,619.074,0
716,40861,249.740227,213.573272,0.518328,43096,0.743089,784.912,0
717,117301,507.240441,296.829882,0.810899,122361,0.618402,1407.401,1
718,66265,424.559435,203.381292,0.877793,70487,0.628015,1126.765,1


In [192]:
# Make sure data types are correct (no objects)
df_training.dtypes

Area                 int64
MajorAxisLength    float64
MinorAxisLength    float64
Eccentricity       float64
ConvexArea           int64
Extent             float64
Perimeter          float64
Class                int64
dtype: object

In [193]:
# Pop label
target = df_training.pop('Class')

In [194]:
# Define features
numeric_feature_names = ['Area', 'MajorAxisLength', 'Eccentricity' , 'ConvexArea', 'Extent' , 'Perimeter']
numeric_features = df_training[numeric_feature_names]
numeric_features.head()

Unnamed: 0,Area,MajorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter
0,67883,383.189018,0.793181,70360,0.69781,1030.869
1,52693,283.504239,0.520265,54860,0.737749,895.745
2,83693,437.734856,0.82804,87230,0.70286,1177.858
3,141559,594.114431,0.856845,146674,0.76449,1530.598
4,84677,401.668142,0.741121,86777,0.751029,1118.999


In [195]:
# Convert data to Tensorflow tensor
numeric_features = tf.convert_to_tensor(numeric_features)
numeric_features

<tf.Tensor: shape=(720, 6), dtype=float64, numpy=
array([[6.78830000e+04, 3.83189018e+02, 7.93181118e-01, 7.03600000e+04,
        6.97810444e-01, 1.03086900e+03],
       [5.26930000e+04, 2.83504239e+02, 5.20265279e-01, 5.48600000e+04,
        7.37749216e-01, 8.95745000e+02],
       [8.36930000e+04, 4.37734856e+02, 8.28040175e-01, 8.72300000e+04,
        7.02859542e-01, 1.17785800e+03],
       ...,
       [1.17301000e+05, 5.07240441e+02, 8.10899272e-01, 1.22361000e+05,
        6.18402185e-01, 1.40740100e+03],
       [6.62650000e+04, 4.24559435e+02, 8.77792635e-01, 7.04870000e+04,
        6.28014974e-01, 1.12676500e+03],
       [4.91750000e+04, 2.81431421e+02, 6.03110639e-01, 5.06720000e+04,
        7.27613045e-01, 8.45128000e+02]])>

In [196]:
# Normalize data
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(numeric_features)



In [197]:
# Define the model
def get_basic_model():
  model = tf.keras.Sequential([
    normalizer,
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(3) # Possible labels 0, 1 or 2
  ])

  model.compile(optimizer='adam',
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                # loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                metrics=['accuracy'])
  return model

In [200]:
# Train model
BATCH_SIZE = 128
model = get_basic_model()
model.fit(numeric_features, target, epochs=3000, batch_size=BATCH_SIZE)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 502/3000
Epoch 503/3000
Epoch 504/3000
Epoch 505/3000
Epoch 506/3000
Epoch 507/3000
Epoch 508/3000
Epoch 509/3000
Epoch 510/3000
Epoch 511/3000
Epoch 512/3000
Epoch 513/3000
Epoch 514/3000
Epoch 515/3000
Epoch 516/3000
Epoch 517/3000
Epoch 518/3000
Epoch 519/3000
Epoch 520/3000
Epoch 521/3000
Epoch 522/3000
Epoch 523/3000
Epoch 524/3000
Epoch 525/3000
Epoch 526/3000
Epoch 527/3000
Epoch 528/3000
Epoch 529/3000
Epoch 530/3000
Epoch 531/3000
Epoch 532/3000
Epoch 533/3000
Epoch 534/3000
Epoch 535/3000
Epoch 536/3000
Epoch 537/3000
Epoch 538/3000
Epoch 539/3000
Epoch 540/3000
Epoch 541/3000
Epoch 542/3000
Epoch 543/3000
Epoch 544/3000
Epoch 545/3000
Epoch 546/3000
Epoch 547/3000
Epoch 548/3000
Epoch 549/3000
Epoch 550/3000
Epoch 551/3000
Epoch 552/3000
Epoch 553/3000
Epoch 554/3000
Epoch 555/3000
Epoch 556/3000
Epoch 557/3000
Epoch 558/3000
Epoch 559/3000
Epoch 560/3000
Epoch 561/3000
Epoch 562/3000
Epoch 563/3000
Epoch

<keras.src.callbacks.History at 0x7ef15b77e290>

## Testing

In [201]:
# Download test data
df_test = pd.read_csv("/content/testing_data.csv")
df_test.head()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,130388,511.897583,326.312449,0.770486,132727,0.719184,1393.974,1
1,87429,408.926028,273.887942,0.742565,89063,0.703575,1120.019,0
2,75368,387.852316,250.584522,0.763268,77706,0.735097,1078.409,1
3,56477,334.588138,219.18562,0.75555,58408,0.685667,941.429,0
4,45598,259.896265,225.65622,0.496118,47266,0.741286,813.276,0


In [202]:
# Pop label
test_target = df_test.pop('Class')

In [203]:
# Define test features
test_numeric_feature_names = ['Area', 'MajorAxisLength', 'Eccentricity' , 'ConvexArea', 'Extent' , 'Perimeter']
test_numeric_features = df_test[test_numeric_feature_names]
test_numeric_features.head()

Unnamed: 0,Area,MajorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter
0,130388,511.897583,0.770486,132727,0.719184,1393.974
1,87429,408.926028,0.742565,89063,0.703575,1120.019
2,75368,387.852316,0.763268,77706,0.735097,1078.409
3,56477,334.588138,0.75555,58408,0.685667,941.429
4,45598,259.896265,0.496118,47266,0.741286,813.276


In [204]:
# Convert test data to Tensorflow tensor
test_numeric_features = tf.convert_to_tensor(test_numeric_features)
test_numeric_features

<tf.Tensor: shape=(180, 6), dtype=float64, numpy=
array([[1.30388000e+05, 5.11897583e+02, 7.70486336e-01, 1.32727000e+05,
        7.19183673e-01, 1.39397400e+03],
       [8.74290000e+04, 4.08926028e+02, 7.42565182e-01, 8.90630000e+04,
        7.03574648e-01, 1.12001900e+03],
       [7.53680000e+04, 3.87852316e+02, 7.63267740e-01, 7.77060000e+04,
        7.35096754e-01, 1.07840900e+03],
       ...,
       [1.16361000e+05, 5.54490120e+02, 8.66210843e-01, 1.25132000e+05,
        7.47413046e-01, 1.43438900e+03],
       [6.60940000e+04, 3.23187716e+02, 5.89707296e-01, 6.70320000e+04,
        7.38719809e-01, 9.44713000e+02],
       [9.01910000e+04, 4.31366966e+02, 7.74727300e-01, 9.37190000e+04,
        6.25024255e-01, 1.17937400e+03]])>

In [205]:
# Normalize data
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(test_numeric_features)

In [206]:
# Evaluate test data based on training data
score = model.evaluate(test_numeric_features, test_target, verbose=1)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')

Test loss: 0.47966432571411133 / Test accuracy: 0.8833333253860474


In [207]:
# Save model (optional)
model.save('raisin_model')

## Prediction

In [208]:
# Download cases to be predicted
df_predict = pd.read_csv("/content/predict.csv")
df_predict

Unnamed: 0,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter
0,442.246011,253.291155,0.819738,90546,0.758651,1184.04
1,356.567808,253.254696,0.703943,71879,0.700929,1041.27
2,454.437216,236.964252,0.853285,82555,0.578256,1175.034
3,286.540559,208.760042,0.684989,47336,0.699599,844.162
4,352.19077,290.827533,0.564011,81463,0.792772,1073.251
5,318.125407,200.12212,0.777351,51368,0.658456,881.836
6,310.146072,176.131449,0.823099,43904,0.665894,823.796
7,332.455472,235.429835,0.706058,62329,0.743598,933.366
8,494.16296,244.803889,0.86867,97707,0.702386,1269.68
9,469.774755,238.539384,0.86149,92673,0.681044,1226.892


In [209]:
# Make sure data types are correct (no objects)
df_predict.dtypes

MajorAxisLength    float64
MinorAxisLength    float64
Eccentricity       float64
ConvexArea           int64
Extent             float64
Perimeter          float64
dtype: object

In [210]:
# Convert data to Tensorflow tensor
predict_numeric_features = tf.convert_to_tensor(df_predict)
predict_numeric_features

<tf.Tensor: shape=(13, 6), dtype=float64, numpy=
array([[4.42246011e+02, 2.53291155e+02, 8.19738392e-01, 9.05460000e+04,
        7.58650579e-01, 1.18404000e+03],
       [3.56567808e+02, 2.53254696e+02, 7.03942708e-01, 7.18790000e+04,
        7.00929152e-01, 1.04127000e+03],
       [4.54437216e+02, 2.36964252e+02, 8.53284574e-01, 8.25550000e+04,
        5.78255972e-01, 1.17503400e+03],
       [2.86540559e+02, 2.08760042e+02, 6.84989217e-01, 4.73360000e+04,
        6.99599385e-01, 8.44162000e+02],
       [3.52190770e+02, 2.90827533e+02, 5.64011330e-01, 8.14630000e+04,
        7.92771926e-01, 1.07325100e+03],
       [3.18125407e+02, 2.00122120e+02, 7.77351277e-01, 5.13680000e+04,
        6.58456354e-01, 8.81836000e+02],
       [3.10146072e+02, 1.76131449e+02, 8.23098681e-01, 4.39040000e+04,
        6.65893562e-01, 8.23796000e+02],
       [3.32455472e+02, 2.35429835e+02, 7.06057518e-01, 6.23290000e+04,
        7.43598190e-01, 9.33366000e+02],
       [4.94162960e+02, 2.44803889e+02, 8.68670

In [211]:
# Normalize
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(predict_numeric_features)

In [212]:
# Predict labels
class_names = ['Kecimen', 'Besni',]
predictions = model(predict_numeric_features, training=False)
# Create new columns in dataframe
df_predict['label'] = None
df_predict['certainty'] = None

for i, logits in enumerate(predictions):
  class_idx = tf.argmax(logits).numpy()
  p = tf.nn.softmax(logits)[class_idx]
  name = class_names[class_idx]
  print("Example {} prediction: {} ({:4.1f}%)".format(i, name, 100*p))

  # Save predictions to dataframe
  df_predict["label"].iloc[i] = name
  df_predict['certainty'].iloc[i] = format(p)

Example 0 prediction: Besni (100.0%)
Example 1 prediction: Besni (100.0%)
Example 2 prediction: Besni (100.0%)
Example 3 prediction: Besni (99.0%)
Example 4 prediction: Besni (100.0%)
Example 5 prediction: Besni (52.3%)
Example 6 prediction: Kecimen (92.6%)
Example 7 prediction: Besni (100.0%)
Example 8 prediction: Besni (100.0%)
Example 9 prediction: Besni (99.9%)
Example 10 prediction: Besni (100.0%)
Example 11 prediction: Besni (100.0%)
Example 12 prediction: Besni (100.0%)


In [213]:
df_predict

Unnamed: 0,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,label,certainty
0,442.246011,253.291155,0.819738,90546,0.758651,1184.04,Besni,0.9998843669891356
1,356.567808,253.254696,0.703943,71879,0.700929,1041.27,Besni,0.9998830556869508
2,454.437216,236.964252,0.853285,82555,0.578256,1175.034,Besni,0.999811828136444
3,286.540559,208.760042,0.684989,47336,0.699599,844.162,Besni,0.9898930191993712
4,352.19077,290.827533,0.564011,81463,0.792772,1073.251,Besni,0.9999998807907104
5,318.125407,200.12212,0.777351,51368,0.658456,881.836,Besni,0.523154616355896
6,310.146072,176.131449,0.823099,43904,0.665894,823.796,Kecimen,0.9255399107933044
7,332.455472,235.429835,0.706058,62329,0.743598,933.366,Besni,0.9999179840087892
8,494.16296,244.803889,0.86867,97707,0.702386,1269.68,Besni,0.9999618530273438
9,469.774755,238.539384,0.86149,92673,0.681044,1226.892,Besni,0.9993539452552797


## Reflection

**Dataset Description**

*   The dataset provides a set of features that describe various physical properties of raisins. These features allows for a classification model that can differentiate between the two classes (i.e Kecimen and Besni) based on measurable attributes.

**Practical Application**

By classifying raisins, the model can be used in quality control processes. For instance, it can help in sorting and grading raisins based on their physical characteristics, ensuring consistency and quality in the final product.

**Communicating to a Marketing Manager**

To effectively communicate the benefits of classification model to a Marketing Manager, I would take the following actions:

*   **Highlight Product Quality Control:** I would explain that the model can automatically sort and grade raisins based on physical characteristics such as area, axis lengths, and eccentricity. This automation ensures consistent high quality.

*   **Emphasize Targeted Marketing Campaigns:** I would describe how the model helps identify key features of different raisin types. This allows for the creation of highly targeted marketing campaigns, promoting specific raisin types to customer segments that prefer those attributes.

*   **Discuss Data-Driven Decisions:** I would convey that the insights from the classification model provide valuable data on product characteristics. This data can be used to make informed decisions on product development, inventory management, and market segmentation.

By taking these steps, I would ensure that the Marketing Manager understands the practical benefits of the classification model.





