In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model

## Predict Scent Quality Score 
Predicting the Scent Quality Score for any new perfume based on the quantity of scent chemicals used.

To predict the Scent Quality Score, we will use linear regression.

In [30]:
# Load both inputs up front, in the same order as before: the scored perfumes
# used for fitting, then the per-customer preference mixtures.
df_score, df_preference = (
    pd.read_csv(csv_name)
    for csv_name in ("perfume-score.csv", "perfume-preference.csv")
)

In [23]:
# Preview the first five scored perfumes (5 is also the default for head()).
df_score.head(n=5)

Unnamed: 0,Narcissus,Agrumen,Oud,Jasmine,Amber,Neroli,Indole,Vanilla,Frankincense,Bergamot,Galbanum,Magnolia,Sandalwood,Cashmeran,Citron,Opopanax,Aliphatic Aldehydes,Vetiver,Scent Quality Score
0,489.766,343.51,638.519,315.377,966.417,913.256,1015.036,479.027,485.797,2918.050062,108.538,727.438,936.842,4801.306119,261.952,148.593,783.264,809.541,13027000.0
1,472.841,218.288,642.332,210.582,995.068,989.447,958.614,507.113,242.015,2119.07484,246.654,755.477,840.936,4896.31559,149.498,44.49,906.204,815.512,11590730.0
2,472.62,323.48,696.77,288.379,1006.334,875.163,987.398,611.463,410.451,2679.139347,281.022,729.155,825.386,5350.521973,177.98,141.612,705.294,794.394,13676930.0
3,503.155,397.632,644.533,151.414,960.097,905.462,1031.227,469.357,388.405,1784.035393,280.953,711.906,786.198,5029.939322,29.515,149.231,678.681,837.614,7997427.0
4,499.78,344.096,643.764,353.518,1033.988,978.976,871.312,439.266,311.002,3236.214279,272.058,737.003,898.238,4988.788504,138.884,122.238,622.09,824.174,11132900.0


In [24]:
# Snapshot the column labels of the score table as a plain Python list so they
# can be printed and indexed by position.
column_names = df_score.columns.tolist()
print(column_names)

['Narcissus', 'Agrumen', 'Oud', 'Jasmine', 'Amber', 'Neroli', 'Indole', 'Vanilla', 'Frankincense', 'Bergamot', 'Galbanum', 'Magnolia', 'Sandalwood', 'Cashmeran', 'Citron', 'Opopanax', 'Aliphatic Aldehydes', 'Vetiver', 'Scent Quality Score']


Now fit a linear regression: the features `x` are the quantities of each chemical, and the target `y` is the Scent Quality Score.

In [25]:
# Features: the quantity of each of the 18 scent chemicals.
feature_columns = [
    'Narcissus', 'Agrumen', 'Oud', 'Jasmine', 'Amber', 'Neroli',
    'Indole', 'Vanilla', 'Frankincense', 'Bergamot', 'Galbanum', 'Magnolia',
    'Sandalwood', 'Cashmeran', 'Citron', 'Opopanax', 'Aliphatic Aldehydes',
    'Vetiver',
]
x = df_score.loc[:, feature_columns]

# Target: the score the model learns to predict.
y = df_score.loc[:, 'Scent Quality Score']

# Ordinary least-squares fit; the fit call stays last so the cell still
# displays the fitted estimator.
regr = linear_model.LinearRegression()
regr.fit(x, y)

LinearRegression()

Now that the model is fitted, we can use 
```
regr.predict()
```
to predict the Scent Quality Score.

The following is one example.

In [26]:
# Example mixture: one quantity per chemical, listed in the same order as the
# training columns of `x`. These values match customer C_0000001 in the
# preference data shown further down.
example_mixture = pd.DataFrame(
    [[1353, 1252, 4066, 3838, 2144, 4404, 32082, 3866, 2505,
      3972, 4485, 6441, 4106, 1722, 4287, 4820, 4140, 1463]],
    columns=x.columns,
)

# The model was fitted on a DataFrame, so predict on a DataFrame carrying the
# same column names. A bare nested list makes scikit-learn (1.0+) warn that
# "X does not have valid feature names" and silently relies on the values
# being in exactly the right positional order.
regr.predict(example_mixture)



array([1.24290148e+08])

## Understanding if any scent chemicals are strongly related
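We will check the coefficients to understand the related chemicals.

- All chemicals with a positive coefficient have a similar, positive response: with the other quantities held fixed, increasing their quantity raises the predicted Scent Quality Score.
- All chemicals with a negative coefficient have a similar, negative response: with the other quantities held fixed, increasing their quantity lowers the predicted Scent Quality Score.

Note that the sign of a coefficient only groups chemicals by the direction of their effect on the score; it does not by itself measure how strongly the chemicals are correlated with one another.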

In [27]:
# Fitted coefficients of the linear model: one value per column of `x`, in the
# same order as those columns.
regr.coef_

array([ 3.94491376e+03,  2.08173923e+03,  1.30765741e+02, -1.82721593e+02,
        7.76550047e+02,  1.59809609e+00,  3.58225567e+01,  1.54745736e+04,
        7.93763594e+01,  4.01216462e+03,  6.73005407e+02,  2.38734209e+02,
       -1.35336339e+01, -1.01948466e+01,  1.44698553e+02,  2.21238119e+01,
        1.24653485e+04,  2.56996386e+02])

In [28]:
# Chemicals whose fitted coefficient is strictly positive (a coefficient of
# exactly 0 is deliberately left out).
# Names are taken from `x.columns`, the features the model was actually fitted
# on, so each coefficient is paired with its own feature. Indexing the full
# column list of df_score (which also contains the target) only lines up as
# long as the features happen to be its leading columns in the same order.
positive_coef_columns = [
    feature for feature, coef in zip(x.columns, regr.coef_) if coef > 0
]
print(positive_coef_columns)

['Narcissus', 'Agrumen', 'Oud', 'Amber', 'Neroli', 'Indole', 'Vanilla', 'Frankincense', 'Bergamot', 'Galbanum', 'Magnolia', 'Citron', 'Opopanax', 'Aliphatic Aldehydes', 'Vetiver']


In [29]:
# Chemicals whose fitted coefficient is strictly negative (a coefficient of
# exactly 0 is deliberately left out).
# Names are taken from `x.columns`, the features the model was actually fitted
# on, so each coefficient is paired with its own feature. Indexing the full
# column list of df_score (which also contains the target) only lines up as
# long as the features happen to be its leading columns in the same order.
negative_coef_columns = [
    feature for feature, coef in zip(x.columns, regr.coef_) if coef < 0
]
print(negative_coef_columns)

['Jasmine', 'Sandalwood', 'Cashmeran']


## Understanding if there are specific ‘types’ of customer who prefer specific kinds of mixture

We will use the linear regression model to predict the Scent Quality Score for each customer's preference.

In [31]:
# Preview the first five customer preference rows (5 is also the default for
# head()).
df_preference.head(n=5)

Unnamed: 0,Customer No.,Narcissus,Agrumen,Oud,Jasmine,Amber,Neroli,Indole,Vanilla,Frankincense,Bergamot,Galbanum,Magnolia,Sandalwood,Cashmeran,Citron,Opopanax,Aliphatic Aldehydes,Vetiver
0,C_0000001,1353.0,1252.0,4066.0,3838.0,2144.0,4404.0,32082,3866.0,2505.0,3972.0,4485.0,6441,4106.0,1722.0,4287.0,4820.0,4140.0,1463.0
1,C_0000002,1089.0,2152.0,4045.0,3710.0,2235.0,4352.0,30398,4769.0,2995.0,4720.0,4532.0,10931,3794.0,1638.0,4648.0,4472.0,4184.0,1071.0
2,C_0000003,4177.0,3592.0,3596.0,1745.0,3234.0,2116.0,21678,4864.0,3178.0,3381.0,1376.0,18153,2502.0,1733.0,1747.0,2728.0,4580.0,4742.0
3,C_0000004,4899.0,3738.0,2454.0,3976.0,4945.0,3853.0,17963,3040.0,2943.0,2870.0,4016.0,18819,1990.0,5118.0,2391.0,2012.0,3470.0,3057.0
4,C_0000005,4822.0,4030.0,3447.0,4225.0,4078.0,3772.0,23988,3389.0,2415.0,2695.0,3887.0,20367,2118.0,4530.0,2427.0,3205.0,4319.0,2289.0
