In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso,Ridge
from sklearn.metrics import mean_squared_error,r2_score,confusion_matrix,ConfusionMatrixDisplay,roc_curve,RocCurveDisplay,auc,precision_recall_fscore_support,precision_score
import joblib 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler,Binarizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from feature_engine.wrappers import SklearnTransformerWrapper as TransformerWrapper



In [13]:
data = pd.read_csv("candy-data.csv")

# remove competitorname feature 
data.drop(columns='competitorname',inplace=True)


# We start by separating the set into train and test set 
x_train,x_test,y_train,y_test = train_test_split(data.drop(["chocolate"],inplace=False,axis=1),data["chocolate"],test_size=0.2,random_state=0,stratify=data['chocolate'])

# reset index 
y_train.reset_index(drop=True,inplace=True)
y_test.reset_index(drop=True,inplace=True)
x_train.reset_index(drop=True,inplace=True)
x_test.reset_index(drop=True,inplace=True)



In [15]:
print(y_train.shape)
print(y_test.shape )
print(y_train.head(2))
print(y_test.head(2))
print(x_train.head(2))
print(x_test.head(2))


(68,)
(17,)
0    0
1    1
Name: chocolate, dtype: int64
0    0
1    0
Name: chocolate, dtype: int64
   fruity  caramel  peanutyalmondy  nougat  crispedricewafer  hard  bar  \
0       0        1               0       0                 0     0    0   
1       0        0               0       0                 0     0    0   

   pluribus  sugarpercent  pricepercent  winpercent  
0         0         0.418         0.325   32.230995  
1         1         0.825         0.651   66.574585  
   fruity  caramel  peanutyalmondy  nougat  crispedricewafer  hard  bar  \
0       1        0               0       0                 0     1    0   
1       1        0               0       0                 0     0    0   

   pluribus  sugarpercent  pricepercent  winpercent  
0         0         0.732         0.034   39.460556  
1         1         0.069         0.116   52.825947  


In [21]:
CONT_VARS = [ var for var in x_train.columns if len(x_train[var].unique()) > 20   ]
CONT_VARS

['sugarpercent', 'pricepercent', 'winpercent']

In [36]:
# continuous variables
pipe = Pipeline([   

    ( 'featuresel', SelectFromModel (estimator=LinearSVC(C=0.04,verbose=3,random_state=0),max_features=len(x_train.columns))) ,
    ('scaler', MinMaxScaler()),
    ('model',LinearSVC(C=0.04,verbose=1,random_state=0) )
])

In [37]:
pipe.fit(x_train,y_train)
y_train_pred = pipe.predict(x_train)
y_test_pred = pipe.predict(x_test)


[LibLinear][LibLinear]



In [38]:
# we can now evaluate our model 
prec,recall,fscore,_ = precision_recall_fscore_support(y_test,y_test_pred,pos_label=1,average='binary')
print(recall)
print(prec)
print(fscore)


1.0
0.7
0.8235294117647058


In [35]:
# we can access the parameters of every transformer using the named_steps[name_of_transformer]. 
# in our pipeline 
#print(pipe.named_steps['featuresel'].get_support())

[ True False False False False False  True  True False False False]
