In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve 

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

The data contains the following fields:

Header Description

chocolate -> Does it contain chocolate?

fruity -> Is it fruit flavored?

caramel -> Is there caramel in the candy?

peanutyalmondy -> Does it contain peanuts, peanut butter or almonds?

nougat -> Does it contain nougat?

crispedricewafer -> Does it contain crisped rice, wafers, or a cookie component?

hard -> Is it a hard candy?

bar -> Is it a candy bar?

pluribus -> Is it one of many candies in a bag or box?

sugarpercent -> The percentile of sugar it falls under within the data set.

pricepercent -> The unit price percentile compared to the rest of the set.

winpercent -> The overall win percentage according to 269,000 matchups.

In [None]:
# Read the candy ranking CSV from the Kaggle input directory and preview the first ten rows.
candy_csv = '/kaggle/input/the-ultimate-halloween-candy-power-ranking/candy-data.csv'
candy = pd.read_csv(candy_csv)
candy.head(n=10)

In [None]:
# Print pandas' concise summary of the `candy` frame.
candy.info()

In [None]:
# Number of missing entries in each column of `candy`.
missing_per_column = candy.isna().sum()
missing_per_column

## Exploratory Data Analysis

In [None]:
# Ingredient columns of `candy` whose value frequencies are plotted below.
fillings = ["chocolate", "fruity", "caramel", "peanutyalmondy", "nougat", "crispedricewafer"]


def count(inside, ax=None):
    """Draw a count plot of one column of the module-level `candy` frame.

    Parameters
    ----------
    inside : str
        Name of the `candy` column to tally.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. When omitted, seaborn draws on the current Axes,
        which is what the original one-argument call did.
    """
    sns.countplot(x=inside, data=candy, ax=ax)


# One panel per ingredient. The Axes created by plt.subplots are used directly
# instead of being re-selected through the pyplot state machine.
fig, axes = plt.subplots(2, 3, figsize=(12, 10))
for ax, filling in zip(axes.flat, fillings):
    count(filling, ax=ax)
fig.tight_layout()


In [None]:
# Numeric columns of `candy` compared across the ingredient groups below.
var = ['sugarpercent', 'pricepercent', 'winpercent']


def box(inside, var, c, ax=None):
    """Box plot with an overlaid swarm plot of `var` grouped by `inside`.

    Both columns are read from the module-level `candy` frame.

    Parameters
    ----------
    inside : str
        Grouping column, placed on the x axis.
    var : str
        Column plotted on the y axis.
    c : str
        Palette name passed to `sns.boxplot`.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. When omitted, the current Axes is used.
    """
    sns.boxplot(x=inside, y=var, data=candy, palette=c, ax=ax)
    sns.swarmplot(x=inside, y=var, data=candy, color='.25', ax=ax)
    target_ax = ax if ax is not None else plt.gca()
    # The title names the column actually plotted; it used to say
    # "win percentage" for every panel, including sugar and price.
    target_ax.set_title('Distribution of {} by {}'.format(var, inside))


# One row of three panels per ingredient flag, each with its own palette.
for flag, palette_name in (("chocolate", "magma"), ("fruity", "coolwarm")):
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    for ax, metric in zip(axes, var):
        box(flag, metric, palette_name, ax=ax)
    fig.tight_layout()

In [None]:
# Reorder `candy` from highest to lowest win percentage, then draw one
# horizontal bar per competitor in that order.
candy = candy.sort_values("winpercent", ascending=False)

plt.figure(figsize=(16, 16))
sns.barplot(data=candy, x="winpercent", y="competitorname")

# Regression Classification

### Can you predict whether a candy is hard or not based on its other features?

In [None]:
# Features: every column except the candy name and the target.
# Target: the `hard` column, taken as a NumPy array.
target_column = 'hard'
X = candy.drop(columns=['competitorname', target_column])
y = candy[target_column].to_numpy()

In [None]:
# Rescale every feature column to the [0, 1] range, keeping the column names.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so held-out rows contribute to the scaling range.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=X.columns)
X

In [None]:
# Hold out 25% of the rows for evaluation.
# A fixed random_state makes the split (and every metric computed from it)
# repeatable across runs. stratify=y keeps the class proportions of y the
# same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

### 1.Logistic regression

In [None]:
# Fit a logistic-regression classifier with default settings on the training
# rows, then report its mean accuracy on the held-out rows.
model_LR = LogisticRegression().fit(X_train, y_train)

lr_accuracy = model_LR.score(X_test, y_test)
print("ModelAccuracy:", lr_accuracy)

In [None]:
# Predicted labels for the test rows and the resulting confusion matrix
# (rows: true class, columns: predicted class).
y_pred_LR = model_LR.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred_LR)

In [None]:
# Text summary of the logistic-regression predictions on the test rows.
report_LR = classification_report(y_true=y_test, y_pred=y_pred_LR)
print(report_LR)

In [None]:
# Positive-class probabilities feed BOTH the AUC and the ROC curve.
# The AUC used to be computed from hard 0/1 predictions, which scores a
# single operating point and does not match the plotted curve.
proba_LR = model_LR.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, proba_LR)
fpr, tpr, thresholds = roc_curve(y_test, proba_LR)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='AUC (area = %0.2f)' % logit_roc_auc)
ax.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('LR-ROC')
# Without a legend the AUC label attached to the curve is never displayed.
ax.legend(loc='lower right')
plt.show()

### 2.K-Neighbors

In [None]:
# Fit a k-nearest-neighbours classifier with default settings on the training
# rows, then report its mean accuracy on the held-out rows.
model_KNN = KNeighborsClassifier().fit(X_train, y_train)

knn_accuracy = model_KNN.score(X_test, y_test)
print("Model Accuracy:", knn_accuracy)

In [None]:
# Predicted labels for the test rows and the resulting confusion matrix
# (rows: true class, columns: predicted class).
y_pred_KNN = model_KNN.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred_KNN)

In [None]:
# Text summary of the k-nearest-neighbours predictions on the test rows.
report_KNN = classification_report(y_true=y_test, y_pred=y_pred_KNN)
print(report_KNN)

In [None]:
# Positive-class probabilities feed BOTH the AUC and the ROC curve.
# The AUC used to be computed from hard 0/1 predictions, which scores a
# single operating point and does not match the plotted curve.
proba_KNN = model_KNN.predict_proba(X_test)[:, 1]
KNN_roc_auc = roc_auc_score(y_test, proba_KNN)
fpr1, tpr1, thresholds1 = roc_curve(y_test, proba_KNN)

fig, ax = plt.subplots()
ax.plot(fpr1, tpr1, label='AUC (area = %0.2f)' % KNN_roc_auc)
ax.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('KNN-ROC')
# Without a legend the AUC label attached to the curve is never displayed.
ax.legend(loc='lower right')
plt.show()
