In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
from IPython.display import Image
# Display the process-flow diagram stored under the input directory; the image is
# the cell's output and is requested at a 900 x 900 display size.
Image(filename="../input/images/Process flow.png", width= 900,height=900)

## 1. Load Libraries

In [None]:
# Data manipulation libraries
import pandas as pd
import numpy as np

##### Scikit Learn modules needed for Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder,MinMaxScaler , StandardScaler

# Plotting libraries
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes = True)
%matplotlib inline

## 2. Load Data

In [None]:
# Read the reduced training set from the Kaggle input directory
train_csv_path = '../input/deodorant-instant-liking-data/Data_train_reduced.csv'
df = pd.read_csv(train_csv_path)
print(f"Shape of data: {df.shape}")
# Preview the first rows as the cell output
df.head()

## 3. Explore Data

In [None]:
# Print a caption, then show the column labels of the training frame as the cell output
print("Column names:-")
df.columns

In [None]:
# Print a caption, then show descriptive statistics restricted to numeric columns
print("Summary Stats of numberical columns:-")
df.describe(include=[np.number])

In [None]:
# Find columns with blank or nan values
def columns_with_na(dframe):
    """Report and return the labels of the columns that hold at least one NaN.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.Index
        Labels of the columns whose NaN count is greater than zero
        (empty when no column contains NaN). The same index is also printed.
    """
    na_counts = dframe.isna().sum()
    na_columns = na_counts.loc[na_counts > 0].index
    print(f"Columns containing nan values:{na_columns}")
    return na_columns

columns_to_drop = list(columns_with_na(df))
# Drop every COLUMN that contains NaN values (whole columns are removed, not rows).
# NOTE(review): the drop mutates `df` in place, so re-running this cell recomputes
# `columns_to_drop` from the already-reduced frame and yields an empty list; any
# later cell that reads `columns_to_drop` then sees nothing to drop.
df.drop(columns_to_drop, axis='columns',inplace = True)
df.shape

In [None]:
# Visual exploration: correlation matrix of the columns at positions 3..9,
# used to study multicollinearity
correlation = df.iloc[:, 3:10].corr()

# 16 x 16 inch canvas; the annotated heatmap is drawn on this figure's axes
fig, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(correlation, annot=True, cmap="YlGnBu", ax=ax)

In [None]:
# Number of rows per target value; the index labels are turned into strings
# so they can be used directly as categorical tick labels
classes = df["Instant.Liking"].value_counts()
classes.index = list(map(str, classes.index))

In [None]:
fig, ax = plt.subplots()

# One bar per class label; the bar height is the number of rows with that label
ax.bar(x = classes.index, height = classes)
ax.set(title='Count of Like and Dislike of Deodrant brand',
       xlabel='Like & Dislike',
       ylabel='Count')
# Write each count just above the top of its bar
for position, count in enumerate(classes):
    ax.text(position, count + 2, s = int(count), color='blue', fontweight='bold')

## 4. Feature Transformation

- Rescale every feature to the [0, 1] range with a min-max scaler
- As most of the columns hold discrete values, comparably good results can be expected even without scaling

In [None]:
# Feature matrix (columns 4 onward) and target (column 3)
features = df.iloc[:, 4:]
Y = df.iloc[:, 3]

# Fit the scaler on the training rows only, so the held-out rows do not leak their
# min/max into the preprocessing. test_size and random_state deliberately repeat
# the values used by the train/test split cell: with the same number of rows and
# the same seed, train_test_split selects exactly the same training rows.
train_features, _held_out_features = train_test_split(
    features, test_size=0.20, random_state=21
)
minmax = MinMaxScaler()
minmax.fit(train_features)

# Scale all rows with the training-fitted scaler. Held-out values may fall
# slightly outside [0, 1]; that is expected and harmless.
X_minmax = minmax.transform(features)

In [None]:
# Show the scaled feature array together with the first rows of the target
X_minmax, Y.head()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X_minmax, Y, test_size=0.20, random_state=21
)

# Report the shape of each of the four parts
for message, part in (
    ('Shape of Training Xs:{}', x_train),
    ('Shape of Test Xs:{}', x_test),
    ('Shape of Training y:{}', y_train),
    ('Shape of Test y:{}', y_test),
):
    print(message.format(part.shape))

In [None]:
# Fit a logistic regression with default settings (fit returns the estimator itself)
clf = LogisticRegression().fit(x_train, y_train)
# Mean accuracy on the held-out rows, and the predicted label for each of them
score = clf.score(x_test, y_test)
y_predicted = clf.predict(x_test)

In [None]:
# Model score: the value returned by clf.score on the test split,
# followed by the labels predicted for the test rows
print(score)
print(y_predicted)

### Plot Confusion Matrix

In [None]:
# Print floats with two decimals in this and every later numpy printout
np.set_printoptions(precision=2)
# Confusion matrix of true test labels against predicted labels
cnf_matrix = confusion_matrix(y_test, y_predicted)
cnf_matrix

In [None]:
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; rows are labelled 'True label' and columns
        'Predicted label' on the plot.
    classes : sequence
        Tick labels, given in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        If True, every row is divided by its row sum before printing and
        plotting. Rows whose sum is zero are shown as all zeros instead of NaN.
    title : str, default 'Confusion matrix'
        Title placed above the plot.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap used for the cells.

    Returns
    -------
    None
        The function only prints and draws; call plt.show() afterwards if needed.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Safe row-wise division: rows without any sample stay at 0.0 rather than
        # producing NaN and a divide-by-zero warning.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as integers; anything else (normalized values or
    # a float matrix passed in by the caller) is printed with two decimals.
    is_integer_matrix = np.issubdtype(cm.dtype, np.integer)
    fmt = 'd' if (is_integer_matrix and not normalize) else '.2f'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2. if cm.size else 0.
    for i, j in np.ndindex(*cm.shape):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# confusion_matrix orders its rows/columns by sorted label value, whereas
# `classes.index` follows value_counts (most frequent label first). Build the tick
# labels from the sorted labels present in y_test / y_predicted so that every
# tick names the row/column it is attached to.
cm_labels = [str(label) for label in np.union1d(y_test, y_predicted)]

# Without normalization (raw counts)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=cm_labels,
                      title='Confusion matrix, without normalization')
# With normalization (each row divided by its row sum)
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=cm_labels, normalize=True,
                      title='Normalized confusion matrix')

plt.show()

### Log Loss
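- Measures how closely the predicted class probabilities match the true labels; confident but wrong predictions are penalised heavily.
- With `normalize=True` the loss is averaged over the samples. The value is non-negative and has no upper bound (for reference, always predicting 0.5 on a binary target gives ln 2 ≈ 0.693).
- Closer to 0 indicates that the model assigns high probability to the correct class, i.e. its correct predictions are less likely to be due to chance.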

In [None]:
from sklearn.metrics import log_loss

# Evaluate on the held-out rows only: scoring rows the model was trained on makes
# the loss look better than it really is. `labels` pins the meaning of the
# predict_proba columns even if a class happens to be absent from y_test.
norm_log_loss = log_loss(y_test, clf.predict_proba(x_test),
                         normalize=True, labels=clf.classes_)
print(f"Normalized Log loss of the model: {norm_log_loss} ")

## Predict on separate Test data

In [None]:
# Read the separate test file that is only used for the final predictions
validation_csv_path = "../input/deodorant-instant-liking-data/test_data.csv"
validation_df = pd.read_csv(validation_csv_path)
print(f"Size of data frame: {validation_df.shape}")

In [None]:
# Mirror the training-time column removal: collect the validation columns that
# were dropped from the training frame, so both frames keep the same feature set
dropped_in_training = set(columns_to_drop)
columns_to_drop_valid = [
    column for column in validation_df.columns if column in dropped_in_training
]
columns_to_drop_valid

In [None]:
# Reassign instead of dropping in place, and ignore labels that are already gone,
# so that re-running this cell on its own does not raise a KeyError.
validation_df = validation_df.drop(columns=columns_to_drop_valid, errors="ignore")

In [None]:
# Select the validation features by the training feature names rather than by
# position. Training used columns 4 onward of `df`; a positional slice with a
# different offset silently misaligns features whenever the two files differ in
# layout. Selecting by name keeps the training column order, and a missing
# feature column raises a KeyError instead of producing a wrongly ordered matrix.
feature_columns = df.columns[4:]
X_valid = minmax.transform(validation_df.loc[:, feature_columns])
print(f"Shape of scaled feature set: {X_valid.shape}")

In [None]:
# Predict a class label for every row of the scaled validation features
y_predicted_valid = clf.predict(X_valid)

### Save Model to disk

In [None]:
import joblib
# Serialise the fitted classifier to "model.joblib" (a path relative to the
# current working directory)
joblib.dump(clf,"model.joblib")

## Comments
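- The score of 1.0 on the 20% hold-out sample can be attributed to the large input feature set (55 features) adding to the noise in the data. A perfect score is also a common symptom of target leakage, so the features should be checked for columns that directly encode `Instant.Liking` before the result is trusted.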
- One may also apply Principal Component Analysis (PCA) to reduce the feature space, or randomly drop a few features at a time, to verify the impact on accuracy.