In [None]:
# Welcome to the Emojify Challenge!

In [None]:
##################################################
# Imports
##################################################

import numpy as np
import cv2
import os
import pandas as pd
import matplotlib.pyplot as plt
import emoji
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegressionCV
from sklearn import preprocessing
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_precision_recall_curve
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import learning_curve
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

##################################################
# Params
##################################################

DATA_BASE_FOLDER = '/kaggle/input/emojify-challenge'


##################################################
# Utils
##################################################

def label_to_emoji(label):
    """
    Converts a label (int or string) into the corresponding emoji code (string) ready to be printed.

    The label is stringified and looked up in the module-level ``emoji_dictionary``
    (assigned in the dataset-loading cell), so that cell must have run before
    this function is called.
    """
    emoji_code = emoji_dictionary[str(label)]
    try:
        return emoji.emojize(emoji_code, use_aliases=True)
    except TypeError:
        # emoji >= 2.0 removed the `use_aliases` keyword; alias names such as
        # ':smile:' are selected with `language='alias'` instead.
        return emoji.emojize(emoji_code, language='alias')


def accuracy(y_pred, y_true):
    """
    Fraction of predictions that equal the true labels.

    Defined here, next to the other utilities, because the model cells call it
    long before the Evaluation section (which declares the same function) is
    reached on a fresh Restart & Run All.

    input y_pred: ndarray of shape (N,)
    input y_true: ndarray of shape (N,)
    """
    return (1.0 * (y_pred == y_true)).mean()

# Dataset

In [None]:
##################################################
# Load dataset
##################################################

# Read the train / validation tables and keep their label columns at hand.
train_csv_path = os.path.join(DATA_BASE_FOLDER, 'train.csv')
validation_csv_path = os.path.join(DATA_BASE_FOLDER, 'validation.csv')
df_train = pd.read_csv(train_csv_path)
y_train = df_train['class']
df_validation = pd.read_csv(validation_csv_path)
y_validation = df_validation['class']

# Class label (as a string) -> emoji code consumed by label_to_emoji().
emoji_dictionary = {
    '0': '\u2764\uFE0F',
    '1': ':baseball:',
    '2': ':smile:',
    '3': ':disappointed:',
    '4': ':fork_and_knife:'
}

# Show the first ten training phrases next to the emoji of their label.
print('EXAMPLES:\n####################')
train_phrases = df_train["phrase"]
for idx in range(10):
    phrase = train_phrases[idx]
    phrase_emoji = label_to_emoji(y_train[idx])
    print(f'{phrase} -> {phrase_emoji}')

# Word embeddings

Words can be represented as n-dimensional vectors in which the distance between points reflects the similarity between word meanings (similar words are closer, while dissimilar ones are farther apart). This representation is known as word embeddings; here it is extracted and pre-computed from the [GloVe](https://nlp.stanford.edu/projects/glove/) model. 

Here is depicted an example of bi-dimensional word embeddings:
![word embedding](https://shanelynnwebsite-mid9n9g1q9y8tt.netdna-ssl.com/wp-content/uploads/2018/01/word-vector-space-similar-words.jpg)

In our case a single word is represented by a vector of length 25.

# Phrase representation

All the phrases are padded to the length of the longest phrase, in this case `max_len = 10`, and each phrase is represented by the concatenation of its word embeddings (each phrase is thus a 10 * 25 = 250-dimensional vector).

In [None]:
# Load phrase representation
train_embeddings_path = os.path.join(DATA_BASE_FOLDER, 'train.npy')
validation_embeddings_path = os.path.join(DATA_BASE_FOLDER, 'validation.npy')

# One row per dataframe entry; every remaining axis is merged into the columns.
x_train = np.load(train_embeddings_path).reshape(len(df_train), -1)
x_validation = np.load(validation_embeddings_path).reshape(len(df_validation), -1)

embedding_size = x_train.shape[-1]
print(f'Word embedding size: {embedding_size}')

exploratory data analysis

In [None]:
# `shape` is a tuple attribute of the array, not a method; calling it raised
# "TypeError: 'tuple' object is not callable".
x_train.shape

First of all, we want to see whether the dataset is balanced across the classes.

In [None]:
# Histogram of the training labels, to eyeball how the classes are distributed.
class_labels = df_train['class']
class_labels.plot.hist()
plt.show()

As the graph shows, the classes are unbalanced, so we evaluate our models with precision-recall curves and mAP, which are usually more robust on unbalanced data.

Preprocessing data 

Standardizing our dataset

In [None]:
# Learn the per-feature scaling statistics on the training set only,
# then apply them to that same training set.
scaler = preprocessing.StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)

# Model

Here you have to implement a model (or more models, for finding the most accurate) for classification.

You can use the sklearn (or optionally other more advanced frameworks such as pytorch or tensorflow) package that contains a pool of models already implemented that perform classification. (SVMs, NNs, LR, kNN, ...)

logistic regression
implementation:

In [None]:
# Baseline logistic regression with scikit-learn's default settings.
lr_classification = LogisticRegression()
lr_fit = lr_classification.fit(x_train, y_train)
y_pred = lr_fit.predict(x_validation)
print(y_pred)
print(accuracy(y_pred, y_validation))
# classification_report expects (y_true, y_pred); with the arguments swapped
# the per-class precision and recall columns were exchanged.
print(classification_report(y_validation, y_pred))

Now we want to optimize our logistic regression model.

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over the logistic-regression hyper-parameters below.
# NOTE(review): several penalty/solver pairs in this grid are not supported by
# LogisticRegression (e.g. 'l1' with 'lbfgs'); presumably those fits fail and
# are scored through GridSearchCV's `error_score` -- confirm against the
# installed scikit-learn version.
params={
    'C':[0.01, 0.05, 0.1, 0.5, 1],
    'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'fit_intercept' : [True, False],
    'class_weight': ['balanced', None],
    'multi_class': ['auto', 'ovr', 'multinomial']
}
lr = LogisticRegression()
grid_search_lr = GridSearchCV(estimator=lr, param_grid= params)
grid_search_lr.fit(x_train, y_train)

# Report the winning combination and its cross-validated score; previously the
# search result was never inspected, so the "optimized" model in the next cell
# could not be checked against it.
print(grid_search_lr.best_params_)
print(grid_search_lr.best_score_)

In [None]:
# Logistic regression with the hyper-parameters retained after tuning.
lrc_opt = LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False)
lr_fit_opt=lrc_opt.fit(x_train, y_train)
y_pred_opt = lr_fit_opt.predict(x_validation)
print(y_pred_opt)
print(accuracy(y_pred_opt, y_validation))
# classification_report expects (y_true, y_pred); the swapped order exchanged
# the precision and recall columns.
print(classification_report(y_validation, y_pred_opt))


In [None]:
# Logistic regression with built-in cross-validation, for several fold counts.
for c in (5, 10, 15):
    lrc_CV = LogisticRegressionCV(cv=c, penalty='l2', fit_intercept=False, multi_class='ovr', class_weight = 'balanced')
    lrc_CV.fit(x_train, y_train)
    # Use a dedicated name so the baseline model's `y_pred` is not overwritten.
    y_pred_cv = lrc_CV.predict(x_validation)
    # Label each run; otherwise the three reports cannot be told apart.
    print(f'cv = {c}')
    print(y_pred_cv)
    print(accuracy(y_pred_cv, y_validation))
    # classification_report expects (y_true, y_pred).
    print(classification_report(y_validation, y_pred_cv))

Now that we have tuned the parameters, we standardize the training data and see what happens.

In [None]:
# Apply standardization to our best model.
# A fresh estimator with the same hyper-parameters is fitted, so that `lrc_opt`
# (the same object as `lr_fit_opt`) keeps the fit on the unscaled features that
# the evaluation plots below rely on.
lrc_opt_scaled = LogisticRegression(**lrc_opt.get_params())
lrc_opt_scaled.fit(x_train_scaled, y_train)
# The validation features must go through the same scaler as the training
# features; predicting on the raw `x_validation` mixed two feature scales.
y_pred_scaled = lrc_opt_scaled.predict(scaler.transform(x_validation))
print(y_pred_scaled)
print(accuracy(y_pred_scaled, y_validation))
# classification_report expects (y_true, y_pred).
print(classification_report(y_validation, y_pred_scaled))

now we want to evaluate our model:

confusion matrix

In [None]:
# One confusion matrix per model, each titled so that the baseline and the
# tuned figure can be told apart.
lr_cm = plot_confusion_matrix(lr_fit, x_validation, y_validation)
lr_cm.ax_.set_title("logistic regression confusion matrix")
lr_cm_opt = plot_confusion_matrix(lr_fit_opt, x_validation, y_validation)
lr_cm_opt.ax_.set_title("logistic regression confusion matrix opt")
plt.show()

precision and recall curve

In [None]:
# Per-class precision-recall curves: baseline model first, tuned model second.
y_pred_prob = lr_classification.predict_proba(x_validation)
y_pred_prob_opt = lrc_opt.predict_proba(x_validation)
# One indicator column per class, matched against that class's probability column.
y_validation_bin = label_binarize(y_validation, classes=[0, 1, 2, 3, 4])

for class_probs, figure_title in (
    (y_pred_prob, "precision vs. recall curve"),
    (y_pred_prob_opt, "precision vs. recall curve opt"),
):
    precision, recall = {}, {}
    for i in range(5):
        precision[i], recall[i], _ = precision_recall_curve(y_validation_bin[:, i], class_probs[:, i])
        plt.plot(recall[i], precision[i], lw=2, label='class {}'.format(i))

    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.legend(loc="best")
    plt.title(figure_title)
    plt.show()

ROC curve

In [None]:
# Per-class ROC curves for the baseline logistic regression.
# Uses `y_validation_bin` and `y_pred_prob` from the precision-recall cell.
tpr = dict()
fpr = dict()
for i in range(5):
    fpr[i], tpr[i], _ = roc_curve(y_validation_bin[:, i], y_pred_prob[:, i])
    plt.plot(fpr[i], tpr[i], lw=2, label='class {}'.format(i))

plt.xlabel("false positive rate")
plt.ylabel("true positive rate")
plt.legend(loc="best")
plt.title("ROC curve")
plt.show()

# Per-class ROC curves for the tuned logistic regression.
tpr_opt = dict()
fpr_opt = dict()
for i in range(5):
    fpr_opt[i], tpr_opt[i], _ = roc_curve(y_validation_bin[:, i], y_pred_prob_opt[:, i])
    # Plot the tuned model's own curve; this line used to re-plot the baseline
    # `fpr`/`tpr`, so both figures were identical.
    plt.plot(fpr_opt[i], tpr_opt[i], lw=2, label='class {}'.format(i))

plt.xlabel("false positive rate")
plt.ylabel("true positive rate")
plt.legend(loc="best")
plt.title("ROC curve opt")
plt.show()


diagnosis of bias and variance with learning curve

In [None]:
# Learning curve of the baseline logistic regression.
train_size, train_score, valid_score = learning_curve(lr_classification, x_train, y_train)
# Scores have one row per training size and one column per CV fold.
train_mean = np.mean(train_score, axis=1)
train_std = np.std(train_score, axis=1)
test_mean = np.mean(valid_score, axis=1)
test_std = np.std(valid_score, axis=1)

#plot
plt.plot(train_size, train_mean, label="training score")
plt.plot(train_size, test_mean, label = "validation score")
plt.title("Learning Curve")
plt.xlabel("training size")
plt.ylabel("score")
plt.legend(loc="best")
plt.show()

# Learning curve of the tuned logistic regression.
train_size_opt, train_score_opt, valid_score_opt = learning_curve(lrc_opt, x_train, y_train)
# Aggregate the tuned model's own scores; these statistics used to be computed
# from the baseline `train_score` / `valid_score`, so the "opt" figure simply
# repeated the baseline curve.
train_mean_opt = np.mean(train_score_opt, axis=1)
train_std_opt = np.std(train_score_opt, axis=1)
test_mean_opt = np.mean(valid_score_opt, axis=1)
test_std_opt = np.std(valid_score_opt, axis=1)

#plot
plt.plot(train_size_opt, train_mean_opt, label="training score")
plt.plot(train_size_opt, test_mean_opt, label = "validation score")
plt.title("Learning Curve opt")
plt.xlabel("training size")
plt.ylabel("score")
plt.legend(loc="best")
plt.show()

support vector machine 
implementation:

In [None]:
# Baseline SVM; probability=True is needed later for predict_proba.
svm_classifier = SVC(probability=True)
svm_fit = svm_classifier.fit(x_train, y_train)
y_pred = svm_fit.predict(x_validation)
print(y_pred)
print(accuracy(y_pred, y_validation))
# classification_report expects (y_true, y_pred); the swapped order exchanged
# the precision and recall columns.
print(classification_report(y_validation, y_pred))
    

optimization:

In [None]:
# Try every combination of kernel, C, class weight and gamma, and print the
# validation accuracy of each.
for k in ('linear', 'poly', 'rbf', 'sigmoid'):
    for c in (0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.8, 1, 10, 100):
        for cw in (None, 'balanced'):
            for g in ('scale', 'auto'):
                # Use a dedicated name: rebinding `svm_classifier` here made the
                # later "normal svm" plots use the last candidate of this loop
                # instead of the baseline model.
                # probability=True is dropped because only predict() is called
                # here, so the extra probability calibration was wasted work.
                svm_candidate = SVC(C=c, kernel=k, gamma=g, class_weight=cw)
                svm_candidate.fit(x_train, y_train)
                y_pred_param = svm_candidate.predict(x_validation)
                print(f'k = {k} c = {c} cw ={cw} gamma = {g} accuracy={accuracy(y_pred_param, y_validation)}  ')
    

We can conclude that our best SVM model has kernel='rbf', gamma='auto', C=10, class_weight=None.

In [None]:
# SVM with the best hyper-parameters found by the search above.
svmc_opt = SVC(C=10, kernel='rbf', gamma='auto', probability=True)
svm_fit_opt = svmc_opt.fit(x_train, y_train)
y_pred_opt = svm_fit_opt.predict(x_validation)
print(y_pred_opt)
print(accuracy(y_pred_opt, y_validation))
# classification_report expects (y_true, y_pred); the swapped order exchanged
# the precision and recall columns.
print(classification_report(y_validation, y_pred_opt))

In [None]:
# Apply standardization to our best model.
# A fresh estimator with the same hyper-parameters is fitted, so that `svmc_opt`
# (the same object as `svm_fit_opt`) keeps the fit on the unscaled features that
# the evaluation plots below rely on.
svmc_opt_scaled = SVC(**svmc_opt.get_params())
svmc_opt_scaled.fit(x_train_scaled, y_train)
# The validation features must go through the same scaler as the training
# features; predicting on the raw `x_validation` mixed two feature scales.
y_pred_scaled = svmc_opt_scaled.predict(scaler.transform(x_validation))
print(y_pred_scaled)
print(accuracy(y_pred_scaled, y_validation))
# classification_report expects (y_true, y_pred).
print(classification_report(y_validation, y_pred_scaled))

evaluation of the model:

confusion matrix 

In [None]:
# One confusion matrix per model, each titled so that the baseline and the
# tuned figure can be told apart.
svm_cm = plot_confusion_matrix(svm_fit, x_validation, y_validation)
svm_cm.ax_.set_title("svm confusion matrix")
svm_cm_opt = plot_confusion_matrix(svm_fit_opt, x_validation, y_validation)
svm_cm_opt.ax_.set_title("svm confusion matrix opt")
plt.show()


precision recall curve

In [None]:
# Baseline SVM classifier: per-class precision-recall curves.
# `svm_fit` is used rather than `svm_classifier`: the tuning loop rebinds
# `svm_classifier`, so by this point that name refers to the last candidate
# tried there, not the baseline model.
y_pred_prob = svm_fit.predict_proba(x_validation)
# One indicator column per class, matched against that class's probability column.
y_validation_bin = label_binarize(y_validation, classes=[0, 1, 2, 3, 4])
precision = dict()
recall = dict()
for i in range(5):
    precision[i], recall[i], _ = precision_recall_curve(y_validation_bin[:, i], y_pred_prob[:, i])
    plt.plot(recall[i], precision[i], lw=2, label='class {}'.format(i))

plt.xlabel("recall")
plt.ylabel("precision")
plt.legend(loc="best")
plt.title("precision vs. recall curve")
plt.show()

# Tuned SVM classifier: per-class precision-recall curves.
y_pred_prob_opt = svmc_opt.predict_proba(x_validation)
precision = dict()
recall = dict()
for i in range(5):
    precision[i], recall[i], _ = precision_recall_curve(y_validation_bin[:, i], y_pred_prob_opt[:, i])
    plt.plot(recall[i], precision[i], lw=2, label='class {}'.format(i))

plt.xlabel("recall")
plt.ylabel("precision")
plt.legend(loc="best")
plt.title("precision vs. recall curve opt")
plt.show()

ROC curve 

In [None]:
# Per-class ROC curves: baseline SVM first, tuned SVM second.
# Uses `y_validation_bin`, `y_pred_prob` and `y_pred_prob_opt` from the
# precision-recall cell.
for class_probs, figure_title in (
    (y_pred_prob, "ROC curve"),
    (y_pred_prob_opt, "ROC curve opt"),
):
    tpr, fpr = {}, {}
    for i in range(5):
        fpr[i], tpr[i], _ = roc_curve(y_validation_bin[:, i], class_probs[:, i])
        plt.plot(fpr[i], tpr[i], lw=2, label='class {}'.format(i))

    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    plt.legend(loc="best")
    plt.title(figure_title)
    plt.show()

learning curve

In [None]:
# Learning curve of the baseline SVM.
# `svm_fit` is used rather than `svm_classifier`: the tuning loop rebinds
# `svm_classifier`, so by this point that name refers to the last candidate
# tried there, not the baseline model.
train_size, train_score, valid_score = learning_curve(svm_fit, x_train, y_train)
# Scores have one row per training size and one column per CV fold.
train_mean = np.mean(train_score, axis=1)
train_std = np.std(train_score, axis=1)
test_mean = np.mean(valid_score, axis=1)
test_std = np.std(valid_score, axis=1)

#plot
plt.plot(train_size, train_mean, label="training score")
plt.plot(train_size, test_mean, label = "validation score")
plt.title("Learning Curve")
plt.xlabel("training size")
plt.ylabel("score")
plt.legend(loc="best")
plt.show()

# Learning curve of the tuned SVM.
train_size_opt, train_score_opt, valid_score_opt = learning_curve(svmc_opt, x_train, y_train)
train_mean_opt = np.mean(train_score_opt, axis=1)
train_std_opt = np.std(train_score_opt, axis=1)
test_mean_opt = np.mean(valid_score_opt, axis=1)
test_std_opt = np.std(valid_score_opt, axis=1)

#plot
# Plot the tuned model's statistics; this figure used to re-plot the baseline
# `train_size` / `train_mean` / `test_mean` arrays.
plt.plot(train_size_opt, train_mean_opt, label="training score")
plt.plot(train_size_opt, test_mean_opt, label = "validation score")
plt.title("Learning Curve opt")
plt.xlabel("training size")
plt.ylabel("score")
plt.legend(loc="best")
plt.show()


knn
implementation:

In [None]:
# Baseline k-nearest-neighbours classifier with k = 5.
knn = KNeighborsClassifier(n_neighbors= 5)
knn_fit = knn.fit(x_train, y_train)
y_pred = knn.predict(x_validation)
print(y_pred)
print(accuracy(y_pred,y_validation))
# classification_report expects (y_true, y_pred); the swapped order exchanged
# the precision and recall columns.
print(classification_report(y_validation, y_pred))

optimization:
We try all the different parameters to find the best combination.


In [None]:
# Try every combination of search algorithm, weighting scheme and neighbour
# count, and print the validation accuracy of each.
for a in ('auto', 'ball_tree', 'kd_tree', 'brute'):
    for w in ('uniform', 'distance'):
        for nn in (1, 3, 5, 10):
            # Use a dedicated name: rebinding `knn` here made the later
            # "normal KNN" plots use the last candidate of this loop instead
            # of the baseline model.
            knn_candidate = KNeighborsClassifier(n_neighbors = nn, weights=w, algorithm=a)
            knn_candidate.fit(x_train, y_train)
            y_pred_param = knn_candidate.predict(x_validation)
            print(f'a = {a} w = {w} n = {nn} accuracy={accuracy(y_pred_param, y_validation)}')

The best parameters for our model are algorithm='auto' (the default), n_neighbors=3, weights='distance'.

In [None]:
# KNN with the best hyper-parameters found by the search above.
knn_opt = KNeighborsClassifier(n_neighbors= 3, algorithm='auto', weights = 'distance')
knn_fit_opt=knn_opt.fit(x_train, y_train)
y_pred_opt = knn_opt.predict(x_validation)
print(y_pred_opt)
print(accuracy(y_pred_opt,y_validation))
# classification_report expects (y_true, y_pred); the swapped order exchanged
# the precision and recall columns.
print(classification_report(y_validation, y_pred_opt))

Now we try our best model on the standardized training set.

In [None]:
# A fresh estimator with the same hyper-parameters is fitted, so that `knn_opt`
# (the same object as `knn_fit_opt`) keeps the fit on the unscaled features that
# the evaluation plots below rely on.
knn_opt_scaled = KNeighborsClassifier(**knn_opt.get_params())
knn_opt_scaled.fit(x_train_scaled, y_train)
# The validation features must go through the same scaler as the training
# features; predicting on the raw `x_validation` compared distances across two
# different feature scales.
y_pred_opt_scaled = knn_opt_scaled.predict(scaler.transform(x_validation))
print(y_pred_opt_scaled)
print(accuracy(y_pred_opt_scaled,y_validation))

This did not improve our accuracy.

evaluation:
confusion matrix

In [None]:
# One confusion matrix per model, each titled so that the baseline and the
# tuned figure can be told apart.
knn_cm = plot_confusion_matrix(knn_fit, x_validation, y_validation)
knn_cm.ax_.set_title("knn confusion matrix")
knn_cm_opt = plot_confusion_matrix(knn_fit_opt, x_validation, y_validation)
knn_cm_opt.ax_.set_title("knn confusion matrix opt")
plt.show()

precision and recall curve

In [None]:
# Baseline KNN: per-class precision-recall curves.
# `knn_fit` is used rather than `knn`: the tuning loop rebinds `knn`, so by
# this point that name refers to the last candidate tried there, not the
# baseline model.
y_pred_prob = knn_fit.predict_proba(x_validation)
# One indicator column per class, matched against that class's probability column.
y_validation_bin = label_binarize(y_validation, classes=[0, 1, 2, 3, 4])
precision = dict()
recall = dict()
for i in range(5):
    precision[i], recall[i], _ = precision_recall_curve(y_validation_bin[:, i], y_pred_prob[:, i])
    plt.plot(recall[i], precision[i], lw=2, label='class {}'.format(i))

plt.xlabel("recall")
plt.ylabel("precision")
plt.legend(loc="best")
plt.title("precision vs. recall curve")
plt.show()

# Tuned KNN: per-class precision-recall curves.
y_pred_prob_opt = knn_opt.predict_proba(x_validation)
precision = dict()
recall = dict()
for i in range(5):
    precision[i], recall[i], _ = precision_recall_curve(y_validation_bin[:, i], y_pred_prob_opt[:, i])
    plt.plot(recall[i], precision[i], lw=2, label='class {}'.format(i))

plt.xlabel("recall")
plt.ylabel("precision")
plt.legend(loc="best")
plt.title("precision vs. recall curve optimize")
plt.show()

ROC curve

In [None]:
# Per-class ROC curves: baseline KNN first, tuned KNN second.
# Uses `y_validation_bin`, `y_pred_prob` and `y_pred_prob_opt` from the
# precision-recall cell.
for class_probs, figure_title in (
    (y_pred_prob, "ROC curve"),
    (y_pred_prob_opt, "ROC curve optimize"),
):
    tpr, fpr = {}, {}
    for i in range(5):
        fpr[i], tpr[i], _ = roc_curve(y_validation_bin[:, i], class_probs[:, i])
        plt.plot(fpr[i], tpr[i], lw=2, label='class {}'.format(i))

    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    plt.legend(loc="best")
    plt.title(figure_title)
    plt.show()

learning curve

In [None]:
# Learning curve of the baseline KNN.
# `knn_fit` is used rather than `knn`: the tuning loop rebinds `knn`, so by
# this point that name refers to the last candidate tried there, not the
# baseline model.
train_size, train_score, valid_score = learning_curve(knn_fit, x_train, y_train)
# Scores have one row per training size and one column per CV fold.
train_mean = np.mean(train_score, axis=1)
train_std = np.std(train_score, axis=1)
test_mean = np.mean(valid_score, axis=1)
test_std = np.std(valid_score, axis=1)

#plot
plt.plot(train_size, train_mean, label="training score")
plt.plot(train_size, test_mean, label = "validation score")
plt.title("Learning Curve")
plt.xlabel("training size")
plt.ylabel("score")
plt.legend(loc="best")
plt.show()

# Learning curve of the tuned KNN.
train_size_opt, train_score_opt, valid_score_opt = learning_curve(knn_opt, x_train, y_train)
train_mean_opt = np.mean(train_score_opt, axis=1)
train_std_opt = np.std(train_score_opt, axis=1)
test_mean_opt = np.mean(valid_score_opt, axis=1)
test_std_opt = np.std(valid_score_opt, axis=1)

#plot
plt.plot(train_size_opt, train_mean_opt, label="training score")
plt.plot(train_size_opt, test_mean_opt, label = "validation score")
# Distinct title so this figure is not mistaken for the baseline one.
plt.title("Learning Curve opt")
plt.xlabel("training size")
plt.ylabel("score")
plt.legend(loc="best")
plt.show()

# Evaluation

In [None]:
##################################################
# Evaluate the model here
##################################################

# Use this function to evaluate your model
def accuracy(y_pred, y_true):
    '''
    Share of positions at which the prediction equals the true label.

    input y_pred: ndarray of shape (N,)
    input y_true: ndarray of shape (N,)
    '''
    is_correct = y_pred == y_true
    # Multiply by 1.0 to turn the boolean matches into floats before averaging.
    correct_as_float = 1.0 * is_correct
    return correct_as_float.mean()

# Report the accuracy in the train and validation sets.








# Send the submission for the challenge

In [None]:
##################################################
# Save your test prediction in y_test_pred
##################################################

# Placeholder: stays None until test predictions are assigned here.
y_test_pred = None

# Create submission
sample_submission_path = os.path.join(DATA_BASE_FOLDER, 'sample_submission.csv')
submission = pd.read_csv(sample_submission_path)
n_test_rows = len(submission)
# One row per submission entry; every remaining axis is merged into the columns.
x_test = np.load(os.path.join(DATA_BASE_FOLDER, 'test.npy')).reshape(n_test_rows, -1)
# Only overwrite the sample's 'class' column when predictions were provided;
# the file is written either way.
if y_test_pred is not None:
    submission['class'] = y_test_pred
submission.to_csv('my_submission.csv', index=False)